From 4bb4dcab1d5050ae6ef122afd76b57243f661a95 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Wed, 24 May 2017 22:30:03 +0100 Subject: [PATCH] ffmpeg: update hevc commits --- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 11994 ++++++++++------ ...e6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch | 1 - 2 files changed, 7681 insertions(+), 4314 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index b4c15b782a..96cfa9ae30 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -11,7 +11,7 @@ index 524fb73..305632b 100644 /ffplay /ffprobe diff --git a/ffmpeg.c b/ffmpeg.c -index 9ffd833..7a86d7e 100644 +index 9ffd833..e2474e5 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -23,6 +23,11 @@ @@ -52,7 +52,7 @@ index 9ffd833..7a86d7e 100644 #if HAVE_SYS_RESOURCE_H #include #include -@@ -158,6 +182,169 @@ static int restore_tty; +@@ -158,6 +182,182 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -100,7 +100,7 @@ index 9ffd833..7a86d7e 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h) ++static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) +{ + MMAL_COMPONENT_T* display; + MMAL_DISPLAYREGION_T region = @@ -111,7 +111,7 @@ index 9ffd833..7a86d7e 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h); ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); + + bcm_host_init(); // TODO is this needed? + mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); @@ -121,7 +121,7 @@ index 9ffd833..7a86d7e 100644 + + { + MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = MMAL_ENCODING_I420; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? 
MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; + format->es->video.height = geo.height_y; + format->es->video.crop.x = 0; @@ -138,7 +138,7 @@ index 9ffd833..7a86d7e 100644 + mmal_port_enable(display->input[0],display_cb_input); + mmal_port_enable(display->control,display_cb_control); + -+ printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y); ++ printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + + return display; +} @@ -168,12 +168,24 @@ index 9ffd833..7a86d7e 100644 +#ifdef RPI_ZERO_COPY +{ + const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); ++ if (fr_buf == NULL) { ++ mmal_buffer_header_release(buf); ++ return; ++ } + + buf->user_data = fr_buf; + buf->data = av_rpi_zc_vc_handle(fr_buf); -+ buf->alloc_size = -+ buf->length = av_rpi_zc_numbytes(fr_buf); -+ ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++#if 0 ++ { ++ unsigned int n; ++ for (n = 0; n < fr->width; n += 128) { ++ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ } ++ } ++#endif + ++rpi_display_count; +} +#else @@ -208,6 +220,7 @@ index 9ffd833..7a86d7e 100644 + +static void display_exit(MMAL_COMPONENT_T* display) +{ ++// sleep(120); + if (display) { + mmal_component_destroy(display); + } @@ -222,7 +235,7 @@ index 9ffd833..7a86d7e 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. -@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret) +@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } @@ -234,7 +247,7 @@ index 9ffd833..7a86d7e 100644 for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret) +@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->filters); av_freep(&ist->hwaccel_device); @@ -244,7 +257,7 @@ index 9ffd833..7a86d7e 100644 avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret) +@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -252,7 +265,7 @@ index 9ffd833..7a86d7e 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s, +@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; @@ -260,7 +273,7 @@ index 9ffd833..7a86d7e 100644 + if (next_picture && ist != NULL) + { + if (!rpi_display) -+ rpi_display = display_init(0,0,next_picture->width,next_picture->height); ++ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); + display_frame(ist->dec_ctx, rpi_display, next_picture); + } +#endif @@ -268,7 +281,7 @@ index 9ffd833..7a86d7e 100644 if (filter->inputs[0]->frame_rate.num > 0 && filter->inputs[0]->frame_rate.den > 0) duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base)); -@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; @@ 
-282,22 +295,23 @@ index 9ffd833..7a86d7e 100644 av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index fd0d1f0..40d22d2 100644 +index fd0d1f0..1740768 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -5,6 +5,11 @@ NAME = avcodec +@@ -5,6 +5,12 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ + rpi_qpu.h \ + rpi_shader.h \ ++ rpi_shader_cmd.h \ + rpi_mailbox.h \ + rpi_hevc_transform.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -43,6 +48,10 @@ OBJS = allcodecs.o \ +@@ -43,6 +49,10 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ @@ -308,18 +322,22 @@ index fd0d1f0..40d22d2 100644 vorbis_parser.o \ xiph.o \ -@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + ++QASM := $(SUBDIR)../pi-util/qasm.py ++ ++ifneq ("$(wildcard $(QASM))","") +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++endif + -+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h ++$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 54efaad..02a89c3 100644 --- a/libavcodec/allcodecs.c @@ -333,12 +351,14 @@ index 54efaad..02a89c3 100644 REGISTER_PARSER(MJPEG, mjpeg); REGISTER_PARSER(MLP, mlp); diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index a4ceca7..1354c14 100644 +index a4ceca7..cafd25d 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o +@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ ++ arm/hevc_misc_neon.o \ arm/hevcdsp_deblock_neon.o \ + arm/hevcdsp_epel_neon.o \ arm/hevcdsp_idct_neon.o \ @@ -1027,18 +1047,592 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S +new file mode 100644 +index 0000000..373576b +--- /dev/null ++++ b/libavcodec/arm/hevc_misc_neon.S +@@ -0,0 +1,62 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ rpi_zap_coeff_vals_neon( ++@ uint16_t * buf, [r0] ++@ unsigned int log_n_m2) [r1] ++ ++function rpi_zap_coeff_vals_neon, export=1 ++ vmov.i64 q8, #0 ++ adr r12, zc_tab ++ vmov.i64 q9, #0 ++ tst r0, #63 ++ vmov.i64 q10, #0 ++ add r0, #63 ++ vmov.i64 q11, #0 ++ and r0, #~63 ++ ldr pc, [r12, r1, lsl #2] ++ ++zc_tab: ++ .word zc_lc2 ++ .word zc_lc3 ++ .word zc_lc4 ++ .word zc_lc5 ++ ++@ 4*4*2: "32 bytes" 64 or 0 depending on dst address ++zc_lc2: ++ it eq ++ vstmeq r0, {q8-q11} ++ bx lr ++ ++@ 16*16*2 = 512 = 64 * 8 ++zc_lc4: ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++@ 8*8*2 = 128 ++zc_lc3: ++ vstm r0!, {q8-q11} ++ vstm r0, {q8-q11} ++ bx lr ++ ++@ 
32*32*2 = 2048 = 128 * 16
++zc_lc5:
++ vmov.i64 q12, #0
++ vmov.i64 q13, #0
++ vmov.i64 q14, #0
++ vmov.i64 q15, #0
++ mov r2, #4
++1:
++ vstm r0!, {q8-q15}
++ subs r2, #1
++ vstm r0!, {q8-q15}
++ vstm r0!, {q8-q15}
++ vstm r0!, {q8-q15}
++ bne 1b
++ bx lr
++
++endfunc
++
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
-index 166bddb..a088cc3 100644
+index 166bddb..9bd0a42 100644
--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
-@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
+@@ -15,7 +15,7 @@
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+@@ -31,6 +31,9 @@
+ bxeq lr
+ .endm
+
++@ Uses: d2, d4, d18, d19
++@ Returns: d2, d4
++@ Modifies: d0-d7, d22-d25
+ .macro hevc_loop_filter_chroma_body
+ vsubl.u8 q3, d4, d2
+ vsubl.u8 q11, d18, d19
+@@ -49,6 +52,33 @@
+ vqmovun.s16 d4, q2
+ .endm
+
++
++@ Uses r2[0:7], r2[8:15]
++@ Modifies: d0-d7, d22-d25
++.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1
++ vsubl.u8 q3, \Q0, \P0
++ vsubl.u8 q11, \P1, \Q1
++ vshl.i16 q3, #2
++ vadd.i16 q11, q3
++
++ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all)
++ vdup.16 d0, r2
++ vmovl.u8 q0, d0
++ vuzp.16 d0, d1
++
++ vrshr.s16 q11, q11, #3
++ vneg.s16 q12, q0
++ vmovl.u8 q2, \Q0
++ vmin.s16 q11, q11, q0
++ vmax.s16 q11, q11, q12
++ vaddw.u8 q1, q11, \P0
++ vsub.i16 q2, q11
++ vqmovun.s16 \P0, q1
++ vqmovun.s16 \Q0, q2
++.endm
++
++
++
+ .macro hevc_loop_filter_luma_start
+ ldr r12, [r3]
+ ldr r3, [r3, #4]
+@@ -60,15 +90,17 @@
+ lsr r3, #16
+ .endm
+
+-.macro hevc_loop_filter_luma_body
++@ Uses: r2, r3, r12
++@ Modifies: r5, r6, r7, r8, r9
++function hevc_loop_filter_luma_body
++ vmovl.u8 q15, d23
++ vmovl.u8 q14, d22
++ vmovl.u8 q13, d21
++ vmovl.u8 q12, d20
++ vmovl.u8 q11, d19
++ vmovl.u8 q10, d18
++ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+- vmovl.u8 q9, d18
+- vmovl.u8 q10, d20
+- vmovl.u8 q11, d22
+- vmovl.u8 q12, d24
+- vmovl.u8 q13, d26
+- vmovl.u8 q14, d28
+- vmovl.u8 q15, d30
+
+ vadd.i16 q7, q9, q11
+ vadd.i16 q6, q14, q12
+@@ -77,7 +109,6 @@
+ vabd.s16 q7, q7, q10
+ vabd.s16 q6, q6, q13
+
+-
+ vdup.16 q0, r2
+ vmov q4, q7
+ vmov q5, q6
+@@ -152,7 +183,7 @@
+
+ and r9, r8, r7
+ cmp r9, #0
+- beq weakfilter_\@
++ beq weakfilter_
+
+ vadd.i16 q2, q11, q12
+ vadd.i16 q4, q9, q8
+@@ -210,11 +241,11 @@
+ vbit q13, q3, q5
+ vbit q14, q2, q5
+
+-weakfilter_\@:
++weakfilter_:
+ mvn r8, r8
+ and r9, r8, r7
+ cmp r9, #0
+- beq ready_\@
++ beq ready_
+
+ vdup.16 q4, r2
+
+@@ -275,75 +306,345 @@ weakfilter_\@:
+ vbit q11, q0, q5
+ vbit q12, q4, q5
+
+-ready_\@:
++ready_:
+ vqmovun.s16 d16, q8
+- vqmovun.s16 d18, q9
+- vqmovun.s16 d20, q10
+- vqmovun.s16 d22, q11
+- vqmovun.s16 d24, q12
+- vqmovun.s16 d26, q13
+- vqmovun.s16 d28, q14
+- vqmovun.s16 d30, q15
+-.endm
++ vqmovun.s16 d17, q9
++ vqmovun.s16 d18, q10
++ vqmovun.s16 d19, q11
++ vqmovun.s16 d20, q12
++ vqmovun.s16 d21, q13
++ vqmovun.s16 d22, q14
++ vqmovun.s16 d23, q15
++ mov pc, lr
++endfunc
++
++@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), no_p (sp[0]), no_q (sp[4]), src2 (sp[8]))
++function ff_hevc_v_loop_filter_luma2_neon_8, export=1
++ hevc_loop_filter_luma_start
++ push {r4-r10,lr} @ 8 regs = 32 bytes
++
++ ldr r4, [sp, #40]
++ b v_loop_luma_common
++endfunc
++
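The entry point above passes its no_p/no_q arguments (sp[0]/sp[4]) through to the shared v_loop_luma_common code below, which folds the four flag bytes into one register (b0:no_p[0], b8:no_p[1], b16:no_q[0], b24:no_q[1]) so each partial store block can be skipped with a single tst. A minimal C sketch of that packing, for reference only — pack_no_pq is an illustrative name, not part of the patch:

    #include <stdint.h>

    /* Mirrors the two ldrh loads + orr at v_loop_luma_common.  Note the
     * flag bytes arrive as 2/0 rather than 1/0, hence the byte-wide tst
     * masks (#0xff, #0xff00, #0xff0000, #0xff000000) before each store. */
    static inline uint32_t pack_no_pq(const uint8_t no_p[2], const uint8_t no_q[2])
    {
        return (uint32_t)no_p[0]
             | (uint32_t)no_p[1] << 8
             | (uint32_t)no_q[0] << 16
             | (uint32_t)no_q[1] << 24;
    }

A non-zero byte means "do not write that quadrant back", so the store paths test the mask and step the pointer past the skipped rows instead of writing them.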
+ + function ff_hevc_v_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start +- push {r5-r11} ++ push {r4-r10,lr} ++ ++ sub r4, r0, #4 ++v_loop_luma_common: ++ @ Why this isn't a bitmask to start with I have no idea... ++ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 ++ ldr r5, [sp, #32] ++ ldrh r10, [r5] ++ ldr r5, [sp, #36] ++ ldrh r5, [r5] ++ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] ++ + vpush {d8-d15} +- sub r0, #4 +- vld1.8 {d16}, [r0], r1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d22}, [r0], r1 +- vld1.8 {d24}, [r0], r1 +- vld1.8 {d26}, [r0], r1 +- vld1.8 {d28}, [r0], r1 +- vld1.8 {d30}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 +- hevc_loop_filter_luma_body +- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 +- vst1.8 {d16}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d22}, [r0], r1 +- vst1.8 {d24}, [r0], r1 +- vst1.8 {d26}, [r0], r1 +- vst1.8 {d28}, [r0], r1 +- vst1.8 {d30}, [r0] ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1 ++ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1 ++ ++ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ ++ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ ++ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ ++ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ ++ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ ++ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ ++ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32] ++ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32] ++ ++ bl hevc_loop_filter_luma_body ++ ++ neg r1, r1 ++ ++ @ no_p[1] ++ tst r10, #0xff00 ++ itt ne ++ addne r4, r4, r1, lsl #2 ++ bne 1f ++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ ++1: ++ @ no_q[1] ++ tst r10, #0xff000000 ++ itt ne ++ addne r0, r0, r1, lsl #2 ++ bne 2f ++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ ++2: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 3f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] ++ ++3: ++ @ no_q[0] ++ tst r10, #0xff0000 ++ bne 4f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] ++ ++4: ++bypasswrite: + vpop {d8-d15} +- pop {r5-r11} +- bx lr ++ pop {r4-r10,pc} + endfunc + ++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] ++@ ptrdiff_t stride, [r1] ++@ int beta, [r2] ++@ int32_t *tc, [r3] ++@ uint8_t *no_p, sp[0] ++@ uint8_t *no_q); 
sp[4]
++@
++@ Src should always be on 8 byte boundary & all in the same slice
++
+ function ff_hevc_h_loop_filter_luma_neon, export=1
+ hevc_loop_filter_luma_start
+- push {r5-r11}
++ push {r4-r10,lr}
++
+ vpush {d8-d15}
+ sub r0, r0, r1, lsl #2
++
+ vld1.8 {d16}, [r0], r1
++ vld1.8 {d17}, [r0], r1
+ vld1.8 {d18}, [r0], r1
++ vld1.8 {d19}, [r0], r1
+ vld1.8 {d20}, [r0], r1
++ vld1.8 {d21}, [r0], r1
+ vld1.8 {d22}, [r0], r1
+- vld1.8 {d24}, [r0], r1
+- vld1.8 {d26}, [r0], r1
+- vld1.8 {d28}, [r0], r1
+- vld1.8 {d30}, [r0], r1
+- sub r0, r0, r1, lsl #3
+- add r0, r1
+- hevc_loop_filter_luma_body
+- vst1.8 {d18}, [r0], r1
+- vst1.8 {d20}, [r0], r1
+- vst1.8 {d22}, [r0], r1
+- vst1.8 {d24}, [r0], r1
+- vst1.8 {d26}, [r0], r1
+- vst1.8 {d28}, [r0]
+-bypasswrite:
++ vld1.8 {d23}, [r0]
++
++ bl hevc_loop_filter_luma_body
++
+ vpop {d8-d15}
+- pop {r5-r11}
+- bx lr
++
++ neg r1, r1
++ add r0, r0, r1
++
++ @ Why this isn't a bitmask to start with I have no idea...
++ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
++ ldr r5, [sp, #32]
++ ldrh r10, [r5]
++ ldr r5, [sp, #36]
++ ldrh r5, [r5]
++ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
++ bne 1f
++
++ vst1.8 {d22}, [r0], r1
++ vst1.8 {d21}, [r0], r1
++ vst1.8 {d20}, [r0], r1
++ vst1.8 {d19}, [r0], r1
++ vst1.8 {d18}, [r0], r1
++ vst1.8 {d17}, [r0]
++
++ pop {r4-r10,pc}
++
++@ Partial write
++1:
++ vmov r2, r3, d22
++ vmov r4, r5, d21
++ vmov r6, r7, d20
++
++ tst r10, #0xff0000
++ ittt eq
++ streq r2, [r0]
++ streq r4, [r0, r1]
++ streq r6, [r0, r1, lsl # 1]
++
++ add r0, r0, #4
++ tst r10, #0xff000000
++ ittt eq
++ streq r3, [r0]
++ streq r5, [r0, r1]
++ streq r7, [r0, r1, lsl # 1]
++
++ vmov r2, r3, d19
++ vmov r4, r5, d18
++ vmov r6, r7, d17
++ add r0, r0, r1
++ add r0, r0, r1, lsl # 1
++
++ tst r10, #0xff00
++ ittt eq
++ streq r3, [r0]
++ streq r5, [r0, r1]
++ streq r7, [r0, r1, lsl # 1]
++
++ tst r10, #0xff
++ ittt eq
++ streq r2, [r0, #-4]!
++ streq r4, [r0, r1] ++ streq r6, [r0, r1, lsl # 1] ++ ++ pop {r4-r10,pc} ++ + endfunc + ++@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++function ff_hevc_h_loop_filter_uv_neon_8, export=1 ++ sub r0, r0, r1, lsl #1 ++ vld2.8 {d16,d17}, [r0], r1 ++ vld2.8 {d18,d19}, [r0], r1 ++ vld2.8 {d26,d27}, [r0], r1 ++ vld2.8 {d28,d29}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_uv_body d16, d18, d26, d28 ++ lsr r2, r2, #16 ++ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ cmp r3, #0 ++ bne 1f ++ vst2.8 {d18,d19}, [r0], r1 ++ vst2.8 {d26,d27}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.8 d18, d19 ++ vzip.8 d26, d27 ++ sub r1, r1, #8 ++ ++ tst r3, #1 ++ bne 1f ++ vst1.8 {d18}, [r0] ++1: add r0, r0, #8 ++ tst r3, #2 ++ bne 2f ++ vst1.8 {d19}, [r0] ++2: add r0, r0, r1 ++ ++ tst r3, #4 ++ bne 1f ++ vst1.8 {d26}, [r0] ++1: add r0, r0, #8 ++ tst r3, #8 ++ it ne ++ bxne lr ++ vst1.8 {d27}, [r0] ++ bx lr ++ ++endfunc ++ ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++function ff_hevc_v_loop_filter_uv2_neon_8, export=1 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 ++ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 ++ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 ++ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 ++ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 ++ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 ++ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 ++ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] ++ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] ++ ++ hevc_loop_filter_uv_body d16, d18, d26, d28 ++ lsr r2, r2, #16 ++ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ ++ neg r1, r1 ++ ++ ldr r2, [sp, #0] ++ ++ @ p[1] ++ tst r2, #2 ++ itt ne ++ addne r3, r3, r1, lsl #2 ++ bne 1f ++ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 ++ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 ++ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 ++ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 ++ ++1: ++ @ q[1] ++ tst r2, #8 ++ itt ne ++ addne r0, r0, r1, lsl #2 ++ bne 2f ++ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 ++ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ++2: ++ @ p[0] ++ tst r2, #1 ++ bne 3f ++ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 ++ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 ++ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 ++ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] ++ ++3: ++ @ q[0] ++ tst r2, #4 ++ it ne ++ bxne lr ++ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vst4.8 {d26[0], d27[0], d28[0], 
d29[0]}, [r0] ++ ++ bx lr ++endfunc ++ ++ + function ff_hevc_v_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, #4 +@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 vst1.8 {d4}, [r0] bx lr endfunc + -+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ * int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ * MvField *curr, MvField *neigh, uint8_t *bs) ++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i ++ * int *curr_rpl0, int *curr_ ++ * MvField *curr, MvField *ne + */ +function ff_hevc_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 @@ -1159,6 +1753,7 @@ index 166bddb..a088cc3 100644 +90: mov a3, #1 + b 11b +endfunc ++ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 index 0000000..00eab9e @@ -1503,10 +2098,10 @@ index 0000000..00eab9e + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 5591807..49c70dd 100644 +index 5591807..b6c48ee 100644 --- a/libavcodec/arm/hevcdsp_init_neon.c +++ b/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,6 +22,8 @@ +@@ -22,11 +22,26 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -1515,7 +2110,25 @@ index 5591807..49c70dd 100644 void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, + void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++#ifdef RPI ++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++#endif ++ + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); + void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); + void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); +@@ -43,6 +58,31 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); @@ -1533,11 +2146,21 @@ index 5591807..49c70dd 100644 +void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++ ++void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, ++ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++ 
++void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ ptrdiff_t srcstride, int height, \ -@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +98,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -1553,7 +2176,7 @@ index 5591807..49c70dd 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } @@ -1599,6 +2222,50 @@ index 5591807..49c70dd 100644 + } +} + ++static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ // Width 32 already dealt with ++ // width 16 code works in double lines ++ if (width == 16 && (height & 1) == 0) { ++ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, ++ sao_offset_val_u, sao_left_class_u, ++ sao_offset_val_v, sao_left_class_v, ++ width, height); ++ } ++ else ++ { ++ const int shift = 3; // BIT_DEPTH - 5 ++ int k, y, x; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int8_t offset_table_u[32] = { 0 }; ++ int8_t offset_table_v[32] = { 0 }; ++ ++ stride_src /= sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ ++ for (k = 0; k < 4; k++) ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ for (k = 0; k < 4; k++) ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width * 2; x += 2) ++ { ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ ++ } ++ } ++} ++ +#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 
0 : -1)) +static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + int16_t *_sao_offset_val, int eo, int width, int height) @@ -1677,6 +2344,54 @@ index 5591807..49c70dd 100644 + } + } +} ++ ++ ++static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ ++ if (width == 32 && (height & 7) == 0) { ++ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); ++ } ++ else ++ { ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ int8_t sao_offset_val_u[8]; // padding of 3 for vld ++ int8_t sao_offset_val_v[8]; // padding of 3 for vld ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ++ for (x = 0; x < 5; x++) { ++ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; ++ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; ++ } ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width * 2; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++ } ++} +#undef CMP + +void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, @@ -1686,18 +2401,36 @@ index 5591807..49c70dd 100644 av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { if (bit_depth == 8) { -@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + int x; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; ++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_8; ++#endif + c->idct[0] = ff_hevc_transform_4x4_neon_8; + c->idct[1] = ff_hevc_transform_8x8_neon_8; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; +@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; + for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { + c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; ++ c->sao_band_filter_c[x] = 
ff_hevc_sao_band_c_neon_wrapper; + c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; ++ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; + } ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -1719,7 +2452,7 @@ index 5591807..49c70dd 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +516,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } @@ -1731,10 +2464,10 @@ index 5591807..49c70dd 100644 } diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..9c7808d +index 0000000..08a021d --- /dev/null +++ b/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,510 @@ +@@ -0,0 +1,862 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -1860,24 +2593,186 @@ index 0000000..9c7808d + +function ff_hevc_sao_band_w64_neon_8, export=1 + init_sao_band -+1: subs r12, #1 -+ pld [r1, r3] -+ vld1.8 {q8-q9}, [r1, :128]! -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sub r1, #32 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128]! 
-+ vst1.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 -+ bne 1b + -+ bx lr ++ push {r4, lr} ++ subs r12, #1 ++ mov r4, r1 ++ it ne ++ addne r4, r3 ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ pld [r4] ++ vshr.u8 q12, q8, #3 ++ vshr.u8 q13, q9, #3 ++ add r1, r3 ++ vshr.u8 q14, q10, #3 ++ vshr.u8 q15, q11, #3 ++ sao_band_64 ++ it ne ++ addne r4, r3 ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} +endfunc + ++ ++@ ff_hevc_sao_band_c_w64_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++@ As this is often done in-place on the frame buffer it is worth preloading ++@ the pixel values but we want to beware of loading ouside our buffer to avoid ++@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++ ++function ff_hevc_sao_band_c_neon_8, export=1 ++ mov r12, sp ++ push {r4-r8, lr} // 24 bytes ++ ++ ldm r12, {r4-r7} ++ ++ add r4, #2 ++ add r6, #2 ++ vld1.16 {d16}, [r4] @ Unaligned ++ lsl r5, r5, #3 ++ vld1.16 {d18}, [r6] ++ pld [r1] ++ vmov.i8 d17, #0 ++ mov r4, r1 ++ vmov.i8 d19, #0 ++ lsl r7, r7, #3 ++ vdup.8 q1, r5 ++ ldr r5, [r12, #16] @ width ++ vdup.8 q2, r7 ++ ldr r12, [r12, #20] ++ vqmovn.s16 d0, q8 ++ cmp r5, #16 @ At some point we may want a table lookup ++ vqmovn.s16 d1, q9 ++ vmov.i8 q3, #128 ++ beq 16f ++ ++ @ d0 U lookup ++ @ d1 V lookup ++ @ q1 U raw offset ++ @ q2 V raw offset ++ @ q3 #128 ++ ++ @ r4 = r1 = src - Inteded for preload pointer ++ @ r12 = height ++ ++ @ Might (unlikely) be called with height == 1 ++ subs r12, #1 ++ it ne ++ addne r4, r3 ++ ++1: ++ subs r12, #1 ++ vld2.8 {q8-q9}, [r1, :128]! ++ vsub.u8 q12, q8, q1 ++ vld2.8 {q10-q11}, [r1, :128], r3 ++ vsub.u8 q14, q10, q1 ++ vsub.u8 q13, q9, q2 ++ sub r1, #32 ++ vsub.u8 q15, q11, q2 ++ pld [r4] ++ vshr.u8 q12, #3 ++ vadd.s8 q8, q3 ++ vshr.u8 q13, #3 ++ vadd.s8 q9, q3 ++ ++ vtbl.8 d24, {d0}, d24 ++ vshr.u8 q14, #3 ++ vtbl.8 d25, {d0}, d25 ++ vshr.u8 q15, #3 ++ vtbl.8 d26, {d1}, d26 ++ vadd.s8 q10, q3 ++ vtbl.8 d27, {d1}, d27 ++ vadd.s8 q11, q3 ++ vtbl.8 d28, {d0}, d28 ++ vqadd.s8 q8, q12 ++ vtbl.8 d29, {d0}, d29 ++ vqadd.s8 q9, q13 ++ vtbl.8 d30, {d1}, d30 ++ vqadd.s8 q10, q14 ++ vtbl.8 d31, {d1}, d31 ++ vsub.s8 q8, q3 ++ vqadd.s8 q11, q15 ++ vsub.s8 q9, q3 ++ vsub.s8 q10, q3 ++ vsub.s8 q11, q3 ++ ++ it ne ++ addne r4, r3 @ Do not inc on final pass ++ vst2.8 {q8-q9}, [r0, :128]! 
++ vst2.8 {q10-q11}, [r0, :128], r2 ++ sub r0, #32 ++ bpl 1b ++ ++ pop {r4-r8, pc} ++ ++@ -- width 16 (UV pairs) -- ++16: ++ subs r12, #2 ++ it ne ++ addne r4, r4, r3, lsl #1 ++ ++1: ++ subs r12, #2 ++ vld2.8 {q8-q9}, [r1, :128], r3 ++ vsub.u8 q12, q8, q1 ++ vld2.8 {q10-q11}, [r1, :128], r3 ++ vsub.u8 q14, q10, q1 ++ vsub.u8 q13, q9, q2 ++ pld [r4] ++ vsub.u8 q15, q11, q2 ++ pld [r4, r3] ++ vshr.u8 q12, #3 ++ vadd.s8 q8, q3 ++ vshr.u8 q13, #3 ++ vadd.s8 q9, q3 ++ ++ vtbl.8 d24, {d0}, d24 ++ vshr.u8 q14, #3 ++ vtbl.8 d25, {d0}, d25 ++ vshr.u8 q15, #3 ++ vtbl.8 d26, {d1}, d26 ++ vadd.s8 q10, q3 ++ vtbl.8 d27, {d1}, d27 ++ vadd.s8 q11, q3 ++ vtbl.8 d28, {d0}, d28 ++ vqadd.s8 q8, q12 ++ vtbl.8 d29, {d0}, d29 ++ vqadd.s8 q9, q13 ++ vtbl.8 d30, {d1}, d30 ++ vqadd.s8 q10, q14 ++ vtbl.8 d31, {d1}, d31 ++ vsub.s8 q8, q3 ++ vqadd.s8 q11, q15 ++ vsub.s8 q9, q3 ++ vsub.s8 q10, q3 ++ vsub.s8 q11, q3 ++ ++ it ne ++ addne r4, r4, r3, lsl #1 ++ vst2.8 {q8-q9}, [r0, :128], r2 ++ vst2.8 {q10-q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ +.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 + vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 + vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 @@ -1887,71 +2782,120 @@ index 0000000..9c7808d + vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 +.endm + -+.macro table64 -+ vmov.s8 q13, #2 // 2 to all elements -+ vmov.32 d24[0], r4 // load offset table from general registers -+ vmov.32 d24[1], r5 // load rest of offset table -+ -+ vadd.s8 q0, q13 -+ vadd.s8 q1, q13 -+ vadd.s8 q2, q13 -+ vadd.s8 q3, q13 -+ -+ vmov.u8 q15, #128 // s8 #-128 -+ vtbl.8 d0, {d24}, d0 -+ vadd.s8 q13, q4, q15 -+ vtbl.8 d1, {d24}, d1 -+ vadd.s8 q14, q5, q15 -+ vtbl.8 d2, {d24}, d2 -+ vqadd.s8 q0, q13 -+ vtbl.8 d3, {d24}, d3 -+ vqadd.s8 q1, q14 -+ vtbl.8 d4, {d24}, d4 -+ vadd.s8 q13, q6, q15 -+ vtbl.8 d5, {d24}, d5 -+ vadd.s8 q14, q7, q15 -+ vtbl.8 d6, {d24}, d6 -+ vqadd.s8 q2, q13 -+ vtbl.8 d7, {d24}, d7 -+ vqadd.s8 q3, q14 -+ vsub.s8 q0, q15 -+ vsub.s8 q1, q15 -+ vsub.s8 q2, q15 -+ vsub.s8 q3, q15 -+ vst1.8 {q0-q1}, [r0, :128]! 
-+ vst1.8 {q2-q3}, [r0, :128], r2 -+ sub r0, #32 -+.endm + +// input +// a in q0 - q3 +// c in q4 - q7 +// b in q8 - q11 -+// offset table in r7 and r5 ++// offset table r4,r5 and r6,r7 ++// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C +// output in q0 - q3 +// clobbers q12 - q15 -+.macro edge_w64_body -+ diff32 q12, q13, q0, q1, q0, q1, q4, q5 -+ diff32 q0, q1, q14, q15, q8, q9, q4, q5 + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 ++@ a <- c <- b ++@ ++@ It appears that Neon can stall if you try and use results too soon so we try to ++@ spread our instruction out + -+ diff32 q14, q15, q2, q3, q2, q3, q6, q7 -+ diff32 q2, q3, q12, q13, q10, q11, q6, q7 ++.macro edgeidx64 ++ ++ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 ++ ++ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q1, q5 ++ vcgt.u8 q2, q2, q6 ++ vcgt.u8 q3, q3, q7 ++ ++ vsub.s8 q0, q0, q12 // a = sign(c-a) ++ vsub.s8 q1, q1, q13 ++ vsub.s8 q2, q2, q14 ++ vsub.s8 q3, q3, q15 ++ ++ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 ++ ++ vsub.s8 q0, q0, q12 ++ vsub.s8 q1, q1, q13 ++ vsub.s8 q2, q2, q14 ++ vsub.s8 q3, q3, q15 ++ ++ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 ++ ++ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q2, q14 ++ vadd.s8 q3, q3, q15 ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ @ whilst vmov dn, rm, rn exists it is a vfp instruction ++ @ and causes a stall till neon pipe empty - so don't do that! ++ vmov d26[0], r4 ++ vmov d26[1], r5 ++ vmov d27[0], r6 ++ vmov d27[1], r7 ++ vadd.s8 q2, q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 ++ ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 ++ ++ vtbl.8 d3, {d27}, d3 ++ ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 ++ ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q0, q12 ++ vqadd.s8 q1, q1, q14 ++ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q0, q15 ++ vqadd.s8 q2, q2, q12 ++ vqadd.s8 q3, q3, q14 ++ vsub.s8 q1, q1, q15 ++ vsub.s8 q2, q2, q15 ++ vsub.s8 q3, q3, q15 + -+ vadd.s8 q2, q14 -+ vadd.s8 q3, q15 -+ table64 +.endm + ++function edge_w64_body ++ edgeidx64 ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bx lr ++endfunc ++ +.macro init_edge_64 -+ push {r4-r5} -+ ldr r12, [sp, #8] // height -+ ldr r5, [sp, #12] // sao_offset_val_table -+ ldr r4, [r5] -+ add r5, #4 -+ ldr r5, [r5] ++ push {r4-r8,lr} ++ ldr r12, [sp, #24] // height ++ ldr r5, [sp, #28] // sao_offset_val_table ++ ldrd r4, r5, [r5] ++ mov r6, r4 ++ mov r7, r5 +.endm + +function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 @@ -1974,11 +2918,10 @@ index 0000000..9c7808d + vext.8 q9, q5, q6, #1 + vext.8 q10, q6, q7, #1 + vext.8 q11, q7, q12, #1 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 @@ -1998,7 +2941,7 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1, :128]! 
+ vld1.8 {q10-q11}, [r1, :128], r3 + sub r1, #32 -+ edge_w64_body ++ bl edge_w64_body + // copy c to a + vmov.64 q0, q4 + vmov.64 q1, q5 @@ -2011,8 +2954,7 @@ index 0000000..9c7808d + vmov.64 q7, q11 + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 @@ -2036,11 +2978,10 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1]! + vld1.8 {q10-q11}, [r1] + sub r1, #33 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 @@ -2064,13 +3005,157 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1]! + vld1.8 {q10-q11}, [r1] + sub r1, #31 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + ++ ++@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( ++@ uint8_t *_dst, r0 ++@ uint8_t *_src, r1 ++@ ptrdiff_t stride_dst, r2 ++@ ptrdiff_t stride_src, r3 ++@ int height, sp[0] ++@ int16_t *sao_offset_table_u, sp[4] ++@ int16_t *sao_offset_table_v); sp[8] ++@ int eo sp[12] ++ ++function ff_hevc_sao_edge_c_w64_neon_8, export=1 ++ push {r4-r8,lr} // 6 reg = 24 ++ ldr r5, [sp, #28] // sao_offset_val_table_u ++ ldr r7, [sp, #32] // sao_offset_val_table_v ++ ++ @ Load and rearrange offsets ++ @ Also "convert" from 16bit to 8bit ++ ldrb r4, [r5, #2] ++ ldrb r8, [r5, #4] ++ ldrb r6, [r7, #2] ++ ldrb r12, [r7, #4] ++ orr r4, r4, r8, lsl #8 ++ orr r6, r6, r12, lsl #8 ++ ldrb r8, [r5, #6] ++ ldrb r12, [r7, #6] ++ orr r4, r4, r8, lsl #24 ++ orr r6, r6, r12, lsl #24 ++ ldrb r5, [r5, #8] ++ ldrb r7, [r7, #8] ++ ++ ldr r12, [sp, #36] // e0 ++ adr r8, edge_c_tbl_w64 ++ ldr r8, [r8, r12, lsl #2] ++ ++ ldr r12, [sp, #24] // height ++ vpush {d8-d15} ++ mov pc, r8 ++ ++edge_c_tbl_w64: ++ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 ++ ++ff_hevc_sao_edge_c_eo0_w64_neon_8: ++ sub r1, #8 ++1: subs r12, #1 ++ vld1.64 {d7}, [r1, :64]! ++ vld1.64 {q4-q5}, [r1, :128]! // load c ++ vld1.64 {q6-q7}, [r1, :128]! ++ vld1.64 {d24}, [r1, :64], r3 ++ sub r1, #72 ++ // load a ++ vext.8 q0, q3, q4, #14 ++ vext.8 q1, q4, q5, #14 ++ vext.8 q2, q5, q6, #14 ++ vext.8 q3, q6, q7, #14 ++ // load b ++ vext.8 q8, q4, q5, #2 ++ vext.8 q9, q5, q6, #2 ++ vext.8 q10, q6, q7, #2 ++ vext.8 q11, q7, q12, #2 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo1_w64_neon_8: ++ sub r1, r3 ++ // load a ++ vldm r1, {q0-q3} ++ add r1, r3 ++ // load c ++ vldm r1, {q4-q7} ++ add r1, r3 ++1: subs r12, #1 ++ // load b ++ vldm r1, {q8-q11} ++ add r1, r3 ++ bl edge_w64_body ++ // copy c to a ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 ++ // copy b to c ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo2_w64_neon_8: ++1: sub r1, r3 ++ // load a ++ // TODO: fix unaligned load ++ // don't reload a like in eo1 ++ sub r1, #2 ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ sub r1, #30 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++ // load b ++ add r1, #2 ++ vld1.8 {q8-q9}, [r1]! 
++ vld1.8 {q10-q11}, [r1] ++ sub r1, #34 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo3_w64_neon_8: ++1: sub r1, r3 ++ // load a ++ // TODO: fix unaligned load ++ // don't reload a like in eo1 ++ add r1, #2 ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ sub r1, #34 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++ // load b ++ sub r1, #2 ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #30 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++endfunc ++ ++ +.macro init_edge_32 + ldr r12, [sp, #4] // sao_offset_val_table + vld1.32 {d31}, [r12] @@ -2187,7 +3272,7 @@ index 0000000..9c7808d + vext.8 q7, q11, q12, #8 + vext.8 q5, q10, q11, #7 + diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++ diff32 q0, q1, q10, q11, q8, q9, q2, q3 + vadd.s8 q0, q12 //diff0 + diff1 + vadd.s8 q1, q13 + table32 @@ -2227,7 +3312,7 @@ index 0000000..9c7808d + vext.8 q14, q12, q10, #7 + + diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++ diff32 q0, q1, q10, q11, q8, q9, q2, q3 + + vadd.s8 q0, q12 //diff0 + diff1 + vadd.s8 q1, q13 @@ -2439,26 +3524,21 @@ index ce4bab2..b9b0c78 100644 + .split = h264_split, +}; diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c -index b478065..88dd40b 100644 +index b478065..955e426 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c -@@ -41,8 +41,186 @@ +@@ -41,8 +41,196 @@ #include "hevc.h" #include "profiles.h" +#ifdef RPI + #include "rpi_qpu.h" -+ #include "rpi_user_vcsm.h" -+ // Move Inter prediction into separate pass -+ #define RPI_INTER -+ -+ #ifdef RPI_INTER_QPU -+ // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU -+ #define RPI_MULTI_MAILBOX -+ #endif ++ #include "rpi_shader.h" ++ #include "rpi_shader_cmd.h" ++ #include "rpi_zc.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory -+ // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined. 
++ #define RPI_CACHE_UNIF_MVS 1 + + // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) + //#define RPI_SIMULATE_QPUS @@ -2466,19 +3546,24 @@ index b478065..88dd40b 100644 + #include "pthread.h" + #endif + -+ static void rpi_execute_dblk_cmds(HEVCContext *s); -+ static void rpi_execute_transform(HEVCContext *s); -+ static void rpi_launch_vpu_qpu(HEVCContext *s); -+ static void rpi_execute_pred_cmds(HEVCContext *s); -+ static void rpi_execute_inter_cmds(HEVCContext *s); -+ static void rpi_begin(HEVCContext *s); -+ static void flush_frame(HEVCContext *s,AVFrame *frame); -+ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job); ++ static void worker_core(HEVCContext * const s); + ++ // We can pred any block height but annoyingly if we we do then the TMU cache ++ // explodes and it goes even slower :-( ++ #if 0 ++ #define Y_P_MAX_H 16 ++ #define Y_B_MAX_H 16 ++ #else ++ #define Y_P_MAX_H 64 ++ #define Y_B_MAX_H 64 ++ #endif +#endif + +// #define DISABLE_MC + ++#define DISABLE_CHROMA 0 ++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards ++ +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) + +#ifndef av_mod_uintp2 @@ -2488,46 +3573,66 @@ index b478065..88dd40b 100644 +} +# define av_mod_uintp2 av_mod_uintp2_c +#endif ++ ++#define Y_B_ONLY 0 + const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; + -+#ifdef RPI_INTER_QPU ++#if RPI_INTER ++ ++#define MC_DUMMY_X (-32) ++#define MC_DUMMY_Y (-32) + +// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks +// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks +// For each block of 64*64 the smallest block size is 8x4 +// We also need an extra command for the setup information + -+#define RPI_CHROMA_COMMAND_WORDS 12 -+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS) ++#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 + -+#define RPI_LUMA_COMMAND_WORDS 10 -+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS) -+ +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + +// TODO Chroma only needs 4 taps + +// Actual filter goes -ve, +ve, +ve, -ve using these values -+static const uint32_t rpi_filter_coefs[8][1] = { -+ { ENCODE_COEFFS( 0, 64, 0, 0) }, -+ { ENCODE_COEFFS( 2, 58, 10, 2) }, -+ { ENCODE_COEFFS( 4, 54, 16, 2) }, -+ { ENCODE_COEFFS( 6, 46, 28, 4) }, -+ { ENCODE_COEFFS( 4, 36, 36, 4) }, -+ { ENCODE_COEFFS( 4, 28, 46, 6) }, -+ { ENCODE_COEFFS( 2, 16, 54, 4) }, -+ { ENCODE_COEFFS( 2, 10, 58, 2) } ++static const uint32_t rpi_filter_coefs[8] = { ++ ENCODE_COEFFS( 0, 64, 0, 0), ++ ENCODE_COEFFS( 2, 58, 10, 2), ++ ENCODE_COEFFS( 4, 54, 16, 2), ++ ENCODE_COEFFS( 6, 46, 28, 4), ++ ENCODE_COEFFS( 4, 36, 36, 4), ++ ENCODE_COEFFS( 4, 28, 46, 6), ++ ENCODE_COEFFS( 2, 16, 54, 4), ++ ENCODE_COEFFS( 2, 10, 58, 2) +}; + ++#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4))) ++ +#endif + + +#ifdef RPI_WORKER + ++typedef struct worker_global_env_s ++{ ++ volatile int arm_load; ++ pthread_mutex_t lock; ++ ++ unsigned int arm_y; ++ unsigned int arm_c; ++ unsigned int gpu_y; ++ unsigned int gpu_c; ++} worker_global_env_t; ++ ++static worker_global_env_t 
worker_global_env = ++{ ++ .lock = PTHREAD_MUTEX_INITIALIZER ++}; ++ ++ +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); + @@ -2606,17 +3711,7 @@ index b478065..88dd40b 100644 + break; + } + LOG_ENTER -+ // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10); -+ rpi_launch_vpu_qpu(s); -+ // Perform inter prediction -+ rpi_execute_inter_cmds(s); -+ // Wait for transform completion -+ vpu_wait(s->vpu_id); -+ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s); -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s); ++ worker_core(s); + + worker_complete_job(s); + LOG_EXIT @@ -2629,7 +3724,7 @@ index b478065..88dd40b 100644 /** * NOTE: Each function hls_foo correspond to the function foo in the * specification (HLS stands for High Level Syntax). -@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { @@ -2662,36 +3757,40 @@ index b478065..88dd40b 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; +#ifdef RPI -+ int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); -+ int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS; -+ int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; -+ int coefs_per_row = coefs_per_luma + coefs_per_chroma; ++ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); ++ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; ++ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; ++ const int coefs_per_row = coefs_per_luma + coefs_per_chroma; + int job; + + av_assert0(sps); -+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+ s->ctu_per_y_chan = s->max_ctu_count / 12; -+ s->ctu_per_uv_chan = s->max_ctu_count / 8; ++// s->max_ctu_count = sps->ctb_width; ++// printf("CTB with=%d\n", sps->ctb_width); ++// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; ++ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); ++ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; ++ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; ++ + for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. 
-+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } ++ for(job=0;jobcoeffs_buf_default[job]); ++ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; ++ if (!s->coeffs_buf_arm[job][0]) ++ goto fail; ++ ++ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data ++ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; ++ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; ++ if (!s->coeffs_buf_arm[job][2]) ++ goto fail; ++ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. ++ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; ++ } + } +#endif +#ifdef RPI_DEBLOCK_VPU @@ -2738,8 +3837,6 @@ index b478065..88dd40b 100644 + + dvq->uv_setup_arm = (void*)p_arm; + dvq->uv_setup_vc = (void*)p_vc; -+ -+ dvq->cmd_id = -1; + } + + s->dvq_n = 0; @@ -2750,7 +3847,7 @@ index b478065..88dd40b 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -137,6 +422,29 @@ fail: +@@ -137,6 +434,29 @@ fail: return AVERROR(ENOMEM); } @@ -2780,7 +3877,52 @@ index b478065..88dd40b 100644 static void pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s) +@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, + static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) + { + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) +- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; ++ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; + int ret, i; + + pic_arrays_free(s); +@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND128; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ret = ff_thread_get_format(s->avctx, pix_fmts); + if (ret < 0) + goto fail; ++ + s->avctx->pix_fmt = ret; + } + else { +@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + for(c_idx = 0; c_idx < c_count; c_idx++) { + int w = sps->width >> sps->hshift[c_idx]; + int h = sps->height >> sps->vshift[c_idx]; ++ // ******** Very very nasty allocation kludge for plaited Chroma + s->sao_pixel_buffer_h[c_idx] = +- av_malloc((w * 2 * sps->ctb_height) << ++ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << + sps->pixel_shift); + s->sao_pixel_buffer_v[c_idx] = +- av_malloc((h * 2 * sps->ctb_width) << ++ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << + sps->pixel_shift); + } + } +@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s) (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { pred_weight_table(s, gb); } @@ -2792,33 +3934,42 @@ index b478065..88dd40b 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand 
< 1 || sh->max_num_merge_cand > 5) { -@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -931,6 +1264,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ ++ // U & V done on U call in the case of sliced frames ++ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ return; ++ + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; + HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; + cmd->type = RPI_PRED_INTRA; + cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; -+ cmd->x = x0; -+ cmd->y = y0; + cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; -+ cmd->mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ } else { ++ cmd->c_idx = c_idx; ++ cmd->i_pred.x = x0; ++ cmd->i_pred.y = y0; ++ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ } ++ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); ++ } ++ else { + s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); + } ++ +} +#endif + static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -2831,7 +3982,7 @@ index b478065..88dd40b 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -2843,7 +3994,7 @@ index b478065..88dd40b 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -2855,7 +4006,7 @@ index b478065..88dd40b 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -2867,7 +4018,7 @@ index b478065..88dd40b 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -2879,7 +4030,7 @@ index b478065..88dd40b 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -2926,17 +4077,162 @@ index b478065..88dd40b 100644 } } } -@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -1275,47 +1675,120 @@ do { + return 0; + } + +-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) ++ ++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) + { +- HEVCLocalContext *lc = s->HEVClc; + GetBitContext gb; +- int cb_size = 1 << log2_cb_size; +- int stride0 = s->frame->linesize[0]; +- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +- int stride1 = s->frame->linesize[1]; +- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +- int stride2 = s->frame->linesize[2]; +- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +- +- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + +- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + +- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) * +- s->ps.sps->pcm.bit_depth_chroma; +- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3); + int ret; + +- if (!s->sh.disable_deblocking_filter_flag) +- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); +- + ret = init_get_bits(&gb, pcm, length); + if (ret < 0) + return ret; + +- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +- if (s->ps.sps->chroma_format_idc) { +- s->hevcdsp.put_pcm(dst1, stride1, ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), ++ s->frame->linesize[0], ++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ ++ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> 
s->ps.sps->vshift[1]), ++ s->frame->linesize[1], + cb_size >> s->ps.sps->hshift[1], + cb_size >> s->ps.sps->vshift[1], + &gb, s->ps.sps->pcm.bit_depth_chroma); +- s->hevcdsp.put_pcm(dst2, stride2, +- cb_size >> s->ps.sps->hshift[2], +- cb_size >> s->ps.sps->vshift[2], +- &gb, s->ps.sps->pcm.bit_depth_chroma); + } ++ else ++#endif ++ { ++ const int stride0 = s->frame->linesize[0]; ++ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; ++ const int stride1 = s->frame->linesize[1]; ++ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ const int stride2 = s->frame->linesize[2]; ++ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; ++ ++ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ if (s->ps.sps->chroma_format_idc) { ++ s->hevcdsp.put_pcm(dst1, stride1, ++ cb_size >> s->ps.sps->hshift[1], ++ cb_size >> s->ps.sps->vshift[1], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ s->hevcdsp.put_pcm(dst2, stride2, ++ cb_size >> s->ps.sps->hshift[2], ++ cb_size >> s->ps.sps->vshift[2], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ } + ++ } + return 0; + } + ++#ifdef RPI ++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) ++{ ++ int16_t * const coeffs = (buf_no != 3) ? ++ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : ++ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; ++ s->num_coeffs[s->pass0_job][buf_no] += n; ++ return coeffs; ++} ++#endif ++ ++// x * 2^(y*2) ++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) ++{ ++ return x << (y * 2); ++} ++ ++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size) ++{ ++ // Length in bits ++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]); ++ ++ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); ++ ++#ifdef RPI ++ if (s->enable_rpi) { ++ // Copy coeffs ++ const int blen = (length + 7) >> 3; ++ // Round allocated bytes up to nearest 32 to avoid alignment confusion ++ // Allocation is in int16_t s ++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per ++ // sample this rounding doesn't affect the total size we need to allocate for ++ // the coeff buffer ++ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1); ++ memcpy(coeffs, pcm, blen); ++ ++ // Our coeff stash assumes that any partially allocated 64byte lump ++ // is zeroed so make that true. 
++ { ++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; ++ if ((-(intptr_t)eopcm & 63) != 0) ++ memset(eopcm, 0, -(intptr_t)eopcm & 63); ++ } ++ ++ // Add command ++ { ++ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ cmd->type = RPI_PRED_I_PCM; ++ cmd->size = log2_cb_size; ++ cmd->i_pcm.src = coeffs; ++ cmd->i_pcm.x = x0; ++ cmd->i_pcm.y = y0; ++ cmd->i_pcm.src_len = length; ++ } ++ return 0; ++ } ++#endif ++ ++ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size); ++} ++ + /** + * 8.5.3.2.2.1 Luma sample unidirectional interpolation process + * +@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) * @param luma_offset additive offset applied to the luma prediction value */ -+#ifdef RPI_INTER -+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn) ++#if RPI_INTER +static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + AVFrame *ref, const Mv *mv, int x_off, int y_off, + int block_w, int block_h, int luma_weight, int luma_offset) +{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; ++ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; + cmd->cmd = RPI_CMD_LUMA_UNI; + cmd->dst = dst; + cmd->dststride = dststride; @@ -2953,9 +4249,10 @@ index b478065..88dd40b 100644 + +static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + AVFrame *ref0, const Mv *mv0, int x_off, int y_off, -+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) ++ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, ++ const struct MvField * const current_mv) +{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; ++ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; + cmd->cmd = RPI_CMD_LUMA_BI; + cmd->dst = dst; + cmd->dststride = dststride; @@ -2973,17 +4270,17 @@ index b478065..88dd40b 100644 + cmd->ref_idx[1] = current_mv->ref_idx[1]; +} + -+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist, -+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset) ++static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, ++ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, ++ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) +{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; ++ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; + cmd->cmd = RPI_CMD_CHROMA_UNI; + cmd->dst = dst0; + cmd->dststride = dststride; + cmd->src = src0; + cmd->srcstride = srcstride; -+ cmd->mv = current_mv->mv[reflist]; ++ cmd->mv = *mv; + cmd->x_off = x_off; + cmd->y_off = y_off; + cmd->block_w = block_w; @@ -2992,10 +4289,10 @@ index b478065..88dd40b 100644 + cmd->offset = chroma_offset; +} + -+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx) ++static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, ++ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) +{ -+ HEVCMvCmd *cmd = 
s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; ++ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; + cmd->cmd = RPI_CMD_CHROMA_BI+cidx; + cmd->dst = dst0; + cmd->dststride = dststride; @@ -3013,14 +4310,12 @@ index b478065..88dd40b 100644 + cmd->ref_idx[1] = current_mv->ref_idx[1]; +} + -+#else -+#define RPI_REDIRECT(fn) fn +#endif + static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, AVFrame *ref, const Mv *mv, int x_off, int y_off, int block_w, int block_h, int luma_weight, int luma_offset) -@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -3031,7 +4326,7 @@ index b478065..88dd40b 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -3040,7 +4335,7 @@ index b478065..88dd40b 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -3051,7 +4346,7 @@ index b478065..88dd40b 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -3062,7 +4357,7 @@ index b478065..88dd40b 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -3073,13 +4368,422 @@ index b478065..88dd40b 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, } } -static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int nPbW, int nPbH, - int log2_cb_size, int partIdx, int idx) ++ ++#if RPI_INTER ++ ++static HEVCRpiLumaPred * ++rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val) ++{ ++ 
HEVCRpiLumaPred * yp = s->curr_pred_y; ++ HEVCRpiLumaPred * ypt = yp + 1; ++ for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) { ++ if (ypt->load < yp->load) ++ yp = ypt; ++ } ++ ++// yp->load += load_val; ++ ++yp->load; ++ return yp; ++} ++ ++static void ++rpi_pred_y(HEVCContext *const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const Mv *const mv, ++ const int weight_mul, ++ const int weight_offset, ++ AVFrame *const src_frame) ++{ ++ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ ++// rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, ++// mv, x0, y0, nPbW, nPbH, ++// weight_mul, weight_offset); ++ ++ { ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const int x1_m3 = x0 + (mv->x >> 2) - 3; ++ const int y1_m3 = y0 + (mv->y >> 2) - 3; ++ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); ++ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); ++ ++ // Potentially we could change the assembly code to support taller sizes in one go ++ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16) ++ { ++ const uint32_t src_yx_y = y1_m3 + start_y; ++ int start_x = 0; ++ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++ ++#if 1 ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. ++ ++ qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx; ++ ++ last_y8_lx->next_src2_x = x1_m3; ++ last_y8_lx->next_src2_y = src_yx_y; ++ last_y8_lx->next_src2_base = src_vc_address_y; ++ last_y8_p->p.w += bw; ++ last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21); ++ last_y8_p->p.wo2 = wo; ++ ++ s->last_y8_p = NULL; ++ s->last_y8_lx = NULL; ++ start_x = bw; ++#if RPI_TSTATS ++ ++s->tstats.y_pred1_y8_merge; ++#endif ++ } ++#endif ++ ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); ++ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; ++ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ cmd_y[-1].next_fn = s->qpu_filter; ++ cmd_lx->next_src1_x = x1_m3 + start_x; ++ cmd_lx->next_src1_y = src_yx_y; ++ cmd_lx->next_src1_base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ cmd_lx->next_src2_x = MC_DUMMY_X; ++ cmd_lx->next_src2_y = MC_DUMMY_Y; ++ cmd_lx->next_src2_base = s->qpu_dummy_frame; ++ } ++ else ++ { ++ cmd_lx->next_src2_x = x1_m3 + start_x + 8; ++ cmd_lx->next_src2_y = src_yx_y; ++ cmd_lx->next_src2_base = src_vc_address_y; ++ } ++ cmd_y->p.w = bw; ++ cmd_y->p.h = bh; ++ cmd_y->p.mymx21 = my2_mx2_my_mx; ++ 
cmd_y->p.wo1 = wo; ++ cmd_y->p.wo2 = wo; ++ cmd_y->p.dst_addr = dst_addr + start_x; ++ yp->last_lx = cmd_y; ++ yp->qpu_mc_curr = cmd_y + 1; ++ ++ if (bw == 8) { ++ s->last_y8_lx = cmd_lx; ++ s->last_y8_p = cmd_y; ++ } ++ } ++ } ++ } ++} ++ ++static void ++rpi_pred_y_b(HEVCContext * const s, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const struct MvField *const mv_field, ++ AVFrame *const src_frame, ++ AVFrame *const src_frame2) ++{ ++ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; ++ ++// rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, ++// mv, x0, y0, nPbW, nPbH, ++// src_frame2, mv2, mv_field); ++ { ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const int x1 = x0 + (mv->x >> 2) - 3; ++ const int y1 = y0 + (mv->y >> 2) - 3; ++ const int x2 = x0 + (mv2->x >> 2) - 3; ++ const int y2 = y0 + (mv2->y >> 2) - 3; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + ++ s->sh.luma_offset_l1[ref_idx1] + 1; ++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ uint32_t dst = get_vc_address_y(s->frame) + y_off; ++ const uint32_t src1_base = get_vc_address_y(src_frame); ++ const uint32_t src2_base = get_vc_address_y(src_frame2); ++ ++ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H) ++ { ++ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); ++ ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); ++ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; ++ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ cmd_y[-1].next_fn = s->qpu_filter_b; ++ cmd_lx->next_src1_x = x1 + start_x; ++ cmd_lx->next_src1_y = y1 + start_y; ++ cmd_lx->next_src1_base = src1_base; ++ cmd_lx->next_src2_x = x2 + start_x; ++ cmd_lx->next_src2_y = y2 + start_y; ++ cmd_lx->next_src2_base = src2_base; ++ cmd_y->p.w = FFMIN(nPbW - start_x, 8); ++ cmd_y->p.h = bh; ++ cmd_y->p.mymx21 = my2_mx2_my_mx; ++ cmd_y->p.wo1 = wo1; ++ cmd_y->p.wo2 = wo2; ++ cmd_y->p.dst_addr = dst + start_x; ++ yp->last_lx = cmd_y; ++ yp->qpu_mc_curr = cmd_y + 1; ++ } ++ dst += s->frame->linesize[0] * 16; ++ } ++ } ++} ++ ++ ++static HEVCRpiChromaPred * ++rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val) ++{ ++ HEVCRpiChromaPred * cp = s->curr_pred_c; ++ HEVCRpiChromaPred * cpt = cp + 1; ++ for (unsigned int i = 1; i != QPU_N_GRP_UV; ++i, ++cpt) { ++ if (cpt->load < cp->load) ++ cp = cpt; ++ } ++ // Actual use of load_val is noticably better but we haven't sorted Q length problems yet ++ ++cp->load; ++// cp->load += 
load_val; ++ return cp; ++} ++ ++static void ++rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const Mv * const mv, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ AVFrame * const src_frame) ++{ ++ ++ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); ++#if 0 ++ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ ++ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], ++ x0_c, y0_c, nPbW_c, nPbH_c, mv, ++ c_weights[0], c_offsets[0]); ++ ++ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], ++ x0_c, y0_c, nPbW_c, nPbH_c, mv, ++ c_weights[1], c_offsets[1]); ++#endif ++ { ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; ++ ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const uint32_t src_base_u = get_vc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ ++ for(int start_y=0;start_y < nPbH_c;start_y+=16) ++ { ++ const int bh = FFMIN(nPbH_c-start_y, 16); ++ ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) ++ { ++ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3); ++ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; ++ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ u[-1].next_fn = s->qpu_filter_uv; ++ last_l0->next_src_x = x1_c + start_x; ++ last_l0->next_src_y = y1_c + start_y; ++ last_l0->next_src_base_c = src_base_u; ++ u[0].p.h = bh; ++ u[0].p.w = bw; ++ u[0].p.coeffs_x = x_coeffs; ++ u[0].p.coeffs_y = y_coeffs; ++ u[0].p.wo_u = wo_u; ++ u[0].p.wo_v = wo_v; ++ u[0].p.dst_addr_c = dst_base_u + start_x * 2; ++ cp->last_l0 = u; ++ cp->qpu_mc_curr = u + 1; ++ } ++ ++ dst_base_u += s->frame->linesize[1] * 16; ++ } ++ } ++ return; ++} ++ ++static void ++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const struct MvField * const mv_field, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ const int16_t * const c_weights2, ++ const int16_t * const c_offsets2, ++ AVFrame * const src_frame, ++ AVFrame * const src_frame2) ++{ ++ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); ++#if 0 ++ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, ++ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0); ++ ++ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2, ++ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); ++#endif ++ { ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; ++ ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - 
vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ ++ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { ++ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); ++ ++ // We are allowed 3/4 powers of two as well as powers of 2 ++ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) { ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3); ++ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; ++ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; ++ qpu_mc_pred_c_t * const last_l1 = cp->last_l1; ++ ++ u[-1].next_fn = s->qpu_filter_uv_b0; ++ last_l0->next_src_x = x1_c + start_x; ++ last_l0->next_src_y = y1_c + start_y; ++ last_l0->next_src_base_c = get_vc_address_u(src_frame); ++ ++ u[0].next_fn = 0; // Ignored - 2 block cmd ++ u[0].next_src_x = x2_c + start_x; ++ u[0].next_src_y = y2_c + start_y; ++ u[0].next_src_base_c = get_vc_address_u(src_frame2); ++ ++ u[0].b0.h = (bh<16 ? bh : 16); ++ u[0].b0.w = (bwnext_src_x = x2_c + start_x; ++ last_l1->next_src_y = y2_c + start_y; ++ last_l1->next_src_base_c = get_vc_address_u(src_frame2); ++ ++ u[1].b1.dummy0 = 0; // w,h inherited from b0 ++ u[1].b1.coeffs_x = coefs1_x; ++ u[1].b1.coeffs_y = coefs1_y; ++ u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); ++ u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); ++ u[1].b1.dst_addr_c = dst_base_u + start_x * 2; ++ ++ cp->last_l0 = u; ++ cp->last_l1 = u + 1; ++ cp->qpu_mc_curr = u + 2; ++ } ++ ++ dst_base_u += s->frame->linesize[1] * 16; ++ } ++ } ++} ++#endif ++ ++ ++ +static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0, + const int nPbW, const int nPbH, + const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) @@ -3092,7 +4796,7 @@ index b478065..88dd40b 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -3102,315 +4806,112 @@ index b478065..88dd40b 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, -+#ifdef RPI_LUMA_QPU ++#if RPI_INTER + if (s->enable_rpi) { -+ const Mv * const mv = ¤t_mv.mv[0]; -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 
16) | my_mx; -+ const int x1_m3 = x0 + (mv->x >> 2) - 3; -+ const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame); -+ uint32_t * y = s->curr_y_mvs; -+ -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16); -+ -+ for(int start_x=0;start_x < nPbW;start_x+=16) { -+ const int bw = nPbW-start_x; -+ const int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]]; -+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1; -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter; -+ } -+ } -+ s->curr_y_mvs = y; ++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0, ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); + } else +#endif + { -+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame, ++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, ¤t_mv.mv[0], x0, y0, nPbW, nPbH, s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]]); + } if (s->ps.sps->chroma_format_idc) { -- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], -+#ifdef RPI_INTER_QPU -+ if (s->enable_rpi) { -+ int hshift = s->ps.sps->hshift[1]; -+ int vshift = s->ps.sps->vshift[1]; -+ const Mv *mv = ¤t_mv.mv[0]; -+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ intptr_t _mx = mx << (1 - hshift); -+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ -+ int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ uint32_t *u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ int bw = nPbW_c-start_x; -+ int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame); -+ *u++ = ( (bwsh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]); -+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); + return; + } +#endif -+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], + 
chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2], -+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2], - 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]); - } -@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, -+#ifdef RPI_LUMA_QPU ++#if RPI_INTER + if (s->enable_rpi) { -+ const int reflist = 1; -+ const Mv *mv = ¤t_mv.mv[reflist]; -+ int mx = mv->x & 3; -+ int my = mv->y & 3; -+ int my_mx = (my<<8) + mx; -+ int my2_mx2_my_mx = (my_mx << 16) + my_mx; -+ int x1 = x0 + (mv->x >> 2); -+ int y1 = y0 + (mv->y >> 2); -+ uint32_t *y = s->curr_y_mvs; -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ for(int start_x=0;start_x < nPbW;start_x+=16) { -+ int bw = nPbW-start_x; -+ int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? 
bh : 16); -+ *y++ = my2_mx2_my_mx; -+ *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]]; -+ *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1; -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter; -+ } -+ } -+ s->curr_y_mvs = y; ++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1, ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); + } else +#endif -+ + { -+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame, ++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, ¤t_mv.mv[1], x0, y0, nPbW, nPbH, s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]]); + } if (s->ps.sps->chroma_format_idc) { -- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], -+#ifdef RPI_INTER_QPU ++#if RPI_INTER + if (s->enable_rpi) { -+ const int reflist = 1; -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; -+ const Mv * const mv = ¤t_mv.mv[reflist]; -+ const intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ const intptr_t _mx = mx << (1 - hshift); -+ const intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ uint32_t * u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ const int bw = nPbW_c-start_x; -+ const int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame); -+ *u++ = ( (bwsh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]); -+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; ++ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); + return; + } +#endif -+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], + chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); - -- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2], -+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2], - 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]); - } -@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1802,11 +2818,31 @@ static 
void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, -+#ifdef RPI_LUMA_QPU -+ if (s->enable_rpi && 0) { -+ const Mv *mv = ¤t_mv.mv[0]; -+ int mx = mv->x & 3; -+ int my = mv->y & 3; -+ int my_mx = (my<<8) + mx; -+ const Mv *mv2 = ¤t_mv.mv[1]; -+ int mx2 = mv2->x & 3; -+ int my2 = mv2->y & 3; -+ int my2_mx2 = (my2<<8) + mx2; -+ int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx; -+ int x1 = x0 + (mv->x >> 2); -+ int y1 = y0 + (mv->y >> 2); -+ int x2 = x0 + (mv2->x >> 2); -+ int y2 = y0 + (mv2->y >> 2); -+ uint32_t *y = s->curr_y_mvs; -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time -+ int bw = nPbW-start_x; -+ int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame); -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1 -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ -+ *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]], -+ s->sh.luma_weight_l0[current_mv.ref_idx[0]]); -+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] + -+ s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1; -+ -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b; -+ } -+ } -+ s->curr_y_mvs = y; ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); + } else +#endif + { -+ RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame, ++ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, ¤t_mv.mv[0], x0, y0, nPbW, nPbH, ref1->frame, ¤t_mv.mv[1], ¤t_mv); + } if (s->ps.sps->chroma_format_idc) { -- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, -+#ifdef RPI_INTER_QPU ++#if RPI_INTER + if (s->enable_rpi) { -+ int hshift = s->ps.sps->hshift[1]; -+ int vshift = s->ps.sps->vshift[1]; -+ const Mv *mv = ¤t_mv.mv[0]; -+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ intptr_t _mx = mx << (1 - hshift); -+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ const Mv *mv2 = ¤t_mv.mv[1]; -+ intptr_t mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ intptr_t my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ intptr_t _mx2 = mx2 << (1 - hshift); -+ intptr_t _my2 = my2 << (1 - vshift); // Fractional part of motion vector -+ -+ int x2_c = x0_c + (mv2->x >> (2 + hshift)); -+ int y2_c = y0_c + (mv2->y >> (2 + hshift)); -+ -+ -+ uint32_t *u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ int bw = nPbW_c-start_x; -+ int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = 
get_vc_address_v(ref0->frame); -+ *u++ = ( (bwsh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U -+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V -+ *u++ = 0; // Intermediate results are not written back in first pass of B filtering -+ *u++ = 0; -+ -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame); -+ *u++ = ( (bwsh.chroma_offset_l0[current_mv.ref_idx[0]][0] + -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]); -+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] + -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; ++ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c, ++ ¤t_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref0->frame, ++ ref1->frame); + return; + } +#endif -+ RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, + chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -- chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame, -+ RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame, - x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 1); - } - } -@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) + intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); + ret = hls_pcm_sample(s, x0, y0, log2_cb_size); + if (s->ps.sps->pcm.loop_filter_disable_flag) ++ { + set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } + + if (ret < 0) + return ret; +@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } @@ -3427,6 +4928,7 @@ index b478065..88dd40b 100644 + s->num_dblk_cmds[job] = 0; +} + ++#if 0 +static void rpi_execute_transform(HEVCContext *s) +{ + int i=2; @@ -3442,7 +4944,7 @@ index b478065..88dd40b 100644 + s->hevcdsp.idct[5-2](coeffs, 32); + }*/ + -+ gpu_cache_flush(&s->coeffs_buf_accelerated[job]); ++ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); + s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], + s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], + s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); @@ -3453,12 +4955,16 @@ index b478065..88dd40b 100644 + for(i=0;i<4;i++) + s->num_coeffs[job][i] = 0; +} ++#endif + -+static void rpi_execute_pred_cmds(HEVCContext *s) ++ ++// I-pred, transform_and_add for all blocks 
types done here ++// All ARM ++static void rpi_execute_pred_cmds(HEVCContext * const s) +{ + int i; + int job = s->pass1_job; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[job]; ++ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; +#ifdef RPI_WORKER + HEVCLocalContextIntra *lc = &s->HEVClcIntra; +#else @@ -3466,43 +4972,65 @@ index b478065..88dd40b 100644 +#endif + + for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { -+ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); -+ if (cmd->type == RPI_PRED_INTRA) { -+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode; -+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; -+ lc->na.cand_left = (cmd->na >> 3) & 1; -+ lc->na.cand_up_left = (cmd->na >> 2) & 1; -+ lc->na.cand_up = (cmd->na >> 1) & 1; -+ lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx); -+ } else { ++// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++ ++ switch (cmd->type) ++ { ++ case RPI_PRED_INTRA: ++ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; ++ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; ++ lc->na.cand_left = (cmd->na >> 3) & 1; ++ lc->na.cand_up_left = (cmd->na >> 2) & 1; ++ lc->na.cand_up = (cmd->na >> 1) & 1; ++ lc->na.cand_up_right = (cmd->na >> 0) & 1; ++ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ else ++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ break; ++ ++ case RPI_PRED_ADD_RESIDUAL: ++ s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +#ifdef RPI_PRECLEAR -+ int trafo_size = 1 << cmd->size; -+#endif -+ s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride); -+#ifdef RPI_PRECLEAR -+ memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache ++ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache +#endif ++ break; ++ case RPI_PRED_ADD_RESIDUAL_U: ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_V: ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ ++ case RPI_PRED_I_PCM: ++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); ++ break; ++ ++ default: ++ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); ++ abort(); + } + } + s->num_pred_cmds[job] = 0; +} + -+static void rpi_execute_inter_cmds(HEVCContext *s) ++// Do any inter-pred that we want to do in software ++// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here ++// All ARM ++static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) +{ -+ int job = s->pass1_job; -+ HEVCMvCmd *cmd = s->unif_mv_cmds[job]; -+ int n,cidx; ++ unsigned int cidx; + AVFrame myref; + AVFrame myref1; + struct MvField mymv; -+ if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) { -+ printf("Overflow inter_cmds\n"); -+ exit(-1); -+ } -+ for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) { ++ ++ for(; n>0 ; n--, cmd++) { ++ av_assert0(0); ++ + switch(cmd->cmd) { + case RPI_CMD_LUMA_UNI: ++ if (b_only) ++ break; + myref.data[0] = cmd->src; + myref.linesize[0] = cmd->srcstride; + luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, 
&cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); @@ -3519,6 +5047,8 @@ index b478065..88dd40b 100644 + &myref1, &cmd->mv1, &mymv); + break; + case RPI_CMD_CHROMA_UNI: ++ if (b_only) ++ break; + mymv.mv[0] = cmd->mv; + chroma_mc_uni(s, cmd->dst, + cmd->dststride, cmd->src, cmd->srcstride, 0, @@ -3540,618 +5070,385 @@ index b478065..88dd40b 100644 + break; + } + } -+ s->num_mv_cmds[job] = 0; +} + -+static void rpi_do_all_passes(HEVCContext *s) ++static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) +{ -+ // Kick off QPUs and VPUs -+ rpi_launch_vpu_qpu(s); -+ // Perform luma inter prediction -+ rpi_execute_inter_cmds(s); -+ // Wait for transform completion -+ vpu_wait(s->vpu_id); -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s); -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s); -+ // Prepare next batch -+ rpi_begin(s); ++ const int job = s->pass1_job; ++ ++ if (!qpu_luma || luma_b_only) ++ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); ++ s->num_mv_cmds_y[job] = 0; ++ if (!qpu_chroma || chroma_b_only) ++ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); ++ s->num_mv_cmds_c[job] = 0; +} + +#endif + +#ifdef RPI ++// Set initial uniform job values & zero ctu_count +static void rpi_begin(HEVCContext *s) +{ ++#if RPI_INTER + int job = s->pass0_job; + int i; -+#ifdef RPI_INTER_QPU -+ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1]; + -+ for(i=0;i<8;i++) { -+ s->u_mvs[job][i] = s->mvs_base[job][i]; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = pic_width; -+ *s->u_mvs[job][i]++ = pic_height; -+ *s->u_mvs[job][i]++ = s->frame->linesize[1]; -+ *s->u_mvs[job][i]++ = s->frame->linesize[2]; -+ *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = i; // Select section of VPM (avoid collisions with 3d unit) -+ } -+ s->curr_u_mvs = s->u_mvs[job][0]; -+#endif ++ const uint16_t pic_width_y = s->ps.sps->width; ++ const uint16_t pic_height_y = s->ps.sps->height; + -+#ifdef RPI_LUMA_QPU -+ for(i=0;i<12;i++) { -+ // This needs to have a generally similar structure to the -+ // actual filter code as various pipelined bits need to land correctly -+ // when inserted by the filter requests -+ s->y_mvs[job][i] = s->y_mvs_base[job][i]; -+ *s->y_mvs[job][i]++ = 0; // y_x -+ *s->y_mvs[job][i]++ = 0; // ref_y_base -+ *s->y_mvs[job][i]++ = 0; // y2_x2 -+ *s->y_mvs[job][i]++ = 0; // ref_y2_base -+ *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height; -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch -+ *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6; // weight demon + 6 -+ *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block -+ *s->y_mvs[job][i]++ = 0; // Next kernel ++ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; ++ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; ++ ++ for(i=0; i < QPU_N_UV;i++) { ++ HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i; ++ qpu_mc_pred_c_t * u = cp->qpu_mc_base; ++ ++ // Chroma setup is a double block with L0 fetch ++ // and other stuff in the 1st 
block and L1 fetch ++ // in the 2nd along with a lot of dummy vars ++ // This could be packed a lot tighter but it would make ++ // L0, L1 management a lot harder ++ ++ u->next_fn = 0; ++ u->next_src_x = 0; ++ u->next_src_y = 0; ++ u->next_src_base_c = 0; ++ u->s0.pic_cw = pic_width_c; ++ u->s0.pic_ch = pic_height_c; ++ u->s0.stride2 = rpi_sliced_frame_stride2(s->frame); ++ u->s0.stride1 = s->frame->linesize[1]; ++ u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6; ++ u->s0.dummy0 = 0; ++ cp->last_l0 = u; ++ ++u; ++ ++ u->next_fn = 0; ++ u->next_src_x = 0; ++ u->next_src_y = 0; ++ u->next_src_base_c = 0; ++ u->s1.dummy0 = 0; ++ u->s1.dummy1 = 0; ++ u->s1.dummy2 = 0; ++ u->s1.dummy3 = 0; ++ u->s1.dummy4 = 0; ++ u->s1.dummy5 = 0; ++ cp->last_l1 = u; ++ ++u; ++ ++ cp->load = 0; ++ cp->qpu_mc_curr = u; + } -+ s->curr_y_mvs = s->y_mvs[job][0]; ++ s->curr_pred_c = NULL; ++ ++ for(i=0;i < QPU_N_Y;i++) { ++ HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i; ++ qpu_mc_pred_y_t * y = yp->qpu_mc_base; ++ ++ y->next_src1_x = 0; ++ y->next_src1_y = 0; ++ y->next_src1_base = 0; ++ y->next_src2_x = 0; ++ y->next_src2_y = 0; ++ y->next_src2_base = 0; ++ y->s.pic_h = pic_height_y; ++ y->s.pic_w = pic_width_y; ++ y->s.stride2 = rpi_sliced_frame_stride2(s->frame); ++ y->s.stride1 = s->frame->linesize[0]; ++ y->s.wdenom = s->sh.luma_log2_weight_denom + 6; ++ y->s.dummy0 = 0; ++ y->next_fn = 0; ++ yp->last_lx = y; ++ ++y; ++ ++ yp->load = 0; ++ yp->qpu_mc_curr = y; ++ } ++ s->curr_pred_y = NULL; ++ s->last_y8_p = NULL; ++ s->last_y8_lx = NULL; +#endif + s->ctu_count = 0; +} +#endif + -+#ifdef RPI_SIMULATE_QPUS + -+static int32_t clipx(int x,int FRAME_WIDTH) ++#if RPI_INTER ++static unsigned int mc_terminate_y(HEVCContext * const s, const int job) +{ -+ if (x<=0) return 0; -+ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1; -+ return x; -+} ++ unsigned int i; ++ const uint32_t exit_fn = qpu_fn(mc_exit); ++ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12); ++ unsigned int tc = 0; ++ HEVCRpiJob * const jb = s->jobs + job; + -+static int32_t clipy(int y,int FRAME_HEIGHT) -+{ -+ if (y<=0) return 0; -+ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1; -+ return y; -+} ++ // Add final commands to Q ++ for(i = 0; i != QPU_N_Y; ++i) { ++ HEVCRpiLumaPred * const yp = jb->luma_mvs + i; ++ qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx; + -+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset) -+{ -+ int32_t vsum = 0; -+ int x, y; ++ // We will always have had L0 if we have L1 so only test L0 ++ if (px != yp->qpu_mc_base) ++ tc = 1; + -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; ++ yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr + -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch]; ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ px->next_src1_x = MC_DUMMY_X; ++ px->next_src1_y = MC_DUMMY_Y; ++ px->next_src1_base = s->qpu_dummy_frame; ++ px->next_src2_x = MC_DUMMY_X; ++ px->next_src2_y = MC_DUMMY_Y; ++ px->next_src2_base = s->qpu_dummy_frame; + -+ vsum += lumaFilter[my][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+round)>>denom)+offset; -+ -+ return av_clip_uint8( vsum ); -+}*/ -+ -+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int chromaFilterH[4]; -+ int chromaFilterV[4]; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ for(i=0;i<4;i++) { -+ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24; -+ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24; -+ } -+ -+ for (y = 0; y < 4; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 4; x++) -+ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += chromaFilterV[y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} }; -+ -+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx) -+{ -+ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx]; -+ int pitch = frame->linesize[cIdx]; -+ uint32_t base = cIdx == 0 ? get_vc_address_y(frame) : -+ cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame); -+ if (p>=base && pdata[cIdx] + (p-base); -+ } -+ return NULL; -+} -+ -+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx) -+{ -+ SliceHeader *sh = &s->sh; -+ uint8_t *arm = test_frame(s,p,s->frame,cIdx); -+ int i; -+ if (arm) return arm; -+ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE) -+ { -+ for(i=0;inb_refs[L0];i++) { -+ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx); -+ if (arm) return arm; ++ yp->last_lx = NULL; + } -+ } -+ if (sh->slice_type == B_SLICE) { -+ for(i=0;inb_refs[L1];i++) { -+ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx); -+ if (arm) return arm; -+ } -+ } -+ printf("Frame 0x%x not found! 
Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT)); -+ exit(-1); -+ return NULL; ++ ++ return tc; +} + -+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p) ++#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c ++#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n) ++ ++static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) +{ -+ uint32_t next_kernel; -+ uint32_t x0; -+ uint32_t y0; -+ uint8_t *ref_u_base; -+ uint8_t *ref_v_base; -+ uint32_t frame_width = p[5]; -+ uint32_t frame_height = p[6]; -+ uint32_t pitch = p[7]; -+ uint32_t dst_pitch = p[8]; -+ int32_t offset_before = p[9]; -+ int32_t denom = p[10]; -+ uint32_t vpm_id = p[11]; -+ uint32_t tmp_u_dst[256]; -+ uint32_t tmp_v_dst[256]; -+ while(1) { -+ p += 12; -+ next_kernel = p[0-12]; -+ x0 = p[1-12]; -+ y0 = p[2-12]; -+ if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) { -+ int x,y; -+ uint32_t width_height = p[5]; -+ uint32_t hcoeffs = p[6]; -+ uint32_t vcoeffs = p[7]; -+ uint32_t offset_weight_u = p[8]; -+ uint32_t offset_weight_v = p[9]; -+ uint8_t *this_u_dst; -+ uint8_t *this_v_dst; -+ uint32_t width = width_height >> 16; -+ uint32_t height = (width_height << 16) >> 16; -+ ref_u_base = compute_arm_addr(s,p[3-12],1); -+ ref_v_base = compute_arm_addr(s,p[4-12],2); -+ if (next_kernel!=s->mc_filter_uv_b0) -+ { -+ this_u_dst = compute_arm_addr(s,p[10],1); -+ this_v_dst = compute_arm_addr(s,p[11],2); -+ } -+ for (y=0; ymc_filter_uv) { -+ int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height); -+ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height); -+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa); -+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } else if (next_kernel==s->mc_filter_uv_b0) { -+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height); -+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height); -+ tmp_u_dst[x+y*16] = refa; -+ tmp_v_dst[x+y*16] = refb; -+ } else { -+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height); -+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height); -+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa); -+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } -+ } -+ } -+ } else { -+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); -+ break; ++ unsigned int i; ++ const uint32_t exit_fn = qpu_fn(mc_exit_c); ++ const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV)); ++ unsigned int tc = 0; ++ HEVCRpiJob * const jb = s->jobs + job; ++ ++ // Add final commands to Q ++ for(i = 0; i != QPU_N_UV; ++i) { ++ HEVCRpiChromaPred * const cp = jb->chroma_mvs + i; ++ qpu_mc_pred_c_t *const p0 = cp->last_l0; ++ qpu_mc_pred_c_t *const p1 = cp->last_l1; ++ ++ // We will always have had L0 if we have L1 so only test L0 ++ if (p0 != cp->qpu_mc_base) ++ tc = 1; ++ ++ cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->next_src_x = MC_DUMMY_X; ++ p0->next_src_y = MC_DUMMY_Y; ++ p0->next_src_base_c = s->qpu_dummy_frame; ++ p1->next_src_x = MC_DUMMY_X; ++ p1->next_src_y = MC_DUMMY_Y; ++ p1->next_src_base_c = s->qpu_dummy_frame;; ++ ++ cp->last_l0 = NULL; ++ cp->last_l1 = NULL; + } -+ } ++ ++ return tc; +} -+ -+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel) -+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan) -+{ -+ uint32_t next_kernel; -+ int y_x,y2_x2; -+ int x0; -+ int y0; -+ int x2; -+ int y2; -+ uint32_t *p0 = p; -+ uint8_t *ref_y_base; -+ uint8_t *ref_y2_base; -+ uint32_t frame_width_height = p[4]; -+ uint32_t frame_width = frame_width_height>>16; -+ uint32_t frame_height = (frame_width_height<<16)>>16; -+ uint32_t pitch = p[5]; -+ uint32_t dst_pitch = p[6]; -+ int offset_shift = p[7]; -+ int32_t offset_before = offset_shift>>16; -+ int32_t denom = (offset_shift<<16)>>16; -+ while(1) { -+ p += 9; -+ next_kernel = p[8-9]; -+ y_x = p[0-9]; -+ x0 = (y_x<<16)>>16; -+ y0 = y_x>>16; -+ y2_x2 = p[2-9]; -+ x2 = (y2_x2<<16)>>16; -+ y2 = y2_x2>>16; -+ -+ if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) { -+ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+ int x,y; -+ uint32_t width_height = p[4]; -+ uint32_t my2_mx2_my_mx = p[5]; -+ uint32_t offset_weight = p[6]; -+ uint8_t *this_dst = compute_arm_addr(s,p[7],0); -+ uint32_t width = width_height >> 16; -+ uint32_t height = (width_height << 16) >> 16; -+ uint8_t *dst_base = s->frame->data[0]; -+ ref_y_base = compute_arm_addr(s,p[1-9],0); -+ ref_y2_base = compute_arm_addr(s,p[3-9],0); -+ for (y=0; ymc_filter) { -+ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height); -+ refa = av_clip_uint8(refa); -+ this_dst[x+y*dst_pitch] = refa; -+ } -+ else { -+ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height); -+ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height); -+ this_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } -+ } -+ } -+ } else { -+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); -+ break; -+ } -+ } -+} -+ -+static void rpi_simulate_inter_qpu(HEVCContext *s) -+{ -+ // First run the transform as normal -+ int i; -+ rpi_execute_transform(s); -+ for(i=0;i<8;i++) -+ { -+ rpi_simulate_inter_chroma(s,s->mvs_base[i]); -+ } -+ for(i=0;i<12;i++) -+ { -+ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i); -+ } -+} -+ +#endif + -+#ifdef RPI_INTER_QPU ++#ifdef RPI + -+static void rpi_launch_vpu_qpu(HEVCContext *s) ++ ++static void flush_frame(HEVCContext *s,AVFrame *frame) +{ -+ int k; -+ int job = s->pass1_job; -+ int i; -+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc; -+#ifdef RPI_LUMA_QPU -+ uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc; -+#endif -+ if (s->sh.slice_type == I_SLICE) { -+#ifdef RPI_MULTI_MAILBOX -+ rpi_execute_transform(s); -+ return; -+#endif -+ } -+ for(k=0;k<8;k++) { -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our 
code) - this is needed as the texture requests are pipelined -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V -+ av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU); -+ } ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ rpi_cache_flush_finish(rfe); ++} + -+ s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore + -+#ifdef RPI_LUMA_QPU -+ for(k=0;k<12;k++) { -+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined -+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request -+ s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform) -+ av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU); -+ } -+ s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore -+#endif ++// Core execution tasks ++static void worker_core(HEVCContext * const s) ++{ ++ worker_global_env_t * const wg = &worker_global_env; ++ int arm_cost = 0; ++// vpu_qpu_wait_h sync_c; ++ vpu_qpu_wait_h sync_y; ++ int qpu_luma = 0; ++ int qpu_chroma = 0; ++ int gpu_load; ++ int arm_load; ++ static const int arm_const_cost = 2; + -+#ifdef RPI_SIMULATE_QPUS -+ rpi_simulate_inter_qpu(s); -+ return; -+#endif ++// static int z = 0; + -+#ifdef RPI_MULTI_MAILBOX -+#ifdef RPI_CACHE_UNIF_MVS -+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job); -+#else -+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job); -+#endif ++ const int job = s->pass1_job; ++ unsigned int flush_start = 0; ++ unsigned int flush_count = 0; + -+#if 1 -+ { -+ unsigned int i; -+ uint32_t * p; -+ uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV); -+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; -+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; ++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + -+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { -+ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ code = qpu_get_fn(QPU_MC_SETUP); -+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { -+ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(), ++ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(), + vpu_get_constants(), + s->coeffs_buf_vc[job][2], + s->num_coeffs[job][2] >> 8, + s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], + s->num_coeffs[job][3] >> 10, -+ 0, -+ // QPU job 1 -+ QPU_N_UV, -+ mail_uv, -+ // QPU job 2 -+ QPU_N_Y, -+ mail_y -+ ); ++ 0); ++ ++ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); + } + ++ ++#if RPI_INTER ++ pthread_mutex_lock(&wg->lock); ++ ++// ++z; ++ gpu_load = vpu_qpu_current_load(); ++ arm_load = avpriv_atomic_int_get(&wg->arm_load); ++#if 0 // Y_B_ONLY ++ qpu_luma = gpu_load + 2 < arm_load; ++ qpu_chroma = gpu_load < arm_load 
+ 8; ++#elif 0 ++ qpu_luma = gpu_load < arm_load + 2; ++ qpu_chroma = gpu_load < arm_load + 8; +#else -+ s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0, -+ qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+#ifdef RPI_LUMA_QPU -+ qpu_get_fn(QPU_MC_SETUP), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)) -+#else -+ 0, -+ 0,0,0,0, -+ 0,0,0,0, -+ 0,0,0,0 -+#endif -+ ); -+#endif -+ for(i=0;i<4;i++) -+ s->num_coeffs[job][i] = 0; -+#else -+#error Code rotted here -+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)) -+ ); ++ qpu_chroma = 1; ++ qpu_luma = 1; +#endif + ++ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); + ++ wg->gpu_c += qpu_chroma; ++ wg->gpu_y += qpu_luma; ++ wg->arm_c += !qpu_chroma; ++ wg->arm_y += !qpu_luma; ++ ++ ++// if ((z & 511) == 0) { ++// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); ++// } ++ ++ ++ { ++ int (*d)[2] = 
s->dblk_cmds[job];
++ unsigned int high=(*d)[1];
++ int n;
++
++ flush_start = high;
++ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
++ unsigned int y = (*d)[1];
++ flush_start = FFMIN(flush_start, y);
++ high=FFMAX(high,y);
++ }
++ // Avoid flushing past end of frame
++ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start;
++ }
++
++#if !DISABLE_CHROMA
++ if (qpu_chroma && mc_terminate_uv(s, job) != 0)
++ {
++ HEVCRpiJob * const jb = s->jobs + job;
++ const uint32_t code = qpu_fn(mc_setup_c);
++ uint32_t * p;
++ unsigned int i;
++ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
++
++ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
++ *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm);
++ *p++ = code;
++ }
++
++ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv);
++
++#if RPI_CACHE_UNIF_MVS
++ rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++#endif
++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1);
++ }
++#endif
++
++// We can take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++// vpu_qpu_job_add_sync_this(vqj, &sync_c);
++
++ if (qpu_luma && mc_terminate_y(s, job) != 0)
++ {
++ HEVCRpiJob * const jb = s->jobs + job;
++ const uint32_t code = qpu_fn(mc_setup);
++ uint32_t * p;
++ unsigned int i;
++ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
++
++ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
++ *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm);
++ *p++ = code;
++ }
++
++ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y);
++
++#if RPI_CACHE_UNIF_MVS
++ rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++#endif
++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0);
++ }
++
++ pthread_mutex_unlock(&wg->lock);
++
++#endif
++
++ vpu_qpu_job_add_sync_this(vqj, &sync_y);
++
++ // Having accumulated some commands - do them
++ rpi_cache_flush_finish(rfe);
++ vpu_qpu_job_finish(vqj);
++
++ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //????
Surely we haven't done the smaller ++ ++#if Y_B_ONLY ++ if (qpu_luma) ++ vpu_qpu_wait(&sync_y); ++#endif ++ // Perform inter prediction ++ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); ++ ++ // Wait for transform completion ++ ++ // Perform intra prediction and residual reconstruction ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); ++#if Y_B_ONLY ++ if (!qpu_luma) ++ vpu_qpu_wait(&sync_y); ++#else ++ vpu_qpu_wait(&sync_y); ++#endif ++ rpi_execute_pred_cmds(s); ++ ++ // Perform deblocking for CTBs in this row ++ rpi_execute_dblk_cmds(s); ++ ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +} -+#else + -+#ifdef RPI -+static void rpi_launch_vpu_qpu(HEVCContext *s) ++static void rpi_do_all_passes(HEVCContext *s) +{ -+ rpi_execute_transform(s); -+} -+#endif -+ -+#endif -+ -+#ifdef RPI -+ -+#ifndef RPI_FAST_CACHEFLUSH -+#error RPI_FAST_CACHEFLUSH is broken -+static void flush_buffer(AVBufferRef *bref) { -+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref); -+ gpu_cache_flush(p); -+} -+#endif -+ -+static void flush_frame(HEVCContext *s,AVFrame *frame) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame); -+ int n = s->ps.sps->height; -+ int curr_y = 0; -+ int curr_uv = 0; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)(p.arm) + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)(p.arm) + base; -+ iocache.s[1].size = sz; -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)(p.arm) + base; -+ iocache.s[2].size = sz; -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer(frame->buf[0]); -+ flush_buffer(frame->buf[1]); -+ flush_buffer(frame->buf[2]); -+#endif ++ // Do the various passes - common with the worker code ++ worker_core(s); ++ // Prepare next batch ++ rpi_begin(s); +} + -+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int n; -+ int curr_y; -+ int curr_uv; -+ int n_uv; -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame); -+ int sz,base; -+ int (*d)[2] = s->dblk_cmds[job]; -+ int low=(*d)[1]; -+ int high=(*d)[1]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { -+ int y = (*d)[1]; -+ low=FFMIN(low,y); -+ high=FFMAX(high,y); -+ } -+ curr_y = low; -+ n = high+(1 << s->ps.sps->log2_ctb_size); -+ curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ n_uv = n >> s->ps.sps->vshift[1]; + -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)(p.arm) + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)(p.arm) + base; -+ iocache.s[1].size = sz; -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = 
s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)(p.arm) + base; -+ iocache.s[2].size = sz; -+ -+ iocache.s[3].handle = p0->vcsm_handle; -+ iocache.s[3].cmd = 3; // clean+invalidate -+ iocache.s[3].addr = (int) p0->arm; -+ iocache.s[3].size = p0->numbytes; -+ if (p1) { -+ iocache.s[4].handle = p1->vcsm_handle; -+ iocache.s[4].cmd = 3; // clean+invalidate -+ iocache.s[4].addr = (int) p1->arm; -+ iocache.s[4].size = p1->numbytes; -+ } -+ if (p2) { -+ iocache.s[5].handle = p2->vcsm_handle; -+ iocache.s[5].cmd = 3; // clean+invalidate -+ iocache.s[5].addr = (int) p2->arm; -+ iocache.s[5].size = p2->numbytes; -+ } -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer(frame->buf[0]); -+ flush_buffer(frame->buf[1]); -+ flush_buffer(frame->buf[2]); -+ gpu_cache_flush3(p0, p1, p2); -+#endif -+} + +#endif + static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 -+ && !s->ps.pps->cross_component_prediction_enabled_flag; ++ s->enable_rpi = s->ps.sps->bit_depth == 8 && ++ s->frame->format == AV_PIX_FMT_SAND128 && ++ !s->ps.pps->cross_component_prediction_enabled_flag; + + if (!s->enable_rpi) { + if (s->ps.pps->cross_component_prediction_enabled_flag) @@ -4163,7 +5460,7 @@ index b478065..88dd40b 100644 if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } @@ -4178,26 +5475,25 @@ index b478065..88dd40b 100644 while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; + hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); + ++ + ff_hevc_cabac_init(s, ctb_addr_ts); + + hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); +@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+#ifdef RPI_INTER_QPU -+ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8]; -+#endif -+#ifdef RPI_LUMA_QPU -+ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12]; ++#if RPI_INTER ++ s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % QPU_N_UV; ++ s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y; +#endif + more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + -+#ifdef RPI_INTER_QPU -+ s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs; -+#endif -+#ifdef 
RPI_LUMA_QPU -+ s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs; -+#endif -+ +#ifdef RPI + if (s->enable_rpi) { + //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); @@ -4207,14 +5503,18 @@ index b478065..88dd40b 100644 + s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; + s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; + s->ctu_count++; -+ //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job); + + if ( s->ctu_count >= s->max_ctu_count ) { +#ifdef RPI_WORKER -+ if (s->used_for_ref) { ++ if (s->used_for_ref) ++ { ++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); ++ ++// worker_wait(s); + // Split work load onto separate threads so we make as rapid progress as possible with this frame + // Pass on this job to worker thread + worker_submit_job(s); ++ + // Make sure we have space to prepare the next job + worker_pass0_ready(s); + @@ -4236,7 +5536,7 @@ index b478065..88dd40b 100644 if (more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2350,9 +3977,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -4261,12 +5561,25 @@ index b478065..88dd40b 100644 + rpi_do_all_passes(s); + } + ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", ++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, ++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, ++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, ++ ts->y_pred2_hgt16, ts->y_pred2_hle16); ++ memset(ts, 0, sizeof(*ts)); ++ } ++#endif ++ +#endif + if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -4278,16 +5591,32 @@ index b478065..88dd40b 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; -+ s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N || ++ // The definition of _N unit types is "non-reference for other frames ++ // with the same temporal_id" so they may/will be ref frames for pics ++ // with a higher temporal_id. 
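/*
 * Editor's sketch (not part of the patch): the hunk below encodes the rule
 * described in the comment above it. Rewritten as a standalone predicate,
 * assuming the FFmpeg 3.x HEVC decoder names used throughout this patch
 * (HEVCContext, NAL_*); the helper name itself is hypothetical:
 */
static inline int nal_may_be_referenced(const HEVCContext * const s)
{
    /* _N (sub-layer non-reference) pictures can still be referenced by
     * pictures with a higher temporal_id, so they are only discardable
     * when no higher temporal sub-layer exists. */
    if (s->ps.sps->max_sub_layers > s->temporal_id + 1)
        return 1;
    return !(s->nal_unit_type == NAL_TRAIL_N ||
             s->nal_unit_type == NAL_TSA_N   ||
             s->nal_unit_type == NAL_STSA_N  ||
             s->nal_unit_type == NAL_RADL_N  ||
             s->nal_unit_type == NAL_RASL_N);
}
/* s->used_for_ref below is exactly this value; when it is false and
 * avctx->skip_frame >= AVDISCARD_NONREF the whole access unit is dropped. */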
++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || ++ !(s->nal_unit_type == NAL_TRAIL_N || + s->nal_unit_type == NAL_TSA_N || + s->nal_unit_type == NAL_STSA_N || + s->nal_unit_type == NAL_RADL_N || + s->nal_unit_type == NAL_RASL_N); + ++#if DEBUG_DECODE_N ++ { ++ static int z = 0; ++ if (IS_IDR(s)) { ++ z = 1; ++ } ++ if (z != 0 && z++ > DEBUG_DECODE_N) { ++ s->is_decoded = 0; ++ break; ++ } ++ } ++#endif + if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { + s->is_decoded = 0; + break; @@ -4295,27 +5624,30 @@ index b478065..88dd40b 100644 if (s->max_ra == INT_MAX) { if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { s->max_ra = s->poc; -@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) +@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) + } } - fail: +-fail: - if (s->ref && s->threads_type == FF_THREAD_FRAME) ++fail: // Also success path + if (s->ref && s->threads_type == FF_THREAD_FRAME) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height); ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +#endif ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } else if (s->ref) { -+#ifdef RPI_INTER_QPU ++ } ++#if RPI_INTER ++ else if (s->ref && s->enable_rpi) { + // When running single threaded we need to flush the whole frame + flush_frame(s,s->frame); -+#endif + } ++#endif return ret; } -@@ -3064,6 +4625,41 @@ fail: +@@ -3064,6 +4764,41 @@ fail: return AVERROR(ENOMEM); } @@ -4357,7 +5689,7 @@ index b478065..88dd40b 100644 static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; -@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); @@ -4368,29 +5700,26 @@ index b478065..88dd40b 100644 +#endif + + for(i=0;iunif_mv_cmds[i]); -+ av_freep(&s->univ_pred_cmds[i]); + -+#ifdef RPI_INTER_QPU -+ if (s->unif_mvs[i]) { -+ gpu_free( &s->unif_mvs_ptr[i] ); -+ s->unif_mvs[i] = 0; -+ } -+#endif -+#ifdef RPI_LUMA_QPU -+ if (s->y_unif_mvs[i]) { -+ gpu_free( &s->y_unif_mvs_ptr[i] ); -+ s->y_unif_mvs[i] = 0; -+ } ++ av_freep(&s->unif_mv_cmds_y[i]); ++ av_freep(&s->unif_mv_cmds_c[i]); ++ av_freep(&s->univ_pred_cmds[i]); ++ ++#if RPI_INTER ++ gpu_free(&s->jobs[i].chroma_mvs_gptr); ++ gpu_free(&s->jobs[i].luma_mvs_gptr); +#endif + } + ++ vpu_qpu_term(); ++ ++ av_rpi_zc_uninit(avctx); +#endif + for (i = 0; i < 3; i++) { av_freep(&s->sao_pixel_buffer_h[i]); av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } @@ -4410,75 +5739,76 @@ index b478065..88dd40b 100644 { HEVCContext *s = avctx->priv_data; int i; -+ int job; ++#ifdef RPI ++ unsigned int job; ++#endif s->avctx = avctx; -@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; +#ifdef RPI -+ for(job=0;jobunif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS); -+ if (!s->unif_mv_cmds[job]) ++ // Whilst FFmpegs init fn is only called once the close fn is called as ++ // many times as we have threads (init_thread_copy is called for the ++ // threads). 
So to match init & term put the init here where it will be ++ // called by both init & copy ++ av_rpi_zc_init(avctx); ++ ++ if (vpu_qpu_init() != 0) ++ goto fail; ++ ++ for(job = 0; job < RPI_MAX_JOBS; job++) { ++ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); ++ if (!s->unif_mv_cmds_y[job]) ++ goto fail; ++ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); ++ if (!s->unif_mv_cmds_c[job]) + goto fail; + s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); + if (!s->univ_pred_cmds[job]) + goto fail; + } + -+#ifdef RPI_INTER_QPU ++#if RPI_INTER + // We divide the image into blocks 256 wide and 64 high + // We support up to 2048 widths + // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted + // Also add space for the startup command for each stream. + -+ { -+ int uv_commands_per_qpu = UV_COMMANDS_PER_QPU; -+ uint32_t *p; -+ for(job=0;jobunif_mvs_ptr[job] ); ++ for (job = 0; job < RPI_MAX_JOBS; job++) { ++ HEVCRpiJob * const jb = s->jobs + job; ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); ++ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); +#else -+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); ++ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); ++ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); +#endif -+ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm; + -+ // Set up initial locations for uniform streams -+ p = s->unif_mvs[job]; -+ for(i = 0; i < 8; i++) { -+ s->mvs_base[job][i] = p; -+ p += uv_commands_per_qpu; -+ } ++ { ++ qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm; ++ for(i = 0; i < QPU_N_UV; i++) { ++ jb->chroma_mvs[i].qpu_mc_base = p; ++ jb->chroma_mvs[i].qpu_mc_curr = p; ++ p += UV_COMMANDS_PER_QPU; ++ } + } -+ s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV); -+ s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0); -+ s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B); -+ } -+ -+#endif -+#ifdef RPI_LUMA_QPU -+ for(job=0;joby_unif_mvs_ptr[job] ); -+#else -+ gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); -+#endif -+ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->y_unif_mvs[job]; -+ for(i = 0; i < 12; i++) { -+ s->y_mvs_base[job][i] = p; -+ p += y_commands_per_qpu; ++ { ++ qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm; ++ for(i = 0; i < QPU_N_Y; i++) { ++ jb->luma_mvs[i].qpu_mc_base = p; ++ jb->luma_mvs[i].qpu_mc_curr = p; ++ p += Y_COMMANDS_PER_QPU; ++ } + } + } -+ s->mc_filter = qpu_get_fn(QPU_MC_FILTER); -+ s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B); ++ s->qpu_filter_uv = qpu_fn(mc_filter_uv); ++ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); ++ s->qpu_dummy_frame = qpu_fn(mc_setup_c); // Use our code as a dummy frame ++ s->qpu_filter = qpu_fn(mc_filter); ++ s->qpu_filter_b = qpu_fn(mc_filter_b); +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); + @@ -4493,8 +5823,30 @@ index b478065..88dd40b 100644 s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; +@@ -3343,9 +5187,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) + } + + if((avctx->active_thread_type & 
FF_THREAD_FRAME) && avctx->thread_count > 1) +- s->threads_type = FF_THREAD_FRAME; +- else +- s->threads_type = FF_THREAD_SLICE; ++ s->threads_type = FF_THREAD_FRAME; ++ else ++ s->threads_type = FF_THREAD_SLICE; + + return 0; + } +@@ -3404,6 +5248,8 @@ AVCodec ff_hevc_decoder = { + .update_thread_context = hevc_update_thread_context, + .init_thread_copy = hevc_init_thread_copy, + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | ++// 0, ++// AV_CODEC_CAP_FRAME_THREADS, + AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, + .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), + }; diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index be91010..6b03ea8 100644 +index be91010..dd7d152 100644 --- a/libavcodec/hevc.h +++ b/libavcodec/hevc.h @@ -23,6 +23,9 @@ @@ -4507,37 +5859,53 @@ index be91010..6b03ea8 100644 #include "libavutil/buffer.h" #include "libavutil/md5.h" -@@ -37,6 +40,29 @@ +@@ -37,6 +40,45 @@ #include "thread.h" #include "videodsp.h" +// define RPI to split the CABAC/prediction/transform into separate stages -+#ifdef RPI ++#ifndef RPI ++ ++ #define RPI_INTER 0 ++ #define RPI_TSTATS 0 ++ #define RPI_HEVC_SAND 0 ++ ++#else + + #include "rpi_qpu.h" -+ // Define RPI_INTER_QPU to use QPU for chroma inter prediction -+ #define RPI_INTER_QPU ++ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU + -+ #ifdef RPI_INTER_QPU -+ // Define RPI_LUMA_QPU to also use QPU for luma inter prediction -+ #define RPI_LUMA_QPU -+ #endif -+ -+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames -+ #define RPI_MAX_JOBS 2 + // Define RPI_WORKER to launch a worker thread for pixel processing tasks + #define RPI_WORKER ++ // By passing jobs to a worker thread we hope to be able to catch up during slow frames ++ // This has no effect unless RPI_WORKER is defined ++ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as ++ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one ++ // free for the foreground to fill in. ++ #define RPI_MAX_JOBS 2 ++ + // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs ++ // As it stands there is something mildy broken in VPU deblock - looks mostly OK ++ // but reliably fails some conformance tests (e.g. 
DBLK_A/B/C_) ++ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM +// #define RPI_DEBLOCK_VPU + -+#endif ++ #define RPI_VPU_DEBLOCK_CACHED 1 + -+#define RPI_VPU_DEBLOCK_CACHED 1 ++ #if HAVE_NEON ++ #define RPI_HEVC_SAND 1 ++ #else ++ // Sand bust on Pi1 currently - reasons unknown ++ #define RPI_HEVC_SAND 0 ++ #endif ++ ++ #define RPI_TSTATS 0 ++#endif + #define MAX_DPB_SIZE 16 // A.4.1 #define MAX_REFS 16 -@@ -660,17 +686,6 @@ typedef struct CodingUnit { +@@ -660,17 +702,6 @@ typedef struct CodingUnit { uint8_t cu_transquant_bypass_flag; } CodingUnit; @@ -4555,7 +5923,7 @@ index be91010..6b03ea8 100644 typedef struct NeighbourAvailable { int cand_bottom_left; int cand_left; -@@ -747,7 +762,17 @@ typedef struct HEVCFrame { +@@ -747,7 +778,17 @@ typedef struct HEVCFrame { uint8_t flags; } HEVCFrame; @@ -4573,7 +5941,7 @@ index be91010..6b03ea8 100644 uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t stat_coeff[4]; -@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext { +@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext { int qPy_pred; @@ -4581,7 +5949,7 @@ index be91010..6b03ea8 100644 uint8_t ctb_left_flag; uint8_t ctb_up_flag; -@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext { +@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext { int ct_depth; CodingUnit cu; PredictionUnit pu; @@ -4589,7 +5957,7 @@ index be91010..6b03ea8 100644 #define BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext { +@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; @@ -4601,13 +5969,15 @@ index be91010..6b03ea8 100644 +// This is a distance of 1536 pixels across the screen +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +// but allocate more memory and increase the latency before data in the next frame can be processed -+#define RPI_NUM_CHUNKS 1 ++#define RPI_NUM_CHUNKS 4 ++#define RPI_CHUNK_SIZE 12 + +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code -+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24) ++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) + +// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi -+#define RPI_MAX_MV_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) ++#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4)) ++#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4)) +// Each block can have an intra prediction and a transform_add command +#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) +// Worst case is 16x16 CTUs @@ -4624,53 +5994,118 @@ index be91010..6b03ea8 100644 + +// Command for inter prediction +typedef struct HEVCMvCmd { -+ int cmd; -+ uint8_t *dst; -+ ptrdiff_t dststride; -+ uint8_t *src; -+ ptrdiff_t srcstride; -+ Mv mv; -+ int x_off; -+ int y_off; -+ int block_w; -+ int block_h; -+ int weight; -+ int offset; -+ uint8_t *src1; -+ ptrdiff_t srcstride1; -+ Mv mv1; ++ uint8_t cmd; ++ uint8_t block_w; ++ uint8_t block_h; + int8_t ref_idx[2]; ++ uint16_t dststride; ++ uint16_t srcstride; ++ uint16_t srcstride1; ++ int16_t weight; ++ int16_t offset; ++ int16_t x_off; ++ int16_t y_off; ++ uint8_t *src; ++ uint8_t *src1; ++ uint8_t *dst; ++ Mv mv; ++ Mv mv1; +} HEVCMvCmd; + + +// Command for intra prediction and transform_add of predictions to coefficients -+#define RPI_PRED_TRANSFORM_ADD 0 -+#define RPI_PRED_INTRA 1 ++enum rpi_pred_cmd_e ++{ ++ RPI_PRED_ADD_RESIDUAL, ++ RPI_PRED_ADD_RESIDUAL_U, // = 
RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_INTRA, ++ RPI_PRED_I_PCM, ++ RPI_PRED_CMD_MAX ++}; ++ +typedef struct HEVCPredCmd { -+ uint8_t size; + uint8_t type; -+ uint8_t na; -+ uint8_t c_idx; ++ uint8_t size; // log2 "size" used by all variants ++ uint8_t na; // i_pred - but left here as they pack well ++ uint8_t c_idx; // i_pred + union { -+ uint8_t *dst; // RPI_PRED_TRANSFORM_ADD -+ uint32_t x; // RPI_PRED_INTRA -+ }; -+ union { -+ int16_t *buf; // RPI_PRED_TRANSFORM_ADD -+ uint32_t y; // RPI_PRED_INTRA -+ }; -+ union { -+ enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD -+ uint32_t stride; // RPI_PRED_INTRA ++ struct { // TRANSFORM_ADD ++ uint8_t * dst; ++ const int16_t * buf; ++ uint32_t stride; ++ } ta; ++ struct { // INTRA ++ uint16_t x; ++ uint16_t y; ++ enum IntraPredMode mode; ++ } i_pred; ++ struct { // I_PCM ++ uint16_t x; ++ uint16_t y; ++ const void * src; ++ uint32_t src_len; ++ } i_pcm; + }; +} HEVCPredCmd; + +#endif ++ ++#ifdef RPI ++ ++struct qpu_mc_pred_c_s; ++struct qpu_mc_pred_y_s; ++ ++typedef struct HEVCRpiLumaPred ++{ ++ struct qpu_mc_pred_y_s *qpu_mc_base; ++ struct qpu_mc_pred_y_s *qpu_mc_curr; ++ struct qpu_mc_pred_y_s *last_lx; ++ unsigned int load; ++} HEVCRpiLumaPred; ++ ++typedef struct HEVCRpiChromaPred ++{ ++ struct qpu_mc_pred_c_s *qpu_mc_base; ++ struct qpu_mc_pred_c_s *qpu_mc_curr; ++ struct qpu_mc_pred_c_s *last_l0; ++ struct qpu_mc_pred_c_s *last_l1; ++ unsigned int load; ++} HEVCRpiChromaPred; ++ ++typedef struct HEVCRpiJob { ++ GPU_MEM_PTR_T chroma_mvs_gptr; ++ GPU_MEM_PTR_T luma_mvs_gptr; ++ HEVCRpiChromaPred chroma_mvs[QPU_N_UV]; ++ HEVCRpiLumaPred luma_mvs[QPU_N_Y]; ++} HEVCRpiJob; ++ ++#if RPI_TSTATS ++typedef struct HEVCRpiStats { ++ int y_pred1_y8_merge; ++ int y_pred1_xy; ++ int y_pred1_x0; ++ int y_pred1_y0; ++ int y_pred1_x0y0; ++ int y_pred1_wle8; ++ int y_pred1_wgt8; ++ int y_pred1_hle16; ++ int y_pred1_hgt16; ++ int y_pred2_xy; ++ int y_pred2_x0; ++ int y_pred2_y0; ++ int y_pred2_x0y0; ++ int y_pred2_hle16; ++ int y_pred2_hgt16; ++} HEVCRpiStats; ++#endif ++ ++#endif + typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -798,13 +895,107 @@ typedef struct HEVCContext { +@@ -798,13 +978,103 @@ typedef struct HEVCContext { HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; HEVCLocalContext *HEVClc; @@ -4688,7 +6123,8 @@ index be91010..6b03ea8 100644 + +#ifdef RPI + int enable_rpi; -+ HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS]; ++ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; ++ HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS]; + HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; + int buf_width; + GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; @@ -4697,7 +6133,8 @@ index be91010..6b03ea8 100644 + unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; + int num_coeffs[RPI_MAX_JOBS][4]; + int num_xfm_cmds[RPI_MAX_JOBS]; -+ int num_mv_cmds[RPI_MAX_JOBS]; ++ int num_mv_cmds_y[RPI_MAX_JOBS]; ++ int num_mv_cmds_c[RPI_MAX_JOBS]; + int num_pred_cmds[RPI_MAX_JOBS]; + int num_dblk_cmds[RPI_MAX_JOBS]; + int vpu_id; @@ -4707,29 +6144,23 @@ index be91010..6b03ea8 100644 + int max_ctu_count; // Number of CTUs when we trigger a round of processing + int ctu_per_y_chan; // Number of CTUs per luma QPU + int ctu_per_uv_chan; // Number of CTUs per chroma QPU -+#ifdef RPI_INTER_QPU -+ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands + -+ // _base pointers are to the start of the row -+ uint32_t 
*mvs_base[RPI_MAX_JOBS][8]; -+ // these pointers are to the next free space -+ uint32_t *u_mvs[RPI_MAX_JOBS][8]; -+ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma -+ // Function pointers -+ uint32_t mc_filter_uv; -+ uint32_t mc_filter_uv_b0; -+ uint32_t mc_filter_uv_b; ++ HEVCRpiJob jobs[RPI_MAX_JOBS]; ++#if RPI_TSTATS ++ HEVCRpiStats tstats; +#endif -+#ifdef RPI_LUMA_QPU -+ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands -+ uint32_t *y_mvs_base[RPI_MAX_JOBS][12]; -+ uint32_t *y_mvs[RPI_MAX_JOBS][12]; -+ uint32_t *curr_y_mvs; // Current uniform stream for luma ++#if RPI_INTER ++ HEVCRpiChromaPred * curr_pred_c; ++ HEVCRpiLumaPred * curr_pred_y; ++ struct qpu_mc_pred_y_s * last_y8_p; ++ struct qpu_mc_pred_y_s * last_y8_lx; ++ + // Function pointers -+ uint32_t mc_filter; -+ uint32_t mc_filter_b; ++ uint32_t qpu_filter_uv; ++ uint32_t qpu_filter_uv_b0; ++ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory ++ uint32_t qpu_filter; ++ uint32_t qpu_filter_b; +#endif + +#ifdef RPI_WORKER @@ -4766,7 +6197,7 @@ index be91010..6b03ea8 100644 + int (*vpu_cmds_arm)[6]; // r0-r5 for each command + int vpu_cmds_vc; + -+ int cmd_id; ++ vpu_qpu_wait_h cmd_id; + } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; + + struct dblk_vpu_q_s * dvq; @@ -4779,7 +6210,7 @@ index be91010..6b03ea8 100644 uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -922,6 +1113,9 @@ typedef struct HEVCContext { +@@ -922,6 +1192,9 @@ typedef struct HEVCContext { uint32_t max_mastering_luminance; uint32_t min_mastering_luminance; @@ -4789,22 +6220,38 @@ index be91010..6b03ea8 100644 } HEVCContext; int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, -@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int log2_trafo_size, enum ScanType scan_idx, int c_idx); -+#ifdef RPI_INTER_QPU -+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n); ++#if RPI_INTER ++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +#endif + void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); +@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; + extern const uint8_t ff_hevc_diag_scan8x8_x[64]; + extern const uint8_t ff_hevc_diag_scan8x8_y[64]; + ++#ifdef RPI ++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n); ++ ++// arm/hevc_misc_neon.S ++// Neon coeff zap fn ++#if HAVE_NEON ++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); ++#endif ++ ++#endif ++ + #endif /* AVCODEC_HEVC_H */ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index 05b2821..e2f1f4e 100644 +index 05b2821..733efde 100644 --- a/libavcodec/hevc_cabac.c +++ b/libavcodec/hevc_cabac.c -@@ -21,14 +21,72 @@ +@@ -21,14 +21,76 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -4817,6 +6264,10 @@ index 05b2821..e2f1f4e 100644 #include "hevc.h" +#include "cabac_functions.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ +// BY22 is probably faster than simple bypass if the processor has +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +// x86 has fast int divide @@ -4878,7 +6329,7 @@ index 05b2821..e2f1f4e 100644 /** * number of bin by SyntaxElement. 
*/ -@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { +@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { { 28, 36, 43, 49, 54, 58, 61, 63, }, }; @@ -5090,7 +6541,7 @@ index 05b2821..e2f1f4e 100644 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) { if (s->ps.pps->entropy_coding_sync_enabled_flag && -@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) +@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); } @@ -5116,7 +6567,7 @@ index 05b2821..e2f1f4e 100644 } int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { -@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { +@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); } @@ -5133,7 +6584,7 @@ index 05b2821..e2f1f4e 100644 ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ctx_shift = (log2_size + 1) >> 2; } else { -@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, +@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, return value; } @@ -5159,7 +6610,7 @@ index 05b2821..e2f1f4e 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -5172,7 +6623,7 @@ index 05b2821..e2f1f4e 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -5213,7 +6664,7 @@ index 05b2821..e2f1f4e 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -5359,7 +6810,7 @@ index 05b2821..e2f1f4e 100644 +static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) +{ + return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); - } ++} +#endif + + @@ -5454,6 +6905,45 @@ index 05b2821..e2f1f4e 100644 + return i; +} + ++#ifdef RPI ++static void rpi_add_residual(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ unsigned int stride = frame->linesize[c_idx]; ++ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = rpi_sliced_frame(frame); ++ uint8_t * dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(frame, x, y) : ++ rpi_sliced_frame_pos_c(frame, x, y); ++ ++// if (c_idx != 0) { ++// return; ++// } ++ if (s->enable_rpi) { ++ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? 
c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ } ++ else if (!is_sliced || c_idx == 0) { ++ s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++ else if (c_idx == 1) { ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++ else { ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } + } ++#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int log2_trafo_size, enum ScanType scan_idx, @@ -5483,17 +6973,20 @@ index 05b2821..e2f1f4e 100644 + const uint8_t *scan_x_cg, *scan_y_cg; + const xy_off_t * scan_xy_off; ++#ifndef RPI ptrdiff_t stride = s->frame->linesize[c_idx]; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; - uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ++ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ((x0 >> hshift) << s->ps.sps->pixel_shift)]; -+#ifdef RPI -+ //***** transform_skip_flag decoded later! -+ int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4; -+#endif - int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); +- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); - uint8_t significant_coeff_group_flag[8][8] = {{0}}; ++#endif ++#ifdef RPI ++ int use_vpu; ++#endif ++ int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero int explicit_rdpcm_flag = 0; int explicit_rdpcm_dir_flag; @@ -5508,38 +7001,11 @@ index 05b2821..e2f1f4e 100644 int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode : lc->tu.intra_pred_mode_c; +- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); + int prev_sig = 0; + const int c_idx_nz = (c_idx != 0); + + int may_hide_sign; -+ -+#ifdef RPI -+ if (s->enable_rpi) { -+ int n = trafo_size * trafo_size; -+ if (use_vpu) { -+ // We support size 4 and size 5. -+ // Size 4 grows from the front (Coeffs_buf_arm[2] points to start of buf) -+ // Size 5 grows from the back (Coeffs_buf_arm[3] points to end of buf) -+ // num_coeffs is indexed by log2_trafo_size-2 -+ if (log2_trafo_size == 4) -+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2]; -+ else -+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n; -+ s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n; -+ } else { -+ coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0]; -+ s->num_coeffs[s->pass0_job][0] += n; -+ } -+ } -+ // We now do the memset after transform_add while we know the data is cached. 
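/*
 * Editor's sketch (not part of the patch): the deleted block above and the
 * rpi_alloc_coeff_buf() call that replaces it share one per-job coefficient
 * arena between two transform sizes: 16x16 coefficients (buf_no 2) are bumped
 * up from the start of the buffer while 32x32 coefficients (buf_no 3) are
 * bumped down from the end, so both pack into a single GPU allocation. A
 * plausible shape for the allocator, reconstructed from the deleted code
 * (the real implementation lives elsewhere in the patch and is not shown in
 * this hunk):
 */
int16_t *rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n)
{
    const int job = s->pass0_job;
    int16_t * const base = s->coeffs_buf_arm[job][buf_no];
    int16_t *p;

    if (buf_no != 3) {
        /* Front bump allocator: next free element follows what is in use. */
        p = base + s->num_coeffs[job][buf_no];
    } else {
        /* Back bump allocator: base points one past the end of the buffer,
         * matching the VPU job setup in worker_core() above, which locates
         * the 32x32 data at coeffs_buf_vc[job][3] - sizeof(int16_t)*count. */
        p = base - s->num_coeffs[job][buf_no] - n;
    }
    s->num_coeffs[job][buf_no] += n;
    return p;
}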
-+ #ifdef RPI_PRECLEAR -+ #else -+ memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); -+ #endif -+#else - memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); -+#endif -+ + // Derive QP for dequant @@ -5549,7 +7015,7 @@ index 05b2821..e2f1f4e 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -5570,7 +7036,7 @@ index 05b2821..e2f1f4e 100644 } if (c_idx == 0) { -@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -5641,6 +7107,9 @@ index 05b2821..e2f1f4e 100644 + may_hide_sign = 0; } ++ ++ ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && - (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { - explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx); @@ -5658,7 +7127,7 @@ index 05b2821..e2f1f4e 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -5715,14 +7184,41 @@ index 05b2821..e2f1f4e 100644 - for (i = num_last_subset; i >= 0; i--) { - int n, m; - int x_cg, y_cg, x_c, y_c, pos; -- int implicit_non_zero_coeff = 0; ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ ++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; ++ ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++#ifdef RPI ++ use_vpu = 0; ++ if (s->enable_rpi) { ++ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; ++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } ++ else ++#endif ++ { ++ coeffs = (int16_t*)(c_idx_nz ? 
lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ } ++ } ++ ++ i = num_last_subset; ++ do { + int implicit_non_zero_coeff = 0; - int64_t trans_coeff_level; - int prev_sig = 0; - int offset = i << 4; - int rice_init = 0; -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ int n_end; -- uint8_t significant_coeff_flag_idx[16]; + uint8_t significant_coeff_flag_idx[16]; - uint8_t nb_significant_coeff_flag = 0; - - x_cg = scan_x_cg[i]; @@ -5734,8 +7230,7 @@ index 05b2821..e2f1f4e 100644 - ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; - if (y_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; - +- - significant_coeff_group_flag[x_cg][y_cg] = - significant_coeff_group_flag_decode(s, c_idx, ctx_cg); - implicit_non_zero_coeff = 1; @@ -5744,13 +7239,8 @@ index 05b2821..e2f1f4e 100644 - ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || - (x_cg == 0 && y_cg == 0)); - } -+ i = num_last_subset; -+ do { -+ int implicit_non_zero_coeff = 0; -+ int n_end; - +- - last_scan_pos = num_coeff - offset - 1; -+ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; if (i == num_last_subset) { @@ -5836,7 +7326,7 @@ index 05b2821..e2f1f4e 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -5885,12 +7375,11 @@ index 05b2821..e2f1f4e 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; -- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -5938,6 +7427,9 @@ index 05b2821..e2f1f4e 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -5948,9 +7440,6 @@ index 05b2821..e2f1f4e 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -5961,13 +7450,18 @@ index 05b2821..e2f1f4e 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 
2 : 0; ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -5975,11 +7469,7 @@ index 05b2821..e2f1f4e 100644 - sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - +- - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -6064,10 +7554,6 @@ index 05b2821..e2f1f4e 100644 + + sum_abs += last_coeff_abs_level_remaining + 1; + *level = trans_coeff_level; -+ -+ if (stat_coeff != NULL) -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ stat_coeff = NULL; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -6088,6 +7574,10 @@ index 05b2821..e2f1f4e 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ if (stat_coeff != NULL) ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ stat_coeff = NULL; ++ + if (trans_coeff_level > (3 << c_rice_param) && + (c_rice_param < 4 || rice_adaptation_enabled)) + ++c_rice_param; @@ -6188,7 +7678,7 @@ index 05b2821..e2f1f4e 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -6197,7 +7687,7 @@ index 05b2821..e2f1f4e 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, for (i = 0; i < 8; i++) FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); } @@ -6205,7 +7695,7 @@ index 05b2821..e2f1f4e 100644 s->hevcdsp.transform_skip(coeffs, log2_trafo_size); if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { @@ -6233,7 +7723,7 @@ index 05b2821..e2f1f4e 100644 int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, col_limit = FFMIN(24, col_limit); s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); } @@ -6241,26 +7731,20 @@ index 05b2821..e2f1f4e 100644 } } if (lc->tu.cross_pf) { -@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, 
coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ if (s->enable_rpi) { -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; -+ cmd->type = RPI_PRED_TRANSFORM_ADD; -+ cmd->size = log2_trafo_size; -+ cmd->buf = coeffs; -+ cmd->dst = dst; -+ cmd->stride = stride; -+ return; -+ } -+#endif ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++#else s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); ++#endif } + void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index 1f33b0c..55a0315 100644 +index 1f33b0c..3143b4f 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ @@ -6281,14 +7765,78 @@ index 1f33b0c..55a0315 100644 #include "bit_depth_template.c" +#ifdef RPI -+#include "rpi_user_vcsm.h" +#include "rpi_qpu.h" ++#include "rpi_zc.h" +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) + return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; + } + ++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) ++{ ++#ifdef RPI ++ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#else ++ return s->ps.sps->pixel_shift; ++#endif ++} ++ + static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, + intptr_t stride_dst, intptr_t stride_src) + { +@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, + int stride_src, int x, int y, int width, int height, + int c_idx, int x_ctb, int y_ctb) + { +- int sh = s->ps.sps->pixel_shift; ++ const unsigned int sh = pixel_shift(s, c_idx); + int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; + int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; + +@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s, + int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); + int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); + int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); +- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; ++ const unsigned int sh = pixel_shift(s, c_idx); ++ int len = (min_pu_size >> hshift) << sh; + for (y = y_min; y < y_max; y++) { + for (x = x_min; x < x_max; x++) { + if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { + int n; +- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); ++ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); ++ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); + for (n = 0; n < (min_pu_size >> vshift); n++) { + memcpy(src, dst, len); + src += stride_src; +@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s, + + static void sao_filter_CTB(HEVCContext *s, int x, int y) + { +- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 
1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; + HEVCLocalContext *lc = s->HEVClc; + int c_idx; + int edges[4]; // 0 left 1 top 2 right 3 bottom +@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + uint8_t right_tile_edge = 0; + uint8_t up_tile_edge = 0; + uint8_t bottom_tile_edge = 0; ++#ifdef RPI ++ const int sliced = rpi_sliced_frame(s->frame); ++ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); ++#else ++ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); ++#endif + + edges[0] = x_ctb == 0; + edges[1] = y_ctb == 0; edges[2] = x_ctb == s->ps.sps->ctb_width - 1; edges[3] = y_ctb == s->ps.sps->ctb_height - 1; @@ -6299,7 +7847,301 @@ index 1f33b0c..55a0315 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } + +- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) { ++ for (c_idx = 0; c_idx < plane_count; c_idx++) { + int x0 = x >> s->ps.sps->hshift[c_idx]; + int y0 = y >> s->ps.sps->vshift[c_idx]; + int stride_src = s->frame->linesize[c_idx]; +@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); + int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); + int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; +- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; +- int stride_dst; ++ ptrdiff_t stride_dst; + uint8_t *dst; + ++#ifdef RPI ++ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = !sliced ? ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : ++ !sliced ? src - (1 << sh) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : ++ !sliced ? src + (width << sh) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ ++ ++ if (sliced && c_idx > 1) { ++ break; ++ } ++#else ++ const unsigned int sh = s->ps.sps->pixel_shift; ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? 
NULL : src + (width << sh); ++#endif ++ + switch (sao->type_idx[c_idx]) { + case SAO_BAND: + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); + if (s->ps.pps->transquant_bypass_enable_flag || + (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { +- dst = lc->edge_emu_buffer; +- stride_dst = 2*MAX_PB_SIZE; +- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); +- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +- sao->offset_val[c_idx], sao->band_position[c_idx], +- width, height); +- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +- x, y, width, height, c_idx); ++ dst = lc->edge_emu_buffer; ++ stride_dst = 2*MAX_PB_SIZE; ++ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); + } else { +- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +- sao->offset_val[c_idx], sao->band_position[c_idx], +- width, height); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } + } + sao->type_idx[c_idx] = SAO_APPLIED; + break; +@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + { + int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; + int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; +- int left_edge = edges[0]; + int top_edge = edges[1]; +- int right_edge = edges[2]; + int bottom_edge = edges[3]; +- int sh = s->ps.sps->pixel_shift; +- int left_pixels, right_pixels; + + stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; + dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; + + if (!top_edge) { +- int left = 1 - left_edge; +- int right = 1 - right_edge; +- const uint8_t *src1[2]; + uint8_t *dst1; +- int src_idx, pos; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); + +- dst1 = dst - stride_dst - (left << sh); +- src1[0] = src - stride_src - (left << sh); +- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); +- pos = 0; +- if (left) { ++ dst1 = dst - stride_dst; ++ ++ if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1, src1[src_idx], sh); +- pos += (1 << sh); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); + } ++ + src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +- if (right) { +- pos += width << sh; ++ memcpy(dst1, src_idx ? 
src_spb : src - stride_src, width << sh); ++ ++ if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); + } + } + if (!bottom_edge) { +- int left = 1 - left_edge; +- int right = 1 - right_edge; +- const uint8_t *src1[2]; +- uint8_t *dst1; +- int src_idx, pos; ++ uint8_t * const dst1 = dst + height * stride_dst; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); ++ const unsigned int hoff = height * stride_src; + +- dst1 = dst + height * stride_dst - (left << sh); +- src1[0] = src + height * stride_src - (left << sh); +- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); +- pos = 0; +- if (left) { ++ if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1, src1[src_idx], sh); +- pos += (1 << sh); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); + } ++ + src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +- if (right) { +- pos += width << sh; ++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); ++ ++ if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); + } + } +- left_pixels = 0; +- if (!left_edge) { ++ if (src_l != NULL) { + if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst - (1 << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { +- left_pixels = 1; ++ copy_vert(dst - (1 << sh), ++ src_l, ++ sh, height, stride_dst, stride_src); + } + } +- right_pixels = 0; +- if (!right_edge) { ++ if (src_r != NULL) { + if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst + (width << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { +- right_pixels = 1; ++ copy_vert(dst + (width << sh), ++ src_r, ++ sh, height, stride_dst, stride_src); + } + } + +- copy_CTB(dst - (left_pixels << sh), +- src - (left_pixels << sh), +- (width + left_pixels + right_pixels) << sh, ++ copy_CTB(dst, ++ src, ++ width << sh, + height, stride_dst, stride_src); + + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); +- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +- sao->eo_class[c_idx], width, height); +- s->hevcdsp.sao_edge_restore[restore](src, dst, +- stride_src, stride_dst, +- sao, +- edges, width, +- height, c_idx, +- vert_edge, +- horiz_edge, +- diag_edge); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ // Class always the same for both U & V (which is just as well :-)) ++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, ++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], ++ width, height); ++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++ sao->eo_class[c_idx], width, 
height); ++ s->hevcdsp.sao_edge_restore[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); + sao->type_idx[c_idx] = SAO_APPLIED; +@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } + ++// Returns 2 or 0. + static int get_pcm(HEVCContext *s, int x, int y) + { + int log2_min_pu_size = s->ps.sps->log2_min_pu_size; +@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + uint8_t *src; + int x, y; + int chroma, beta; +- int32_t c_tc[2], tc[2]; ++ int32_t c_tc[4], tc[2]; + uint8_t no_p[2] = { 0 }; + uint8_t no_q[2] = { 0 }; + +@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -6315,27 +8157,81 @@ index 1f33b0c..55a0315 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); - } else -+#ifdef RPI_DEBLOCK_VPU -+ if (s->enable_rpi_deblock) { -+ uint8_t (*setup)[2][2][4]; -+ int num16 = (y>>4)*s->setup_width + (x>>4); -+ int a = ((y>>3) & 1) << 1; -+ int b = (x>>3) & 1; -+ setup = s->dvq->y_setup_arm[num16]; -+ setup[0][b][0][a] = beta; -+ setup[0][b][0][a + 1] = beta; -+ setup[0][b][1][a] = tc[0]; -+ setup[0][b][1][a + 1] = tc[1]; -+ } else +@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x - 1, y); + no_p[1] = get_pcm(s, x - 1, y + 4); + no_q[0] = get_pcm(s, x, y); + no_q[1] = get_pcm(s, x, y + 4); +- s->hevcdsp.hevc_v_loop_filter_luma_c(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); +- } else +- s->hevcdsp.hevc_v_loop_filter_luma(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); ++ } ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ ++ // This copes properly with no_p/no_q ++ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q, ++ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ } ++ else +#endif - s->hevcdsp.hevc_v_loop_filter_luma(src, - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); -@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ { ++ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ // Standard DSP code is broken if no_p / no_q is set ++ s->hevcdsp.hevc_v_loop_filter_luma_c(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } ++ else ++#ifdef RPI_DEBLOCK_VPU ++ if (s->enable_rpi_deblock) { ++ uint8_t (*setup)[2][2][4]; ++ int num16 = (y>>4)*s->setup_width + (x>>4); ++ int a = ((y>>3) & 1) << 1; ++ int b = (x>>3) & 1; ++ setup = s->dvq->y_setup_arm[num16]; ++ setup[0][b][0][a] = beta; ++ setup[0][b][0][a + 1] = beta; ++ setup[0][b][1][a] = tc[0]; ++ setup[0][b][1][a + 1] = tc[1]; ++ } else ++#endif ++ { ++ s->hevcdsp.hevc_v_loop_filter_luma(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } ++ } + } + } + +@@ -561,7 +697,12 @@ static void 
deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? ++ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#endif ++ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x, y - 1); + no_p[1] = get_pcm(s, x + 4, y - 1); +@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -6355,7 +8251,113 @@ index 1f33b0c..55a0315 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + } + + if (s->ps.sps->chroma_format_idc) { ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ const int v = 2; ++ const int h = 2; ++ ++ // vertical filtering chroma ++ for (y = y0; y < y_end; y += 8 * v) { ++ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) { ++ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; ++ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; ++ ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; ++ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; ++ unsigned int no_f = 0; ++ ++ // tc_offset here should be set to cur_tc_offset I think ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x - 1, y) ? 1 : 0) | ++ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x, y + 4 * v) ? 8 : 0); ++ if (no_f == 0xf) ++ continue; ++ } ++ ++ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->frame->linesize[1], ++ tc4, ++ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ no_f); ++ } ++ } ++ ++ if (y == 0) ++ continue; ++ ++ // horizontal filtering chroma ++ tc_offset = x0 ? left_tc_offset : cur_tc_offset; ++ x_end2 = x_end; ++ if (x_end != s->ps.sps->width) ++ x_end2 = x_end - 8 * h; ++ ++ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; ++ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; ++ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ unsigned int no_f = 0; ++ ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x, y - 1) ? 1 : 0) | ++ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x + 4 * h, y) ? 
8 : 0); ++ ++ if (no_f == 0xf) ++ continue; ++ } ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->frame->linesize[1], ++ tc4, no_f); ++ } ++ } ++ } ++ } ++ else ++#endif + for (chroma = 1; chroma <= 2; chroma++) { + int h = 1 << s->ps.sps->hshift[chroma]; + int v = 1 << s->ps.sps->vshift[chroma]; +@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; + c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; +- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? ++ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#endif ++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x - 1, y); + no_p[1] = get_pcm(s, x - 1, y + (4 * v)); +@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6379,7 +8381,21 @@ index 1f33b0c..55a0315 100644 } } -@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; + c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; +- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? 
++ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#endif ++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x, y - 1); + no_p[1] = get_pcm(s, x + (4 * h), y - 1); +@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6399,7 +8415,7 @@ index 1f33b0c..55a0315 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -6469,7 +8485,7 @@ index 1f33b0c..55a0315 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -6479,8 +8495,9 @@ index 1f33b0c..55a0315 100644 - int i, j, bs; + int i, j; + RefPicList *rpl = s->ref->refPicList; -+ int min_pu_in_4pix = (1 << log2_min_pu_size) >> 2; -+ int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size; ++ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size); ++ const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup ++ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep + int y_pu = y0 >> log2_min_pu_size; + int x_pu = x0 >> log2_min_pu_size; + MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu]; @@ -6494,7 +8511,7 @@ index 1f33b0c..55a0315 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -6571,7 +8588,7 @@ index 1f33b0c..55a0315 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -6586,9 +8603,7 @@ index 1f33b0c..55a0315 100644 - int xq_pu = x0 >> log2_min_pu_size; - int xp_tu = (x0 - 1) >> log2_min_tu_size; - int xq_tu = x0 >> log2_min_tu_size; -+ rpl; -+ MvField *left = curr - 1; - +- - for (i = 0; i < (1 << log2_trafo_size); i += 4) { - int y_pu = (y0 + i) >> log2_min_pu_size; - int y_tu = (y0 + i) >> log2_min_tu_size; @@ -6606,18 +8621,20 @@ index 1f33b0c..55a0315 100644 - s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs; - } - } -+ if (is_intra) { -+ for (j = 0; j < (1 << log2_trafo_size); j += 4) -+ bs[j * s->bs_width >> 2] = 2; - +- - if (log2_trafo_size > log2_min_pu_size && !is_intra) { - RefPicList *rpl = s->ref->refPicList; -- ++ rpl; ++ MvField *left = curr - 1; + - // bs for TU internal horizontal PU boundaries - for (j = 8; j < (1 << log2_trafo_size); j += 8) { - int yp_pu = (y0 + j - 1) >> 
log2_min_pu_size; - int yq_pu = (y0 + j) >> log2_min_pu_size; -- ++ if (is_intra) { ++ for (j = 0; j < (1 << log2_trafo_size); j += 4) ++ bs[j * s->bs_width >> 2] = 2; + - for (i = 0; i < (1 << log2_trafo_size); i += 4) { - int x_pu = (x0 + i) >> log2_min_pu_size; - MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; @@ -6674,137 +8691,42 @@ index 1f33b0c..55a0315 100644 } } } -@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR -+#if !defined(RPI_FAST_CACHEFLUSH) -+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU) -+static void flush_buffer_y(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame); -+ gpu_cache_flush(&p); -+} -+ -+static void flush_buffer_u(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame); -+ gpu_cache_flush(&p); -+} -+ -+static void flush_buffer_v(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame); -+ gpu_cache_flush(&p); -+} -+#endif -+#endif -+ -+ +#ifdef RPI_DEBLOCK_VPU -+#error Not fixed yet -+ +// ff_hevc_flush_buffer_lines +// flushes and invalidates all pixel rows in [start,end-1] +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int curr_y = start; -+ int n = end; -+ int curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ GPU_MEM_PTR_T p; -+ if (curr_uv < 0) curr_uv = 0; -+ if (n_uv<=curr_uv) { return; } -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ if (flush_chroma) { -+ p = get_gpu_mem_ptr_u(s->frame); -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)p.arm + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)p.arm + base; -+ iocache.s[1].size = sz; -+ } -+ if (flush_luma) { -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)p.arm + base; -+ iocache.s[2].size = sz; -+ } -+ vcsm_clean_invalid( &iocache ); -+#else -+ if (flush_chroma) { -+ flush_buffer_u(s->frame); -+ flush_buffer_v(s->frame); -+ } -+ if (flush_luma) { -+ flush_buffer_y(s->frame); -+ } -+#endif ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ rpi_cache_flush_finish(rfe); +} +#endif + -+#ifdef RPI_INTER_QPU -+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n) ++#if RPI_INTER ++ ++// Flush some lines of a reference frames ++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n) +{ + if (s->enable_rpi && s->used_for_ref) { -+ // TODO make this use ff_hevc_flush_buffer_lines -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int curr_y = ((int *)f->progress->data)[0]; -+ int curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ 
GPU_MEM_PTR_T p; -+ if (curr_uv < 0) curr_uv = 0; -+ if (n_uv<=curr_uv) { return; } -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ p = get_gpu_mem_ptr_u(s->frame); -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)p.arm + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)p.arm + base; -+ iocache.s[1].size = sz; ++ const int d0 = ((int *)f->progress->data)[0]; ++ const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 + -+#ifdef RPI_LUMA_QPU -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)p.arm + base; -+ iocache.s[2].size = sz; -+#endif -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer_u(s->frame); -+ flush_buffer_v(s->frame); -+#ifdef RPI_LUMA_QPU -+ flush_buffer_y(s->frame); -+#endif -+ -+#endif -+ //memcpy(s->dummy.arm,s->frame->data[0],2048*64); -+ //memcpy(s->dummy.arm,s->frame->data[1],1024*32); -+ //memcpy(s->dummy.arm,s->frame->data[2],1024*32); ++ if (curr_y < (unsigned int)f->f->height) { ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); ++ rpi_cache_flush_finish(rfe); ++ } + } +} +#endif + +#ifdef RPI_DEBLOCK_VPU -+#error XXX +/* rpi_deblock deblocks an entire row of ctbs using the VPU */ +static void rpi_deblock(HEVCContext *s, int y, int ctb_size) +{ @@ -6833,16 +8755,19 @@ index 1f33b0c..55a0315 100644 + s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); + s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; + s->dvq->vpu_cmds_arm[2][5] = 4; ++ + // Call VPU -+ s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands ++ { ++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); ++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands ++ vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); ++ vpu_qpu_job_finish(vqj); ++ } + + s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1); + s->dvq = s->dvq_ents + s->dvq_n; + -+ if (s->dvq->cmd_id != -1) { -+ vpu_wait(s->dvq->cmd_id); -+ s->dvq->cmd_id = -1; -+ } ++ vpu_qpu_wait(&s->dvq->cmd_id); +} + +#endif @@ -6871,14 +8796,14 @@ index 1f33b0c..55a0315 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); - if (s->threads_type & FF_THREAD_FRAME ) -+ if (s->threads_type & FF_THREAD_FRAME ) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s,&s->ref->tf, y); ++ if (s->threads_type == FF_THREAD_FRAME ) { ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +#endif ff_thread_report_progress(&s->ref->tf, y, 0); + } @@ -6886,14 +8811,14 @@ index 1f33b0c..55a0315 100644 if (x_end && y_end) { 
sao_filter_CTB(s, x , y); - if (s->threads_type & FF_THREAD_FRAME ) -+ if (s->threads_type & FF_THREAD_FRAME ) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size); ++ if (s->threads_type == FF_THREAD_FRAME ) { ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +#endif ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + } + } -+ } else if (s->threads_type & FF_THREAD_FRAME && x_end) { ++ } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; + //if (((y + ctb_size)&63)==0) @@ -6904,15 +8829,15 @@ index 1f33b0c..55a0315 100644 + ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); + } + } else { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4); ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif + ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); } - } else if (s->threads_type & FF_THREAD_FRAME && x_end) +#else -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4); ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); + // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +#endif ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); @@ -6922,10 +8847,23 @@ index 1f33b0c..55a0315 100644 void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index 83f2ec2..6882a8d 100644 +index 83f2ec2..bcf53dc 100644 --- a/libavcodec/hevc_ps.c +++ b/libavcodec/hevc_ps.c -@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, +@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + switch (sps->bit_depth) { + case 8: + if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? 
AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P; ++#else + if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P; ++#endif + if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; + if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; + break; +@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, sps->amp_enabled_flag = get_bits1(gb); sps->sao_enabled = get_bits1(gb); @@ -6935,7 +8873,7 @@ index 83f2ec2..6882a8d 100644 if (sps->pcm_enabled_flag) { sps->pcm.bit_depth = get_bits(gb, 4) + 1; diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 9d773d9..a6534a9 100644 +index 9d773d9..c4d7250 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { @@ -7059,7 +8997,68 @@ index 9d773d9..a6534a9 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -257,6 +371,8 @@ int i = 0; +@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ + PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) + ++#ifndef RPI ++#define SLICED_LOOP_FILTERS(depth) ++#else ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#endif ++ ++ + #define HEVC_DSP(depth) \ + hevcdsp->put_pcm = FUNC(put_pcm, depth); \ + hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ +@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ + hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ + hevcdsp->transform_skip = FUNC(transform_skip, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); \ ++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ ++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ ++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ ++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ ++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ ++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ ++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ ++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ + hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ + hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ + hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ +@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ + hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ + \ ++ hevcdsp->sao_band_filter_c[0] = \ ++ hevcdsp->sao_band_filter_c[1] = \ ++ hevcdsp->sao_band_filter_c[2] = \ ++ hevcdsp->sao_band_filter_c[3] = \ ++ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[0] = \ ++ hevcdsp->sao_edge_filter_c[1] = \ ++ hevcdsp->sao_edge_filter_c[2] = \ ++ hevcdsp->sao_edge_filter_c[3] = \ ++ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ ++ \ + QPEL_FUNCS(depth); \ + 
QPEL_UNI_FUNCS(depth); \ + QPEL_BI_FUNCS(depth); \ +@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + EPEL_UNI_FUNCS(depth); \ + EPEL_BI_FUNCS(depth); \ + \ ++ SLICED_LOOP_FILTERS(depth); \ + hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ + hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ + hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ +@@ -257,6 +404,8 @@ int i = 0; break; } @@ -7069,10 +9068,10 @@ index 9d773d9..a6534a9 100644 ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h -index 9f1f6dd..e221e54 100644 +index 9f1f6dd..639ecf1 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h -@@ -42,6 +42,17 @@ typedef struct SAOParams { +@@ -42,11 +42,26 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -7090,21 +9089,742 @@ index 9f1f6dd..e221e54 100644 typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int pcm_bit_depth); -@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext { ++ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, ++ struct GetBitContext *gb, int pcm_bit_depth); + +- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); ++ void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); + + void (*transform_skip)(int16_t *coeffs, int16_t log2_size); + +@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext { + + void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); + + /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ + void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int sao_eo_class, int width, int height); ++ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); + + void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, ++ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); + + void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width); +@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); ++#ifdef RPI ++ void (*hevc_v_loop_filter_luma2)(uint8_t * 
_pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++#endif ++ + void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, + int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, + MvField *curr, MvField *neigh, uint8_t *bs); } HEVCDSPContext; void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); +diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c +index b840d17..32b9e47 100644 +--- a/libavcodec/hevcdsp_template.c ++++ b/libavcodec/hevcdsp_template.c +@@ -26,6 +26,9 @@ + #include "bit_depth_template.c" + #include "hevcdsp.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif + + static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) +@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height + } + } + ++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, ++ GetBitContext *gb, int pcm_bit_depth) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); ++ dst += stride; ++ } ++ ++ dst = (pixel *)_dst + 1; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); ++ dst += stride; ++ } ++} ++ ++ + static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride, int size) + { +@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe + } + } + ++static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + *res); ++ res++; ++ } ++ dst += stride; ++ } ++} ++ + static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride) + { +@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, + FUNC(transquant_bypass)(_dst, coeffs, stride, 32); + } + ++// -- U -- (plaited) ++ ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst, res, stride, 32); ++} ++ ++// -- V -- (plaited) ++ ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst + 1, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst + 1, res, stride, 
8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst + 1, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst + 1, res, stride, 32); ++} ++ + + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) + { +@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; +- int16_t *sao_offset_val = sao->offset_val[c_idx]; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, width = _width, height = _height; + +@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { +- int offset_val = sao_offset_val[0]; + for (y = 0; y < height; y++) { +- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); ++ dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { +- int offset_val = sao_offset_val[0]; + int offset = width - 1; + for (x = 0; x < height; x++) { +- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { +- int offset_val = sao_offset_val[0]; + for (x = init_x; x < width; x++) +- dst[x] = av_clip_pixel(src[x] + offset_val); ++ dst[x] = src[x]; + } + if (borders[3]) { +- int offset_val = sao_offset_val[0]; +- int y_stride_dst = stride_dst * (height - 1); +- int y_stride_src = stride_src * (height - 1); ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) +- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); ++ dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; +- int16_t *sao_offset_val = sao->offset_val[c_idx]; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, init_y = 0, width = _width, height = _height; + +@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { +- int offset_val = sao_offset_val[0]; + for (y = 0; y < height; y++) { +- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); ++ dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { +- int offset_val = sao_offset_val[0]; + int offset = width - 1; + for (x = 0; x < height; x++) { +- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { +- int offset_val = sao_offset_val[0]; + for (x = init_x; x < width; x++) +- dst[x] = av_clip_pixel(src[x] + offset_val); ++ dst[x] = src[x]; + init_y = 1; + } + if (borders[3]) { +- int offset_val = sao_offset_val[0]; +- int y_stride_dst = stride_dst * (height - 1); +- int y_stride_src = stride_src * (height - 1); ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) +- dst[x + 
y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); ++ dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } + } + ++ ++// --- Plaited chroma versions ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table_u[32] = { 0 }; ++ int offset_table_v[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ width *= 2; ++ ++ for (k = 0; k < 4; k++) ++ { ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ } ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) ++ { ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++#endif ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++ ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ width *= 2; ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int offset_valu = edge_idx[2 + diff0u + diff1u]; ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ int offset_valv = edge_idx[2 + diff0v + diff1v]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++#endif ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ 
int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++// Any old 2 byte 'normal' restore will work for these ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#endif ++ ++ + #undef CMP + + //////////////////////////////////////////////////////////////////////////////// +@@ -1694,3 +1900,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, + #undef TQ1 + #undef TQ2 + #undef TQ3 ++ ++#ifdef RPI ++ ++// line zero ++#define P3 pix_l[0 * xstride] ++#define P2 pix_l[1 * xstride] ++#define P1 pix_l[2 * xstride] ++#define P0 pix_l[3 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++#define Q2 pix_r[2 * xstride] ++#define Q3 pix_r[3 * xstride] ++ ++// line three. used only for deblocking decision ++#define TP3 pix_l[0 * xstride + 3 * ystride] ++#define TP2 pix_l[1 * xstride + 3 * ystride] ++#define TP1 pix_l[2 * xstride + 3 * ystride] ++#define TP0 pix_l[3 * xstride + 3 * ystride] ++#define TQ0 pix_r[0 * xstride + 3 * ystride] ++#define TQ1 pix_r[1 * xstride + 3 * ystride] ++#define TQ2 pix_r[2 * xstride + 3 * ystride] ++#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ ++// This is identical to hevc_loop_filter_luma except that the P/Q ++// components are on separate pointers ++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t _tc[2], ++ const uint8_t _no_p[2], const uint8_t _no_q[2], ++ uint8_t * _pix_l) ++{ ++ int d, j; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ const ptrdiff_t xstride = 1; ++ const ptrdiff_t ystride = _stride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; ++ ++ if (d0 + d3 >= beta) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 
4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++ } ++ } ++} ++ ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#define P1 pix_l[0 * xstride] ++#define P0 pix_l[1 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++ ++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, const int32_t *_tc, ++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); ++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); ++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); ++} ++ ++#undef P1 ++#undef P0 ++#undef 
Q0 ++#undef Q1 ++ ++ ++#endif ++ +diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c +index 02c1766..cea16ea 100644 +--- a/libavcodec/hevcpred.c ++++ b/libavcodec/hevcpred.c +@@ -24,6 +24,7 @@ + + #include "hevcpred.h" + ++#define PRED_C 0 + #define BIT_DEPTH 8 + #include "hevcpred_template.c" + #undef BIT_DEPTH +@@ -39,13 +40,37 @@ + #define BIT_DEPTH 12 + #include "hevcpred_template.c" + #undef BIT_DEPTH ++#undef PRED_C ++ ++#ifdef RPI ++#define PRED_C 1 ++#define BIT_DEPTH 8 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++#endif + + void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) + { + #undef FUNC + #define FUNC(a, depth) a ## _ ## depth + +-#define HEVC_PRED(depth) \ ++#undef FUNCC ++#define FUNCC(a, depth) a ## _ ## depth ## _c ++ ++#define HEVC_PRED_Y(depth) \ + hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ + hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ + hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ +@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) + hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ + hpc->pred_angular[3] = FUNC(pred_angular_3, depth); + ++#define HEVC_PRED_C(depth) \ ++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ ++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ ++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ ++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ ++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ ++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ ++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ ++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ ++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ ++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); ++ ++#ifdef RPI ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); \ ++ HEVC_PRED_C(depth); ++#else ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); ++#endif ++ + switch (bit_depth) { + case 9: + HEVC_PRED(9); +diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h +index eb17663..00ba3f9 100644 +--- a/libavcodec/hevcpred.h ++++ b/libavcodec/hevcpred.h +@@ -38,6 +38,17 @@ typedef struct HEVCPredContext { + void (*pred_angular[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx, int mode); ++#ifdef RPI ++ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx); ++ ++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int c_idx, int mode); ++#endif + } HEVCPredContext; + + void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c -index 6ae87cc..28d2653 100644 +index 6ae87cc..c14dddd 100644 --- a/libavcodec/hevcpred_template.c +++ b/libavcodec/hevcpred_template.c -@@ -20,6 +20,8 @@ +@@ -20,13 +20,55 @@ * Foundation, Inc., 51 Franklin Street, 
Fifth Floor, Boston, MA 02110-1301 USA */ @@ -7113,7 +9833,54 @@ index 6ae87cc..28d2653 100644 #include "libavutil/pixdesc.h" #include "bit_depth_template.c" -@@ -69,8 +71,11 @@ do { \ + #include "hevcpred.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ ++#define DUMP_PRED 0 ++ + #define POS(x, y) src[(x) + stride * (y)] + ++#if PRED_C ++ ++typedef uint8_t (* c8_dst_ptr_t)[2]; ++typedef const uint8_t (* c8_src_ptr_t)[2]; ++ ++#if BIT_DEPTH == 8 ++#undef BIT_DEPTH ++#define BIT_DEPTH 16 ++#include "bit_depth_template.c" ++#undef FUNC ++#define FUNC(a) FUNC3(a, 8, _c) ++#else ++#undef FUNC ++#define FUNC FUNCC ++#endif ++ ++#endif ++ ++#if DUMP_PRED ++#ifndef DEBUG_ONCE ++#define DEBUG_ONCE ++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) ++{ ++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { ++ for (unsigned int x = 0; x != size; x++) { ++ printf("%4d", data[x * 2]); ++ } ++ printf("\n"); ++ } ++ printf("\n"); ++} ++#endif ++#endif ++ + static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, + int log2_size, int c_idx) + { +@@ -69,8 +111,11 @@ do { \ AV_WN4P(&ptr[i], a); \ else \ a = PIXEL_SPLAT_X4(ptr[i + 3]) @@ -7126,17 +9893,399 @@ index 6ae87cc..28d2653 100644 int i; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; -@@ -114,6 +119,10 @@ do { \ +@@ -79,15 +124,23 @@ do { \ + int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; + int size_in_luma_v = size << vshift; + int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; +- int x = x0 >> hshift; +- int y = y0 >> vshift; ++ const int x = x0 >> hshift; ++ const int y = y0 >> vshift; + int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + + int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); + +- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); ++ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); ++#if defined(RPI) ++ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? ++ (pixel*)s->frame->data[c_idx] + x + y * stride : ++ c_idx == 0 ? 
++ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : ++ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++#else + pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; ++#endif + + int min_pu_width = s->ps.sps->min_pu_width; + +@@ -95,14 +148,20 @@ do { \ + lc->tu.intra_pred_mode; + pixel4 a; + pixel left_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C + pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; ++#endif + pixel top_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C + pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; ++#endif + + pixel *left = left_array + 1; + pixel *top = top_array + 1; ++#if !PRED_C + pixel *filtered_left = filtered_left_array + 1; + pixel *filtered_top = filtered_top_array + 1; ++#endif + int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); + int cand_left = lc->na.cand_left; + int cand_up_left = lc->na.cand_up_left; +@@ -114,6 +173,26 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; ++ pixel * src_l = src - 1; ++ pixel * src_u = src - stride; ++ pixel * src_ur = src_u + size; ++ +#ifdef DISABLE_INTRA + return; +#endif ++ ++#if defined(RPI) ++ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ const AVFrame * const frame = s->frame; ++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 ++ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ if ((x & mask) == 0) ++ src_l -= stripe_adj; ++ if (((x + size) & mask) == 0) ++ src_ur += stripe_adj; ++ } ++#endif + if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); +@@ -163,23 +242,24 @@ do { \ + top[-1] = 128; + } + if (cand_up_left) { +- left[-1] = POS(-1, -1); ++ left[-1] = src_l[-stride]; + top[-1] = left[-1]; + } + if (cand_up) +- memcpy(top, src - stride, size * sizeof(pixel)); ++ // Always good - even with sand ++ memcpy(top, src_u, size * sizeof(pixel)); + if (cand_up_right) { +- memcpy(top + size, src - stride + size, size * sizeof(pixel)); +- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), ++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], + size - top_right_size); + } + if (cand_left) + for (i = 0; i < size; i++) +- left[i] = POS(-1, i); ++ left[i] = src_l[stride * i]; + if (cand_bottom_left) { + for (i = size; i < size + bottom_left_size; i++) +- left[i] = POS(-1, i); +- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), ++ left[i] = src_l[stride * i]; ++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], + size - bottom_left_size); + } + +@@ -268,7 +348,11 @@ do { \ + cand_up_left = 1; + cand_left = 1; + } else { // No samples available ++#if PRED_C && BIT_DEPTH == 16 ++ left[-1] = 0x8080; ++#else + left[-1] = (1 << (BIT_DEPTH - 1)); ++#endif + EXTEND(top, left[-1], 2 * size); + EXTEND(left, left[-1], 2 * size); + } +@@ -287,6 +371,9 @@ do { \ + top[-1] = left[-1]; + + // Filtering process ++ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // worry about chroma smoothing for that case ++#if !PRED_C + if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { + if (mode != INTRA_DC && size != 4){ + int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; +@@ -342,13 +429,46 @@ do { \ + mode); + break; + } ++#else ++ 
switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, log2_size, c_idx); ++ break; ++ default: ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, c_idx, ++ mode); ++ break; ++ } ++ ++#if DUMP_PRED ++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); ++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); ++#endif ++#endif + } + ++#if !PRED_C || BIT_DEPTH == 16 + #define INTRA_PRED(size) \ + static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ + { \ + FUNC(intra_pred)(s, x0, y0, size, c_idx); \ + } ++#else ++#define INTRA_PRED(size) \ ++static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ ++{ \ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#endif + + INTRA_PRED(2) + INTRA_PRED(3) +@@ -357,6 +477,7 @@ INTRA_PRED(5) + + #undef INTRA_PRED + ++#if !PRED_C + static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, ptrdiff_t stride, + int trafo_size) +@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to + POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); + } ++#else ++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, ++ const uint8_t * _left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ int size = 1 << trafo_size; ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ const c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + ++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); ++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + ++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); ++ } ++ } ++} ++#endif + ++#if !PRED_C || BIT_DEPTH == 16 + #define PRED_PLANAR(size)\ + static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride) \ + { \ + FUNC(pred_planar)(src, top, left, stride, size + 2); \ + } ++#else ++#define PRED_PLANAR(size)\ ++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ ++ abort(); \ ++} ++#endif + + PRED_PLANAR(0) + PRED_PLANAR(1) +@@ -386,6 +540,7 @@ PRED_PLANAR(3) + + #undef PRED_PLANAR + ++#if !PRED_C + static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, + ptrdiff_t stride, int log2_size, int c_idx) +@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + POS(0, y) = (left[y] + 3 * dc + 2) >> 2; + } + } ++#else ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size, int c_idx) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ const c8_src_ptr_t top = (c8_src_ptr_t)_top; 
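++ // (Editor's note, not in the original patch) In this PRED_C variant each
++ // element of src/top/left is an interleaved {U,V} byte pair (c8_dst_ptr_t /
++ // c8_src_ptr_t above), so the DC value is accumulated and written separately
++ // per component - dc0 for U and dc1 for V - in the loops that follow.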
++ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ unsigned int dc0 = size; ++ unsigned int dc1 = size; ++ ++ for (i = 0; i < size; i++) ++ { ++ dc0 += left[i][0] + top[i][0]; ++ dc1 += left[i][1] + top[i][1]; ++ } ++ ++ dc0 >>= log2_size + 1; ++ dc1 >>= log2_size + 1; ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = dc0; ++ src[j][1] = dc1; + ++ } ++ } ++} ++#endif ++ ++#ifndef ANGLE_CONSTS ++#define ANGLE_CONSTS ++static const int intra_pred_angle[] = { ++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++}; ++static const int inv_angle[] = { ++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++ -630, -910, -1638, -4096 ++}; ++#endif ++ ++#if !PRED_C + static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, +@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const pixel *top = (const pixel *)_top; + const pixel *left = (const pixel *)_left; + +- static const int intra_pred_angle[] = { +- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +- }; +- static const int inv_angle[] = { +- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +- -630, -910, -1638, -4096 +- }; +- + int angle = intra_pred_angle[mode - 2]; + pixel ref_array[3 * MAX_TB_SIZE + 4]; + pixel *ref_tmp = ref_array + size; +@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + } + } + } ++#else ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int c_idx, ++ int mode, int size) ++{ ++ int x, y; ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ ++ const int angle = intra_pred_angle[mode - 2]; ++ uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c8_dst_ptr_t ref_tmp = ref_array + size; ++ c8_src_ptr_t ref; ++ const int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c8_src_ptr_t)ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++, src += stride) { ++ const int idx = ((y + 1) * angle) >> 5; ++ const int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; ++x) { ++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + ++ fact * ref[x + idx + 2][0] + 16) >> 5; ++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + ++ fact * ref[x + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ memcpy(src, ref + idx + 1, size * 2); ++ } ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c8_src_ptr_t)ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++, src++) { ++ const int idx = ((x + 1) * angle) >> 5; ++ const int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + ++ fact * 
ref[y + idx + 2][0] + 16) >> 5; ++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + ++ fact * ref[y + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ { ++ src[y * stride][0] = ref[y + idx + 1][0]; ++ src[y * stride][1] = ref[y + idx + 1][1]; ++ } ++ } ++ } ++ } ++} ++#endif + + static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, + const uint8_t *left, diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c index 099a8c5..bdff2d2 100644 --- a/libavcodec/mmaldec.c @@ -7181,6 +10330,87 @@ index 3adf28d..2f9195f 100644 if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && s->codec_id == AV_CODEC_ID_MPEG4 && avctx->idct_algo == FF_IDCT_AUTO) { +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index bfa2537..1bca89e 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { + { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, + { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + ++ /* RPI */ ++#ifdef RPI ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++#endif ++ + /* special */ + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index d837056..81256b5 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS + return 0; + } + ++static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off) ++{ ++ for (int y = 0; y != frame->height / 2; ++y) { ++ for (int x = 0; x < frame->width; x += frame->linesize[0]) { ++ const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off; ++ const int w = FFMIN(frame->linesize[0], frame->width - x) / 2; ++ for (int i = 0; i < w; ++i) ++ *dst++ = p[i * 2]; ++ } ++ } ++ return dst; ++} ++ ++static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ int size = frame->width * frame->height * 3 / 2; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ // Luma is "easy" ++ for (int y = 0; y != frame->height; ++y) { ++ for (int x = 0; x < frame->width; x += frame->linesize[0]) { ++ const int w = FFMIN(frame->linesize[0], frame->width - x); ++ memcpy(dst, ++ frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w); ++ dst += w; ++ } ++ } ++ // Chroma is dull ++ dst = cpy_sand_c(dst, frame, 0); ++ dst = cpy_sand_c(dst, frame, 1); ++ ++ return 0; ++} ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame, int *got_packet) + { +@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + if (ret < 0) + return ret; + ++ if (frame->format == AV_PIX_FMT_SAND128) { ++ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++ *got_packet = (ret == 0); ++ return ret; ++ } ++ + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) + return ret; + if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h new file mode 100644 index 0000000..4309f1c @@ -11182,10 +14412,10 @@ index 0000000..5543093 + pop r6-r7, pc diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..3904efc +index 0000000..0255f5d --- /dev/null +++ b/libavcodec/rpi_mailbox.c -@@ -0,0 +1,340 @@ +@@ -0,0 +1,149 @@ +/* 
+Copyright (c) 2012, Broadcom Europe Ltd. +All rights reserved. @@ -11213,6 +14443,8 @@ index 0000000..3904efc +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + ++#ifdef RPI ++ +#include +#include +#include @@ -11220,7 +14452,6 @@ index 0000000..3904efc +#include +#include +#include -+#include +#include + +#include @@ -11230,75 +14461,7 @@ index 0000000..3904efc +#define DEVICE_FILE_NAME "/dev/vcio" + +#include "rpi_mailbox.h" -+ -+#define PAGE_SIZE (4*1024) -+ -+// Shared memory will not be cached in ARM cache -+void *mapmem_shared(unsigned base, unsigned size) -+{ -+ int mem_fd; -+ unsigned offset = base % PAGE_SIZE; -+ base = base - offset; -+ /* open /dev/mem */ -+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) { -+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n"); -+ return NULL; -+ } -+ void *mem = mmap( -+ 0, -+ size, -+ PROT_READ|PROT_WRITE, -+ MAP_SHARED/*|MAP_FIXED*/, -+ mem_fd, -+ base); -+#ifdef DEBUG -+ printf("base=0x%x, mem=%p\n", base, mem); -+#endif -+ if (mem == MAP_FAILED) { -+ printf("mmap error %d\n", (int)mem); -+ return NULL; -+ } -+ close(mem_fd); -+ return (char *)mem + offset; -+} -+ -+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing -+void *mapmem_private(unsigned base, unsigned size) -+{ -+ int mem_fd; -+ unsigned offset = base % PAGE_SIZE; -+ base = base - offset; -+ /* open /dev/mem */ -+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) { -+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n"); -+ return NULL; -+ } -+ void *mem = mmap( -+ 0, -+ size, -+ PROT_READ|PROT_WRITE, -+ MAP_PRIVATE/*|MAP_FIXED*/, -+ mem_fd, -+ base); -+#ifdef DEBUG -+ printf("base=0x%x, mem=%p\n", base, mem); -+#endif -+ if (mem == MAP_FAILED) { -+ printf("mmap error %d\n", (int)mem); -+ return NULL; -+ } -+ close(mem_fd); -+ return (char *)mem + offset; -+} -+ -+void unmapmem(void *addr, unsigned size) -+{ -+ int s = munmap(addr, size); -+ if (s != 0) { -+ printf("munmap error %d\n", s); -+ exit (-1); -+ } -+} ++//#include + +/* + * use ioctl to send mbox property message @@ -11320,47 +14483,7 @@ index 0000000..3904efc + return ret_val; +} + -+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags) -+{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x3000c; // (the tag id) -+ p[i++] = 12; // (size of the buffer) -+ p[i++] = 12; // (size of the data) -+ p[i++] = size; // (num bytes? or pages?) 
-+ p[i++] = align; // (alignment) -+ p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING) -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned mem_free(int file_desc, unsigned handle) -+{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x3000f; // (the tag id) -+ p[i++] = 4; // (size of the buffer) -+ p[i++] = 4; // (size of the data) -+ p[i++] = handle; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned mem_lock(int file_desc, unsigned handle) ++unsigned mbox_mem_lock(int file_desc, unsigned handle) +{ + int i=0; + unsigned p[32]; @@ -11379,7 +14502,7 @@ index 0000000..3904efc + return p[5]; +} + -+unsigned mem_unlock(int file_desc, unsigned handle) ++unsigned mbox_mem_unlock(int file_desc, unsigned handle) +{ + int i=0; + unsigned p[32]; @@ -11398,117 +14521,30 @@ index 0000000..3904efc + return p[5]; +} + -+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5) ++#define GET_VCIMAGE_PARAMS 0x30044 ++ ++int mbox_get_image_params(int fd, VC_IMAGE_T * img) +{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request ++ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32]; ++ uint32_t * p = buf; ++ void * rimg; ++ int rv; + -+ p[i++] = 0x30010; // (the tag id) -+ p[i++] = 28; // (size of the buffer) -+ p[i++] = 28; // (size of the data) -+ p[i++] = code; -+ p[i++] = r0; -+ p[i++] = r1; -+ p[i++] = r2; -+ p[i++] = r3; -+ p[i++] = r4; -+ p[i++] = r5; ++ *p++ = 0; // size ++ *p++ = 0; // process request ++ *p++ = GET_VCIMAGE_PARAMS; ++ *p++ = sizeof(*img); ++ *p++ = sizeof(*img); ++ rimg = p; ++ memcpy(p, img, sizeof(*img)); ++ p += sizeof(*img) / sizeof(*p); ++ *p++ = 0; // End tag ++ buf[0] = (p - buf) * sizeof(*p); + -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size ++ rv = mbox_property(fd, buf); ++ memcpy(img, rimg, sizeof(*img)); + -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned qpu_enable(int file_desc, unsigned enable) -+{ -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x30012; // (the tag id) -+ p[i++] = 4; // (size of the buffer) -+ p[i++] = 4; // (size of the data) -+ p[i++] = enable; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) { -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ p[i++] = 0x30011; // (the tag id) -+ p[i++] = 16; // (size of the buffer) -+ p[i++] = 16; // (size of the data) -+ p[i++] = num_qpus; -+ p[i++] = control; -+ p[i++] = noflush; -+ p[i++] = timeout; // ms -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+void execute_multi(int file_desc, -+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout, -+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2, -+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, -+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, 
unsigned r4_2, unsigned r5_2) { -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ p[i++] = 0x30018; // (the tag id) -+ p[i++] = 88; // (size of the buffer) -+ p[i++] = 88; // (size of the data) -+ -+ p[i++] = num_qpus; -+ p[i++] = control; -+ p[i++] = noflush; -+ p[i++] = timeout; // ms -+ -+ p[i++] = num_qpus_2; -+ p[i++] = control_2; -+ p[i++] = noflush_2; -+ p[i++] = timeout_2; // ms -+ -+ p[i++] = code; -+ p[i++] = r0; -+ p[i++] = r1; -+ p[i++] = r2; -+ p[i++] = r3; -+ p[i++] = r4; -+ p[i++] = r5; -+ -+ p[i++] = code_2; -+ p[i++] = r0_2; -+ p[i++] = r1_2; -+ p[i++] = r2_2; -+ p[i++] = r3_2; -+ p[i++] = r4_2; -+ p[i++] = r5_2; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return; ++ return rv; +} + +int mbox_open() { @@ -11526,55 +14562,80 @@ index 0000000..3904efc +void mbox_close(int file_desc) { + close(file_desc); +} ++ ++#endif ++ diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..5898102 +index 0000000..b316878 --- /dev/null +++ b/libavcodec/rpi_mailbox.h -@@ -0,0 +1,25 @@ +@@ -0,0 +1,58 @@ +#ifndef RPI_MAILBOX_H +#define RPI_MAILBOX_H + ++/* The image structure. */ ++typedef struct vc_image_extra_uv_s { ++ void *u, *v; ++ int vpitch; ++} VC_IMAGE_EXTRA_UV_T; ++ ++typedef union { ++ VC_IMAGE_EXTRA_UV_T uv; ++// VC_IMAGE_EXTRA_RGBA_T rgba; ++// VC_IMAGE_EXTRA_PAL_T pal; ++// VC_IMAGE_EXTRA_TF_T tf; ++// VC_IMAGE_EXTRA_BAYER_T bayer; ++// VC_IMAGE_EXTRA_MSBAYER_T msbayer; ++// VC_IMAGE_EXTRA_CODEC_T codec; ++// VC_IMAGE_EXTRA_OPENGL_T opengl; ++} VC_IMAGE_EXTRA_T; ++ ++ ++typedef struct VC_IMAGE_T { ++ unsigned short type; /* should restrict to 16 bits */ ++ unsigned short info; /* format-specific info; zero for VC02 behaviour */ ++ unsigned short width; /* width in pixels */ ++ unsigned short height; /* height in pixels */ ++ int pitch; /* pitch of image_data array in bytes */ ++ int size; /* number of bytes available in image_data array */ ++ void *image_data; /* pixel data */ ++ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */ ++ void *metadata; /* metadata header for the image */ ++ void *pool_object; /* nonNULL if image was allocated from a vc_pool */ ++ int mem_handle; /* the mem handle for relocatable memory storage */ ++ int metadata_size; /* size of metadata of each channel in bytes */ ++ int channel_offset; /* offset of consecutive channels in bytes */ ++ uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */ ++ uint8_t num_channels; /* number of channels (2 for stereo) */ ++ uint8_t current_channel;/* the channel this header is currently pointing to */ ++ uint8_t linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/ ++ uint8_t is_channel_linked; /* Track if the above structure is been used to link the header ++ into a linked-mulitchannel image */ ++ uint8_t channel_index; /* index of the channel this header represents while ++ it is being linked. 
*/ ++ uint8_t _dummy[3]; /* pad struct to 64 bytes */ ++} VC_IMAGE_T; ++ ++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; ++ ++ +extern int mbox_open(void); +extern void mbox_close(int file_desc); + -+extern unsigned get_version(int file_desc); -+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags); -+extern unsigned mem_free(int file_desc, unsigned handle); -+extern unsigned mem_lock(int file_desc, unsigned handle); -+extern unsigned mem_unlock(int file_desc, unsigned handle); -+extern void *mapmem_shared(unsigned base, unsigned size); -+extern void *mapmem_private(unsigned base, unsigned size); -+extern void unmapmem(void *addr, unsigned size); ++extern unsigned mbox_mem_lock(int file_desc, unsigned handle); ++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); + -+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5); -+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout); -+extern void execute_multi(int file_desc, -+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout, -+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2, -+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, -+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2); -+extern unsigned qpu_enable(int file_desc, unsigned enable); ++int mbox_get_image_params(int fd, VC_IMAGE_T * img); + +#endif diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..365f4a6 +index 0000000..7c0eedd --- /dev/null +++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,993 @@ +@@ -0,0 +1,902 @@ +#ifdef RPI -+// Use vchiq service for submitting jobs -+#define GPUSERVICE -+ -+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device. 
-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code -+//#define RPI_TIME_TOTAL_QPU -+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code -+//#define RPI_TIME_TOTAL_VPU -+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined -+#define RPI_TIME_TOTAL_POSTED -+ +#include +#include +#include @@ -11587,27 +14648,35 @@ index 0000000..365f4a6 +#include +#include + ++#include ++ +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" +#include "rpi_hevc_transform.h" ++#include "rpi_zc.h" + -+#include "rpi_user_vcsm.h" -+#ifdef GPUSERVICE +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" +#include "interface/vmcs_host/vc_vchi_gpuserv.h" +#pragma GCC diagnostic pop -+#endif + -+// QPU profile flags -+#define NO_FLUSH 1 -+#define CLEAR_PROFILE 2 -+#define OUTPUT_COUNTS 4 ++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) ++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 + -+#define FLAGS_FOR_PROFILING (NO_FLUSH) ++// Add profile flags to all QPU requests - generates output in "vcdbg log msg" ++// Beware this is expensive and will probably throw off all other timing by >10% ++#define RPI_TRACE_QPU_PROFILE_ALL 0 + ++// QPU "noflush" flags ++// a mixture of flushing & profiling ++ ++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed ++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers ++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results ++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling ++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + +// On Pi2 there is no way to access the VPU L2 cache +// GPU_MEM_FLG should be 4 for uncached memory. 
(Or C for alias to allocate in the VPU L2 cache) @@ -11664,65 +14733,223 @@ index 0000000..365f4a6 +{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} +}; + ++// Code/constants on GPU +struct GPU +{ + unsigned int qpu_code[QPU_CODE_SIZE]; + unsigned int vpu_code[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; -+ int open_count; // Number of allocated video buffers -+ int mb; // Mailbox handle -+ int vc; // Address in GPU memory -+ int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task -+ int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task +}; + ++#define CFE_ENTS_PER_A 8 ++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices ++// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70 ++// allow 128 ++#define CFE_ENT_COUNT 128 ++#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) ++ ++struct rpi_cache_flush_env_s { ++ unsigned int n; ++ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++}; ++ ++#define WAIT_COUNT_MAX 16 ++ ++typedef struct trace_time_one_s ++{ ++ int count; ++ int64_t start[WAIT_COUNT_MAX]; ++ int64_t total[WAIT_COUNT_MAX]; ++} trace_time_one_t; ++ ++typedef struct trace_time_wait_s ++{ ++ unsigned int jcount; ++ int64_t start0; ++ int64_t last_update; ++ trace_time_one_t active; ++ trace_time_one_t wait; ++} trace_time_wait_t; ++ ++typedef struct vq_wait_s ++{ ++ sem_t sem; ++ unsigned int cost; ++ struct vq_wait_s * next; ++} vq_wait_t; ++ ++#define VQ_WAIT_POOL_SIZE 16 ++typedef struct vq_wait_pool_s ++{ ++ vq_wait_t * head; ++ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; ++} vq_wait_pool_t; ++ ++static void vq_wait_pool_init(vq_wait_pool_t * const pool); ++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); ++ ++typedef struct gpu_env_s ++{ ++ int open_count; ++ int init_count; ++ int mb; ++ unsigned int current_load; ++ GPU_MEM_PTR_T code_gm_ptr; ++ vq_wait_pool_t wait_pool; ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ trace_time_wait_t ttw; ++#endif ++} gpu_env_t; ++ +// Stop more than one thread trying to allocate memory or use the processing resources at once +static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; -+static volatile struct GPU* gpu = NULL; -+static GPU_MEM_PTR_T gpu_mem_ptr; ++static gpu_env_t * gpu = NULL; + -+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED) -+static unsigned int Microseconds(void) { ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ ++static int64_t ns_time(void) ++{ + struct timespec ts; -+ unsigned int x; -+ static unsigned int base = 0; -+ clock_gettime(CLOCK_REALTIME, &ts); -+ x = ts.tv_sec*1000000 + ts.tv_nsec/1000; -+ if (base==0) base=x; -+ return x-base; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; +} ++ ++ ++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 ++ ++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) ++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) ++#define T_ARG(t) T_SEC(t), T_MS(t) ++#define T_FMT "%u.%03u" ++ ++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) ++{ ++ // Update totals for levels that are still pending ++ for (int i = 0; i < tto->count; ++i) { ++ tto->total[i] += now - tto->start[i]; ++ tto->start[i] = now; ++ } ++ ++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", ++ prefix, ++ T_ARG(now - start0 - tto->total[0]), ++ 
T_ARG(tto->total[0]),
++ T_ARG(tto->total[1]),
++ T_ARG(tto->total[2]),
++ T_ARG(tto->total[3]));
++}
++
++
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++ av_assert0(tto->count < WAIT_COUNT_MAX);
++ tto->start[tto->count++] = now;
++}
++
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++ const int n = --tto->count;
++ av_assert0(n >= 0);
++ tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++ tto_print(&ttw->active, now, ttw->start0, "Active");
++ tto_print(&ttw->wait, now, ttw->start0, " Wait");
++}
++
+#endif
+
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
++// GPU memory alloc fns (internal)
++
++// GPU_MEM_PTR_T alloc fns
++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = numbytes;
++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++ return 0;
++}
++
++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = numbytes;
++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++ return 0;
++}
++
++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
++ mbox_mem_unlock(mb, p->vc_handle);
++ vcsm_unlock_ptr(p->arm);
++ vcsm_free(p->vcsm_handle);
++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
++}
++
++
++// GPU init, free, lock, unlock
++
++static void gpu_term(void)
++{
++ gpu_env_t * const ge = gpu;
++
++ // We have to hope that everything has terminated...
++ gpu = NULL;
++
++ vc_gpuserv_deinit();
++
++ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
++
++ vcsm_exit();
++
++ mbox_close(ge->mb);
++
++ vq_wait_pool_deinit(&ge->wait_pool);
++
++ free(ge);
++}
++
+
+// Connect to QPU, returns 0 on success.
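++// (Editor's note, not in the original patch) gpu_init() runs at most once,
++// under gpu_mutex via gpu_lock_ref(): it opens the mailbox, sets up the wait
++// pool and vcsm, then copies the QPU shader, the VPU transform code and the
++// transform matrices into a single uncached GPU memory block (code_gm_ptr),
++// whose bus address is later returned by vpu_get_fn()/vpu_get_constants().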
-+static int gpu_init(volatile struct GPU **gpu) { -+ int mb = mbox_open(); -+ int vc; ++static int gpu_init(gpu_env_t ** const gpu) { + volatile struct GPU* ptr; -+ if (mb < 0) -+ return -1; -+#ifndef RPI_ASYNC -+ if (qpu_enable(mb, 1)) return -2; -+#endif ++ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t)); ++ *gpu = NULL; ++ ++ if (ge == NULL) ++ return -1; ++ ++ if ((ge->mb = mbox_open()) < 0) ++ return -1; ++ ++ vq_wait_pool_init(&ge->wait_pool); ++ + vcsm_init(); -+ vc_gpuserv_init(); -+ gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb); -+ ptr = (volatile struct GPU*)gpu_mem_ptr.arm; -+ memset((void*)ptr, 0, sizeof *ptr); -+ vc = gpu_mem_ptr.vc; + -+ ptr->mb = mb; -+ ptr->vc = vc; ++ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr); ++ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm; + -+ printf("GPU allocated at 0x%x\n",vc); -+ -+ *gpu = ptr; ++ // Zero everything so we have zeros between the code bits ++ memset((void *)ptr, 0, sizeof(*ptr)); + + // Now copy over the QPU code into GPU memory + { -+ int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV); ++ int num_bytes = (char *)mc_end - (char *)rpi_shader; + av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int)); + memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes); + } @@ -11735,106 +14962,56 @@ index 0000000..365f4a6 + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); + -+#ifdef RPI_ASYNC -+ { -+ int err; -+ vpu_async_tail = 0; -+ vpu_async_head = 0; -+ err = pthread_create(&vpu_thread, NULL, vpu_start, NULL); -+ //printf("Created thread\n"); -+ if (err) { -+ av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n"); -+ return -4; -+ } -+ -+ { -+ struct sched_param param = {0}; -+ int policy = 0; -+ -+ if (pthread_getschedparam(vpu_thread, &policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ -+ policy = SCHED_FIFO; -+ param.sched_priority = sched_get_priority_max(SCHED_FIFO); -+ -+ av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ -+ if (pthread_setschedparam(vpu_thread, policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ if (pthread_getschedparam(vpu_thread, &policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ } -+ } -+ } -+ -+ } -+ -+ } -+#endif -+ ++ *gpu = ge; + return 0; +} + -+// Returns 1 if the gpu is currently idle -+static int gpu_idle(void) -+{ -+ int ret = pthread_mutex_trylock(&gpu_mutex); -+ if (ret==0) { -+ pthread_mutex_unlock(&gpu_mutex); -+ return 1; -+ } -+ return 0; -+} + -+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. 
-+static void gpu_lock(void) { -+ pthread_mutex_lock(&gpu_mutex); -+ -+ if (gpu==NULL) { -+ gpu_init(&gpu); -+ } -+} + +static void gpu_unlock(void) { + pthread_mutex_unlock(&gpu_mutex); +} + -+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); -+ av_assert0(p->vcsm_handle); -+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); -+ av_assert0(p->vc_handle); -+ p->arm = vcsm_lock(p->vcsm_handle); -+ av_assert0(p->arm); -+ p->vc = mem_lock(mb, p->vc_handle); -+ av_assert0(p->vc); -+ return 0; ++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. ++static gpu_env_t * gpu_lock(void) { ++ pthread_mutex_lock(&gpu_mutex); ++ ++ av_assert0(gpu != NULL); ++ return gpu; +} + ++static gpu_env_t * gpu_lock_ref(void) ++{ ++ pthread_mutex_lock(&gpu_mutex); ++ ++ if (gpu == NULL) { ++ int rv = gpu_init(&gpu); ++ if (rv != 0) { ++ gpu_unlock(); ++ return NULL; ++ } ++ } ++ ++ ++gpu->open_count; ++ return gpu; ++} ++ ++static void gpu_unlock_unref(gpu_env_t * const ge) ++{ ++ if (--ge->open_count == 0) ++ gpu_term(); ++ ++ gpu_unlock(); ++} ++ ++static inline gpu_env_t * gpu_ptr(void) ++{ ++ av_assert0(gpu != NULL); ++ return gpu; ++} ++ ++// Public gpu fns ++ +// Allocate memory on GPU +// Fills in structure
containing ARM pointer, videocore handle, videocore memory address, numbytes +// Returns 0 on success. @@ -11843,732 +15020,532 @@ index 0000000..365f4a6 +int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) +{ + int r; -+ gpu_lock(); -+ r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb); -+ gpu->open_count++; ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p); + gpu_unlock(); + return r; +} + -+int gpu_get_mailbox(void) -+{ -+ av_assert0(gpu); -+ return gpu->mb; -+} -+ -+// Call this to clean and invalidate a region of memory -+void gpu_cache_flush(const GPU_MEM_PTR_T * const p) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ iocache.s[0].handle = p->vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int) p->arm; -+ iocache.s[0].size = p->numbytes; -+ vcsm_clean_invalid( &iocache ); -+#else -+ void *tmp = vcsm_lock(p->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+#endif -+} -+ -+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ iocache.s[0].handle = p0->vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int) p0->arm; -+ iocache.s[0].size = p0->numbytes; -+ iocache.s[1].handle = p1->vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int) p1->arm; -+ iocache.s[1].size = p1->numbytes; -+ iocache.s[2].handle = p2->vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int) p2->arm; -+ iocache.s[2].size = p2->numbytes; -+ vcsm_clean_invalid( &iocache ); -+#else -+ void *tmp; -+ tmp = vcsm_lock(p0->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+ tmp = vcsm_lock(p1->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+ tmp = vcsm_lock(p2->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+#endif -+} -+ -+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); -+ av_assert0(p->vcsm_handle); -+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); -+ av_assert0(p->vc_handle); -+ p->arm = vcsm_lock(p->vcsm_handle); -+ av_assert0(p->arm); -+ p->vc = mem_lock(gpu->mb, p->vc_handle); -+ av_assert0(p->vc); -+ return 0; -+} -+ +// This allocates data that will be +// Cached in ARM L2 +// Uncached in VPU L2 +int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) +{ + int r; -+ gpu_lock(); -+ r = gpu_malloc_cached_internal(numbytes, p); -+ gpu->open_count++; ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ r = gpu_malloc_cached_internal(ge->mb, numbytes, p); + gpu_unlock(); + return r; +} + -+static void gpu_term(void) -+{ -+ int mb; -+ -+ if (gpu==NULL) -+ return; -+ mb = gpu->mb; -+ -+ // ??? 
Tear down anything needed for gpuexecute -+ -+ qpu_enable(mb, 0); -+ gpu_free_internal(&gpu_mem_ptr); -+ -+ vc_gpuserv_deinit(); -+ vcsm_exit(); -+ -+ mbox_close(mb); -+ gpu = NULL; -+} -+ -+void gpu_free_internal(GPU_MEM_PTR_T *p) { -+ int mb = gpu->mb; -+ mem_unlock(mb,p->vc_handle); -+ vcsm_unlock_ptr(p->arm); -+ vcsm_free(p->vcsm_handle); -+} -+ -+void gpu_free(GPU_MEM_PTR_T *p) { -+ gpu_lock(); -+ -+ gpu_free_internal(p); -+ -+ gpu->open_count--; -+ if (gpu->open_count==0) { -+ printf("Closing GPU\n"); -+ gpu_term(); -+ gpu = NULL; -+ } -+ gpu_unlock(); ++void gpu_free(GPU_MEM_PTR_T * const p) { ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_free_internal(ge->mb, p); ++ gpu_unlock_unref(ge); +} + +unsigned int vpu_get_fn(void) { + // Make sure that the gpu is initialized -+ if (gpu==NULL) { -+ printf("Preparing gpu\n"); -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ return gpu->vc + offsetof(struct GPU,vpu_code); ++ av_assert0(gpu != NULL); ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); +} + +unsigned int vpu_get_constants(void) { -+ if (gpu==NULL) { -+ gpu_lock(); ++ av_assert0(gpu != NULL); ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even); ++} ++ ++int gpu_get_mailbox(void) ++{ ++ av_assert0(gpu); ++ return gpu->mb; ++} ++ ++void gpu_ref(void) ++{ ++ gpu_lock_ref(); ++ gpu_unlock(); ++} ++ ++void gpu_unref(void) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_unlock_unref(ge); ++} ++ ++// ---------------------------------------------------------------------------- ++// ++// Cache flush functions ++ ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init() ++{ ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); ++ if (rfe == NULL) ++ return NULL; ++ ++ rfe->n = 0; ++ return rfe; ++} ++ ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) ++{ ++ if (rfe != NULL) ++ free(rfe); ++} ++ ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = 0; ++ unsigned int na; ++ unsigned int nr; ++ ++ // Clear any reamaining ents in the final block ++ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) ++ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ ++ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) ++ { ++ if (vcsm_clean_invalid(rfe->a + na) != 0) ++ rc = -1; ++ } ++ ++ free(rfe); ++ ++ if (rc == 0) ++ return 0; ++ ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); ++ return rc; ++} ++ ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ // Deal with empty pointer trivially ++ if (gm == NULL || gm->numbytes == 0) ++ return; ++ ++ { ++ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); ++ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ ++ av_assert0(rfe->n < CFE_ENT_COUNT); ++ ++ a->s[n].cmd = mode; ++ a->s[n].handle = gm->vcsm_handle; ++ a->s[n].addr = (unsigned int)gm->arm; ++ a->s[n].size = gm->numbytes; ++ ++rfe->n; ++ } ++} ++ ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset, const unsigned int size) ++{ ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; ++ ++// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); ++ av_assert0(offset + size <= gm->numbytes); ++ ++ { ++ struct 
vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); ++ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ ++ av_assert0(rfe->n < CFE_ENT_COUNT); ++ ++ a->s[n].cmd = mode; ++ a->s[n].handle = gm->vcsm_handle; ++ a->s[n].addr = (unsigned int)gm->arm + offset; ++ a->s[n].size = size; ++ ++rfe->n; ++ } ++} ++ ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) ++{ ++#if !RPI_ONE_BUF ++#error Fixme! (NIF) ++#endif ++ if (gpu_is_buf1(frame)) { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); ++ } ++ else ++ { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); ++ } ++} ++ ++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) ++{ ++ const unsigned int y_offset = frame->linesize[0] * start_line; ++ const unsigned int y_size = frame->linesize[0] * n; ++ // Round UV up/down to get everything ++ const unsigned int uv_rnd = (1U << uv_shift) >> 1; ++ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset; ++ ++ // As all unsigned they will also reject -ve ++ // Test individually as well as added to reject overflow ++ av_assert0(start_line <= (unsigned int)frame->height); ++ av_assert0(n <= (unsigned int)frame->height); ++ av_assert0(start_line + n <= (unsigned int)frame->height); ++ ++ if (!gpu_is_buf1(frame)) ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++ else if (!rpi_sliced_frame(frame)) ++ { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); ++ } ++ } ++ else ++ { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
++{
++  unsigned int i;
++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++    sem_init(&wp->pool[i].sem, 0, 0);
++    wp->pool[i].next = wp->pool + i + 1;
++  }
++  wp->head = wp->pool + 0;
++  wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
++}
++
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
++{
++  unsigned int i;
++  wp->head = NULL;
++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++    sem_destroy(&wp->pool[i].sem);
++    wp->pool[i].next = NULL;
++  }
++}
++
++
++// If sem_init actually takes time then maybe we want a pool...
++static vq_wait_t * vq_wait_new(const unsigned int cost)
++{
++  gpu_env_t * const ge = gpu_lock_ref();
++  vq_wait_t * const wait = ge->wait_pool.head;
++  ge->wait_pool.head = wait->next;
++  ge->current_load += cost;
++  wait->cost = cost;
++  wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  tto_start(&ge->ttw.active, ns_time());
++#endif
++
++  gpu_unlock();
++  return wait;
++}
++
++static void vq_wait_delete(vq_wait_t * const wait)
++{
++  gpu_env_t * const ge = gpu_lock();
++  wait->next = ge->wait_pool.head;
++  ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++    trace_time_wait_t * const ttw = &ge->ttw;
++    const int64_t now = ns_time();
++    ++ttw->jcount;
++    tto_end(&ttw->wait, now);
++
++    if (ttw->start0 == 0)
++    {
++      ttw->start0 = ttw->active.start[0];
++      ttw->last_update = ttw->start0;
++    }
++    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
++    {
++      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++      ttw_print(ttw, now);
++    }
++  }
++#endif
++  gpu_unlock_unref(ge);
++}
++
++static void vq_wait_wait(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++    const int64_t now = ns_time();
++    gpu_env_t * const ge = gpu_lock();
++    tto_start(&ge->ttw.wait, now);
++    gpu_unlock();
++  }
++#endif
++
++  while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++    /* loop */;
++}
++
++static void vq_wait_post(vq_wait_t * const wait)
++{
++#if !RPI_TRACE_TIME_VPU_QPU_WAIT
++  if (wait->cost != 0)
++#endif
++  {
++    gpu_env_t *const ge = gpu_lock();
++    ge->current_load -= wait->cost;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    tto_end(&ge->ttw.active, ns_time());
++#endif
 +    gpu_unlock();
 +  }
++
++  sem_post(&wait->sem);
 +}
 +
++
++
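The vq_wait_* helpers above are static to this file, but their intended lifecycle is worth spelling out (comment sketch only, describing the code above, not new API):

// vq_wait_t lifecycle, as implemented above:
//   wait = vq_wait_new(cost);   // take a pooled sem, add cost to current_load
//   ...GPU job completes; its callback runs vq_wait_post(wait)...
//   vq_wait_wait(wait);         // sem_wait, retried on EINTR
//   vq_wait_delete(wait);       // return the sem to the pool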
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU 1
++#define VPU_QPU_MASK_VPU 2
++
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
++{
++  unsigned int n;
++  unsigned int mask;
++  unsigned int cost;
++  struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
++{
++  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++  return vqj;
 +}
 +
 +void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
 +{
++  memset(vqj, 0, sizeof(*vqj));
++  free(vqj);
++}
++
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{
++  struct gpu_job_s * const j = vqj->j + vqj->n++;
++  av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
++  return j;
++}
++
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++  if (vpu_code != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_VPU;
++
++    j->command = EXECUTE_VPU;
++    j->u.v.q[0] = vpu_code;
++    j->u.v.q[1] = r0;
++    j->u.v.q[2] = r1;
++    j->u.v.q[3] = r2;
++    j->u.v.q[4] = r3;
++    j->u.v.q[5] = r4;
++    j->u.v.q[6] = r5;
 +  }
 +}
 +
++// flags are QPU_FLAGS_xxx
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail)
 +{
++  if (n != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_QPU;
++    vqj->cost += cost;
 +
++    j->command = EXECUTE_QPU;
++    j->u.q.jobs = n;
++#if RPI_TRACE_QPU_PROFILE_ALL
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
++#else
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
++#endif
++    j->u.q.timeout = 5000;
++    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  }
 +}
 +
++// Convert callback to sem post
++static void vpu_qpu_job_callback_wait(void * v)
 +{
++  vq_wait_post(v);
 +}
 +
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
 +{
++  vq_wait_t * wait;
 +
++  if (vqj->mask == 0) {
++    *wait_h = NULL;
++    return;
++  }
 +
++  // We are going to want a sync object
++  wait = 
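Putting the pieces together, a caller would build and launch a combined VPU + QPU job roughly as below. Illustrative sketch only, not part of the patch; example_run_job and its error handling are made up, but every vpu_qpu_* call is the API defined here (mail must hold n * QPU_MAIL_EL_VALS words):

static int example_run_job(const uint32_t vpu_code, const uint32_t * const mail, const unsigned int n)
{
  vpu_qpu_wait_h sync;
  vpu_qpu_job_env_t * const vqj = vpu_qpu_job_new();
  if (vqj == NULL)
    return -1;
  vpu_qpu_job_add_vpu(vqj, vpu_code, 0, 0, 0, 0, 0, 0); // r0-r5 as the VPU code expects
  vpu_qpu_job_add_qpu(vqj, n, 0, mail);                 // cost 0: skip load tracking
  vpu_qpu_job_add_sync_this(vqj, &sync);                // attach a completion callback
  if (vpu_qpu_job_finish(vqj) != 0)                     // start + delete in one call
    return -1;
  vpu_qpu_wait(&sync);                                  // block; nulls sync after use
  return 0;
}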
vq_wait_new(vqj->cost);
 +
++  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++  // If we only posted one thing or only QPU jobs
++  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++  {
++    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
++    av_assert0(j->callback.func == 0);
 +
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
++  else
++  {
++    struct gpu_job_s *const j = new_job(vqj);
 +
++    j->command = EXECUTE_SYNC;
++    j->u.s.mask = vqj->mask;
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
 +
++  vqj->cost = 0;
++  vqj->mask = 0;
++  *wait_h = wait;
++}
 +
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
++{
++  return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
 +
++// Simple wrapper of start + delete
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++  int rv;
++  rv = vpu_qpu_job_start(vqj);
++  vpu_qpu_job_delete(vqj);
++  return rv;
++}
 +
++unsigned int vpu_qpu_current_load(void)
++{
++  return gpu_ptr()->current_load;
++}
 +
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++  if (wait_h != NULL)
++  {
++    vq_wait_t * const wait = *wait_h;
++    if (wait != NULL) {
++      *wait_h = NULL;
++      vq_wait_wait(wait);
++      vq_wait_delete(wait);
++    }
++  }
++}
 +
++int vpu_qpu_init()
++{
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
 +
++  if (ge->init_count++ == 0)
++  {
++    vc_gpuserv_init();
++  }
 +
++  gpu_unlock();
++  return 0;
++}
 +
++void vpu_qpu_term()
++{
++  gpu_env_t * const ge = gpu_lock();
 +
++  if (--ge->init_count == 0) {
++    vc_gpuserv_deinit();
 +
-+    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
-+
-+    sem_wait(&sync0);
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    ttw_print(&ge->ttw, ns_time());
 +#endif
 +  }
-+ 
else { -+ while ((int32_t)(post_done - (uint32_t)id) < 0) { -+ usleep(1000); -+ } -+ } ++ ++ gpu_unlock_unref(ge); +} + -+ -+unsigned int qpu_get_fn(int num) { -+ // Make sure that the gpu is initialized -+ unsigned int *fn; -+ if (gpu==NULL) { -+ printf("Preparing gpu\n"); -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ switch(num) { -+ case QPU_MC_SETUP: -+ fn = mc_setup; -+ break; -+ case QPU_MC_FILTER: -+ fn = mc_filter; -+ break; -+ case QPU_MC_EXIT: -+ fn = mc_exit; -+ break; -+ case QPU_MC_INTERRUPT_EXIT12: -+ fn = mc_interrupt_exit12; -+ break; -+ case QPU_MC_FILTER_B: -+ fn = mc_filter_b; -+ break; -+ //case QPU_MC_FILTER_HONLY: -+ // fn = mc_filter_honly; -+ // break; -+ case QPU_MC_SETUP_UV: -+ fn = mc_setup_uv; -+ break; -+ case QPU_MC_FILTER_UV: -+ fn = mc_filter_uv; -+ break; -+ case QPU_MC_FILTER_UV_B0: -+ fn = mc_filter_uv_b0; -+ break; -+ case QPU_MC_FILTER_UV_B: -+ fn = mc_filter_uv_b; -+ break; -+ case QPU_MC_INTERRUPT_EXIT8: -+ fn = mc_interrupt_exit8; -+ break; -+ case QPU_MC_END: -+ fn = mc_end; -+ break; -+ default: -+ printf("Unknown function\n"); -+ exit(-1); -+ } -+ return gpu->vc + 4*(int)(fn-rpi_shader); -+ //return code[num] + gpu->vc; -+} -+ -+#if 0 -+typedef unsigned int uint32_t; -+ -+typedef struct mvs_s { -+ GPU_MEM_PTR_T unif_mvs_ptr; -+ uint32_t *unif_mvs; // Base of memory for motion vector commands -+ -+ // _base pointers are to the start of the row -+ uint32_t *mvs_base[8]; -+ // these pointers are to the next free space -+ uint32_t *u_mvs[8]; -+ -+} HEVCContext; -+ -+#define RPI_CHROMA_COMMAND_WORDS 12 -+ -+static void rpi_inter_clear(HEVCContext *s) ++uint32_t qpu_fn(const int * const mc_fn) +{ -+ int i; -+ for(i=0;i<8;i++) { -+ s->u_mvs[i] = s->mvs_base[i]; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 128; // w -+ *s->u_mvs[i]++ = 128; // h -+ *s->u_mvs[i]++ = 128; // stride u -+ *s->u_mvs[i]++ = 128; // stride v -+ s->u_mvs[i] += 3; // Padding words -+ } ++ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); +} + -+static void rpi_execute_inter_qpu(HEVCContext *s) -+{ -+ int k; -+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc; -+ -+ for(k=0;k<8;k++) { -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // dummy location for V -+ } -+ -+ s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore -+ -+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)) -+ ); -+} -+ -+void rpi_test_qpu(void) -+{ -+ HEVCContext mvs; -+ HEVCContext *s = &mvs; 
-+ int i; -+ int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS; -+ uint32_t *p; -+ printf("Allocate memory\n"); -+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr ); -+ s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->unif_mvs; -+ for(i = 0; i < 8; i++) { -+ s->mvs_base[i] = p; -+ p += uv_commands_per_qpu; -+ } -+ // Now run a simple program that should just quit immediately after a single texture fetch -+ rpi_inter_clear(s); -+ for(i=0;i<4;i++) { -+ printf("Launch QPUs\n"); -+ rpi_execute_inter_qpu(s); -+ printf("Done\n"); -+ } -+ printf("Free memory\n"); -+ gpu_free(&s->unif_mvs_ptr); -+ return; -+} -+#endif -+ -+#if 0 -+ -+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4}; -+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1}; -+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4}; -+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1}; -+ -+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24); -+ -+static uint8_t av_clip_uint8(int32_t a) -+{ -+ if (a&(~255)) return (-a)>>31; -+ else return a; -+} -+ -+static int32_t filter8(const uint8_t *data, int pitch) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += hcoeffs[x]*data[x + y * pitch]; -+ -+ vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning -+ } -+ -+ return av_clip_uint8( (vsum + 64) >> 7); -+} -+ -+// Note regression changes coefficients so is not thread safe -+//#define REGRESSION -+#ifdef REGRESSION -+#define CMAX 100 -+#else -+#define CMAX 2 -+#endif -+#define YMAX 16 -+ -+int rpi_test_shader(void) -+{ -+ int i, c; -+ -+ uint32_t *unifs; -+ -+ uint8_t *in_buffer; -+ uint8_t *out_buffer[2]; -+ -+ GPU_MEM_PTR_T unifs_ptr; -+ GPU_MEM_PTR_T in_buffer_ptr; -+ GPU_MEM_PTR_T out_buffer_ptr[2]; -+ -+ // Addresses in GPU memory of filter programs -+ uint32_t mc_setup = 0; -+ uint32_t mc_filter = 0; -+ uint32_t mc_exit = 0; -+ -+ int pitch = 0x500; -+ -+ if (gpu==NULL) { -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ -+ printf("This needs to change to reflect new assembler\n"); -+ // Use table to compute locations of program start points -+ mc_setup = code[0] + gpu->vc; -+ mc_filter = code[1] + gpu->vc; -+ mc_exit = code[2] + gpu->vc; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) { -+ return -2; -+ } -+ unifs = (uint32_t*)unifs_ptr.arm; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) { -+ return -3; -+ } -+ in_buffer = (uint8_t*)in_buffer_ptr.arm; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) { -+ return -4; -+ } -+ out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm; -+ out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm; -+ -+ for (c = 0; c < CMAX; c++) { -+ int xo[] = {rand()&31, rand()&31}; -+ -+#ifdef REGRESSION -+ for (i = 0; i < 8; i++) { -+ hcoeffs[i] = (int8_t)rand(); -+ vcoeffs[i] = (int8_t)rand(); -+ if (hcoeffs[i]==-128) -+ hcoeffs[i]++; -+ if (vcoeffs[i]==-128) -+ vcoeffs[i]++; -+ } -+#endif -+ -+ for (i = 0; i < 64*23; i++) { -+ //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]); -+ in_buffer[i] = rand(); -+ } -+ -+ // Clear output array -+ { -+ int b; -+ for(b=0;b<2;b++) { -+ for(i=0;i<16*16;i++) { -+ out_buffer[b][i] = 3; -+ } -+ } -+ } -+ -+ unifs[0] = mc_filter; -+ unifs[1] = 
in_buffer_ptr.vc+xo[0]+16; -+ unifs[2] = 64; // src pitch -+ unifs[3] = pitch; // dst pitch -+ unifs[4] = 0; // Padding -+ unifs[5] = 0; -+ unifs[6] = 0; -+ unifs[7 ] = mc_filter; -+ unifs[8 ] = in_buffer_ptr.vc+xo[1]+16; -+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[13] = out_buffer_ptr[0].vc; -+ unifs[14] = mc_exit; -+ unifs[15] = in_buffer_ptr.vc+xo[1]+16; // dummy -+ unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[20] = out_buffer_ptr[1].vc; -+ -+ printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc); -+ -+ // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem -+ -+ //qpu_run_shader(mc_setup, unifs_ptr.vc); -+ //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc); -+ rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]); -+ rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]); -+ -+ if (1) -+ { -+ int x, y, b; -+ int bad = 0; -+ -+ for (b=0; b<2; ++b) -+ for (y=0; yvc; -+ mc_filter = code[1] + gpu->vc; -+ mc_exit = code[2] + gpu->vc; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) { -+ return; -+ } -+ //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr); -+ //out_buffer = (uint8_t*)out_buffer_ptr.arm; -+ -+ /*for (y=0; y<16; ++y) { -+ for (x=0; x<16; ++x) { -+ out_buffer[x+y*dst_pitch] = 7; -+ } -+ }*/ -+ -+ unifs = (uint32_t*)unifs_ptr.arm; -+ -+ unifs[0] = mc_filter; -+ unifs[1] = (int)in_buffer_vc; -+ unifs[2] = src_pitch; // src pitch -+ unifs[3] = dst_pitch; // dst pitch -+ unifs[4] = 0; // Padding -+ unifs[5] = 0; -+ unifs[6] = 0; -+ unifs[7 ] = mc_exit; -+ unifs[8 ] = (int)in_buffer_vc; -+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[13] = (int)dst_vc; -+ //unifs[13] = (int)out_buffer_ptr.vc; -+ -+ //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc); -+ -+ qpu_run_shader(mc_setup, unifs_ptr.vc); -+ -+ /*for (y=0; y<16; ++y) { -+ for (x=0; x<16; ++x) { -+ dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch]; -+ } -+ }*/ -+ -+ gpu_free(&unifs_ptr); -+ //gpu_free(&out_buffer_ptr); -+} -+ -+ -+ -+#endif -+ +#endif // RPI diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..c6cdb2b +index 0000000..a95f7d9 --- /dev/null +++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,176 @@ +@@ -0,0 +1,200 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + -+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code -+// *** N.B. 
Code has rotted & crashes if this is unset (before this set of changes)
-+#define RPI_FAST_CACHEFLUSH
-+
 +#define RPI_ONE_BUF 1
 +
 +typedef struct gpu_mem_ptr_s {
@@ -12582,9 +15559,7 @@ index 0000000..c6cdb2b
 +// General GPU functions
 +extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
 +extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T *p);
-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
 +
 +#include "libavutil/frame.h"
 +#if !RPI_ONE_BUF
@@ -12627,29 +15602,31 @@ index 0000000..c6cdb2b
 +  return av_buffer_get_opaque(frame->buf[0]);
 +}
 +
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
 +{
 +  return av_buffer_pool_opaque(frame->buf[n]);
 +}
 +
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{
++  const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
++  return gm->vc + (frame->data[n] - gm->arm);
++}
++
 +
 +static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+  return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
++  return get_vc_address3(frame, 0);
 +}
 +
 +static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+  return gpu_is_buf1(frame) ?
-+    gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
-+    gpu_buf3_gmem(frame, 1)->vc;
++  return get_vc_address3(frame, 1);
 +}
 +
 +static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+  return gpu_is_buf1(frame) ?
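When a uniform stream for the QPU is being built, these helpers yield the per-plane bus addresses whichever buffer layout the frame uses. Illustrative sketch only, not part of the patch:

static void example_plane_addrs(const AVFrame * const frame, uint32_t vc[3])
{
  vc[0] = get_vc_address_y(frame);  // luma plane bus address
  vc[1] = get_vc_address_u(frame);  // chroma U
  vc[2] = get_vc_address_v(frame);  // chroma V
}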
-+    gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
-+    gpu_buf3_gmem(frame, 2)->vc;
++  return get_vc_address3(frame, 2);
 +}
 +
++#if 0
 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
 +  if (gpu_is_buf1(frame))
 +  {
 +    GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
 +    g.numbytes = frame->data[1] - frame->data[0];
 +    return g;
 +  }
 +  else
 +    return *gpu_buf3_gmem(frame, 0);
 +}
 +
 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
 +  if (gpu_is_buf1(frame))
 +  {
 +    GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
 +    g.arm += frame->data[1] - frame->data[0];
 +    g.vc += frame->data[1] - frame->data[0];
 +    g.numbytes = frame->data[2] - frame->data[1];  // We assume u & v are the same size
 +    return g;
 +  }
 +  else
 +    return *gpu_buf3_gmem(frame, 1);
 +}
 +
 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
 +  if (gpu_is_buf1(frame))
 +  {
 +    GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
 +    g.arm += frame->data[2] - frame->data[0];
 +    g.vc += frame->data[2] - frame->data[0];
 +    g.numbytes = frame->data[2] - frame->data[1];  // We assume u & v are the same size
 +    return g;
 +  }
 +  else
 +    return *gpu_buf3_gmem(frame, 2);
 +}
-+
 +#endif
++#endif
++
++// Cache flush stuff
++
++struct rpi_cache_flush_env_s;
++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
++
++typedef enum
++{
++  RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
++  RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
++  RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
++} rpi_cache_flush_mode_t;
++
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++  const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma);
++
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
 +
 +
 +// QPU specific functions
-+extern void rpi_test_qpu(void);
++uint32_t qpu_fn(const int * const mc_fn);
 +
-+enum {
-+  QPU_MC_SETUP,
-+  QPU_MC_FILTER,
-+  QPU_MC_EXIT,
-+  QPU_MC_INTERRUPT_EXIT12,
-+  QPU_MC_FILTER_B,
-+  QPU_MC_FILTER_HONLY,
-+  QPU_MC_SETUP_UV,
-+  QPU_MC_FILTER_UV,
-+  QPU_MC_FILTER_UV_B0,
-+  QPU_MC_FILTER_UV_B,
-+  QPU_MC_INTERRUPT_EXIT8,
-+  QPU_MC_END
-+  };
-+extern unsigned int qpu_get_fn(int num);
-+
-+#define QPU_N_UV   8
-+#define QPU_N_Y    12
-+#define QPU_N_MAX  16
++#define QPU_N_GRP_UV 4
++#define QPU_N_UV     8
++#define QPU_N_GRP_Y  4  // 4 QPUs per TMU
++#define QPU_N_Y      12
 +
 +#define QPU_MAIL_EL_VALS  2
-+#define QPU_MAIL_EL_SIZE  (QPU_MAIL_EL_VALS * sizeof(uint32_t))
-+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
-+#define QPU_MAIL_SIZE     (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
++
++struct vpu_qpu_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
 +
 +// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++vpu_qpu_job_h vpu_qpu_job_new(void);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++
 +extern unsigned int vpu_get_fn(void);
 +extern unsigned int vpu_get_constants(void);
-+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+  int qpu0_n, const uint32_t * qpu0_mail,
-+  int qpu1_n, const uint32_t * qpu1_mail);
 +
-+extern void vpu_wait( int id);
++// Waits for the previously posted job to complete and will null out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++unsigned int vpu_qpu_current_load(void);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
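vpu_qpu_init()/vpu_qpu_term() are refcounted, so a user pairs them around the lifetime of a decode instance, and the load counter can be checked before queueing more work. Illustrative sketch only, not part of the patch; EXAMPLE_LOAD_LIMIT and prev_sync are made-up names:

#define EXAMPLE_LOAD_LIMIT 64          // hypothetical threshold; cost units are caller-defined

static vpu_qpu_wait_h prev_sync;       // hypothetical per-decoder state

static int example_open(void)  { return vpu_qpu_init(); }  // first ref does vc_gpuserv_init()

static void example_throttle(void)
{
  if (vpu_qpu_current_load() > EXAMPLE_LOAD_LIMIT)
    vpu_qpu_wait(&prev_sync);          // drain the oldest outstanding job first
}

static void example_close(void) { vpu_qpu_term(); }        // last ref does vc_gpuserv_deinit()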
 +
 +// Simple test of shader code
 +extern int rpi_test_shader(void);
 +
 +extern void rpi_do_block(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch, unsigned char *pred);
 +extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
 +
 +extern int gpu_get_mailbox(void);
++void gpu_ref(void);
++void gpu_unref(void);
 +
 +#endif
diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
new file mode 100644
-index 0000000..06fb166
+index 0000000..0898ecd
--- /dev/null
+++ b/libavcodec/rpi_shader.c
-@@ -0,0 +1,629 @@
+@@ -0,0 +1,670 @@
 +#include "rpi_shader.h"
 +
 +#ifdef _MSC_VER
 +   #define MSVC_ALIGN
 +#endif
 +#ifdef __GNUC__
 +__attribute__((aligned(8)))
 +#endif
 +unsigned int rpi_shader[] = {
-+// ::mc_setup_uv
-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
-+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
-+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3 -+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0 -+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2 -+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1 -+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9 -+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif -+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif -+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1 -+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15 -+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2 -+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2 -+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2 -+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3 -+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1 -+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1 -+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1 -+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1 -+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x -+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base ++// ::mc_setup_c ++/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1 ; mov -, unif ++/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1 ++/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 ++/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 ++/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 ++/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 ++/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 ++/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num ; mov ra14, 0 ++/* 
[0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0 ++/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b ++/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1 ++/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4 ++/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 ++/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 ++/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch ++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_y ++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 ++/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y ++/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 ++/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 ++/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y ++/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 ++/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif ++/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif ++/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5 ++/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00000160] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a ++/* [0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b ++/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num ++/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0 ; mov -, unif ++/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif ++/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1 ; mov -, unif ++/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4 ++/* [0x000001e0] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 ++/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 ++/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch ++/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_y2 ++/* [0x00000210] */ 0x0c667c00, 0x10020667, // add 
ra_base2, ra_base2, r0 ++/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 ++/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y ++/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif ++/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0 ; mov -, unif ++/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y ; mov -, unif ++/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0 +// ::mc_filter_uv -+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif -+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb28 -+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif -+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c -+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 ++/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif ; mov vw_setup, rb28 ++/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num ++/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 ++/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next ++/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 ++/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif ++/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a ++/* 
[0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 ++/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a ++/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif ++/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1 ; mov ra1, unif ++/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3 ; mov.ifnz ra1, unif ++/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a ++/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0, r0, 15 ; mov rb9, ra3.8b ++/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27 ; mov r1, ra1.16b ++/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13 ; mov rb10, ra3.8c ++/* [0x00000308] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d ++/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 ++/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 +// :uvloop -+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, 
r1, r0 ; mov -, vw_wait -+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16 -+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0 -+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 ++/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y ++/* [0x00000338] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next ++/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 ++/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 ++/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 ++/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, 
// sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 ++/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 ++/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13 ++/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 ++/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13 ++/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1 ++/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif +// ::mc_filter_uv_b0 -+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif -+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb21 -+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0, r0, i_shift16 ; mov ra3, unif -+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a -+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b -+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c -+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d -+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov rb14, unif -+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0 ++/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif ; mov vw_setup, rb28 ++/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num ++/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 ++/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next ++/* [0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 ++/* [0x000004a0] */ 0x9481e1f6, 
0xd0025800, // and r0, r0, -2 ; mov ra0, unif ++/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a ++/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 ++/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a ++/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif ++/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1 ++/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3 ; mov rb8, ra3.8a ++/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, 15 ; mov rb10, ra3.8c ++/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 ++/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d ++/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif ++/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif +// :uvloop_b0 -+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18 -+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // 
brr.anyn -, r:uvloop_b0 -+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6 -+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_filter_uv_b -+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 -+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0 ; mov ra_y_next, unif -+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 -+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21 ; mov ra3, unif -+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8 -+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21 -+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c -+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 ++/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 ++/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y ++/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next ++/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ 
"mul_used", 0 ++/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0 ++/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 ++/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 ++/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 ++/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 ++/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 ++/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 ++/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0 ++/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 ++/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 ++/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 ++/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 ++/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin ++/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif ++/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16 ; mov ra_link, unif ++/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 ++/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 ++/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 ++/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 ++/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 ++/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 ++/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 ++/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 ++/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 ++/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin ++/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 ++/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 ++/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 ++// :uv_b0_post12 ++/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 ++/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 ++/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 ++/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 ++/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 ++/* 
[0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 ++// :uv_b0_post_fin ++/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num ++/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 ++/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif ++/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4 ++/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif ++/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0 ; mov ra_y2_next, ra2.16a ++/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif ++/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a ++/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif ; mov rb9, ra3.8b ++/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif ; mov rb10, ra3.8c ++/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop ; mov rb11, ra3.8d ++/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 ++/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 +// :uvloop_b -+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b 
-+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0 ; mul24 r0, vpm, ra4 -+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a -+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop ; mul24 r0, r0, rb14 -+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16 -+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0 -+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_exit -+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0) -+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop ; nop -+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_interrupt_exit8 -+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 ++/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next ++/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y2 ++/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, 
ra0.8a, r0 ++/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 ++/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 ++/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 ++/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 ++/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 ++/* [0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 ++/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a ++/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 ++/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 ++/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 ++/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 ++/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 ++/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 ++/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13 ++/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 ++/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3 ++/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++// ::mc_interrupt_exit8c ++/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov -, vw_wait ; nop ; ldtmu0 ++/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000908] */ 
0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_setup -+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16 -+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif -+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif -+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif -+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1 -+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1 -+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif -+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0 -+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8 -+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 -+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1 -+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0 -+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2 -+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10 -+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 -+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 -+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0 -+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2 -+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000aa8] */ 0x00000040, 
0xe00207a7, // mov ra30, 64 -+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00 -+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24 -+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0 -+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0 -+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0 -+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0 -+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0 -+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0 -+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0 -+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0 -+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2 -+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2 -+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2 -+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3 -+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1 -+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1 -+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 -+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base -+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 -+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 -+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2 -+// :per_block_setup -+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num -+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next -+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next -+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif -+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8 ; mov ra_y_next, ra1.16b -+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0 -+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov 
r2, unif -+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3 ; mov ra_y2_next, ra1.16b -+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0 -+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5 -+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7 -+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif -+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16 -+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3 -+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400 -+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d -+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c -+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d -+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c -+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c -+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d ; mov r0, unif -+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c ; mov r1, rb13 -+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1 ; mov rb4, ra3.8a -+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3 ; mov rb5, ra3.8b -+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3 ; mov rb6, ra3.8c -+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d -+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9 -+// ::mc_filter -+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15 -+// :yloop -+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz 
ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 -+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 -+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 -+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00000f00] */ 0x4d286237, 
0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_filter_b -+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16 -+// :yloopb -+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 -+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 -+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+/* [0x00001050] */ 0x40073031, 0xda00c9e3, 
// nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 -+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 -+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 -+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8 -+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop ; nop ++// ::mc_exit ++// ::mc_exit_c ++/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0) ++/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop ; nop ++/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop ; nop +// ::mc_interrupt_exit12 -+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001170] */ 0x009e7000, 
0xa00009e7, // ldtmu0 -+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop ; nop ++/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop ; nop +// ::mc_exit1 -+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop ; nop ++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop ; nop ++// ::mc_setup ++/* [0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif ++/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif ++/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov 
rb_xpitch, unif ++/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or rb24, r1, rb_pitch ++/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 ++/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 ++/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b ++/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1 ++/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 ++/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1 ++/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 ++/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b ++/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 ++/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 ++/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1 ++/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 ++/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 ++/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 ++/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000be0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000bf0] */ 0x0c9e7040, 
0x10021727, // add r_vpm, r0, r1 ++/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 ++/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0 ++/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1 ++/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch ++/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base ++/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 ++/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 ++/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2 ++// :per_block_setup ++/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif ; mov r3, elem_num ++/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next ++/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next ++/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 ++/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 ++/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0 ++/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b ++/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 ++/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0 ++/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b ++/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif ++/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 ++/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width ++/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5 ; mov r0, ra_height ++/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16 ++/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1 ++/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7 ++/* 
[0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 ++/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width ++/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 ++/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif ++/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif ++/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400 ++/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a ++/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d ++/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c ++/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d ++/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c ++/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d ++/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c ++/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d ++/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c ++/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d ++/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c ++/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif ++/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b ++/* [0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c ++/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 ++/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif ++/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 ++/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 ++/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d ++// ::mc_filter ++/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 ++// :yloop ++/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 ++/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 ++/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* 
[0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 ++/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 ++/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001010] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait ++/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // 
sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 ++/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 ++/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 ++/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 ++/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 ++/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16 ++/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 ++/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0 ++/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 ++/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup ++/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest ++/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1 ++/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 ++/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 ++/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 ++/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 ++/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 ++/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop ++/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_filter_b ++// :yloopb ++/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 ++/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 ++/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00001188] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000011a8] */ 
0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 ++/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 ++/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 ++/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 ++/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 ++/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait ++/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 ++/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 ++/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 ++/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16 ++/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 ++/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0 ++/* [0x000012f0] */ 
0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 ++/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup ++/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest ++/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1 ++/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 ++/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 ++/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 ++/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 ++/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 ++/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb ++/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop +// ::mc_end +}; +#ifdef __HIGHC__ @@ -13375,7 +16421,7 @@ index 0000000..06fb166 +#endif diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..9772796 +index 0000000..d17b9fd --- /dev/null +++ b/libavcodec/rpi_shader.h @@ -0,0 +1,19 @@ @@ -13384,26 +16430,33 @@ index 0000000..9772796 + +extern unsigned int rpi_shader[]; + -+#define mc_setup_uv (rpi_shader + 0) -+#define mc_filter_uv (rpi_shader + 132) -+#define mc_filter_uv_b0 (rpi_shader + 274) -+#define mc_filter_uv_b (rpi_shader + 392) -+#define mc_exit (rpi_shader + 540) -+#define mc_interrupt_exit8 (rpi_shader + 558) -+#define mc_setup (rpi_shader + 588) -+#define mc_filter (rpi_shader + 872) -+#define mc_filter_b (rpi_shader + 992) -+#define mc_interrupt_exit12 (rpi_shader + 1114) -+#define mc_exit1 (rpi_shader + 1152) -+#define mc_end (rpi_shader + 1168) ++#define mc_setup_c (rpi_shader + 0) ++#define mc_filter_uv (rpi_shader + 152) ++#define mc_filter_uv_b0 (rpi_shader + 280) ++#define mc_interrupt_exit8c (rpi_shader + 554) ++#define mc_exit (rpi_shader + 582) ++#define mc_exit_c (rpi_shader + 582) ++#define mc_interrupt_exit12 (rpi_shader + 598) ++#define mc_exit1 (rpi_shader + 634) ++#define mc_setup (rpi_shader + 650) ++#define mc_filter (rpi_shader + 942) ++#define mc_filter_b (rpi_shader + 1094) ++#define mc_end (rpi_shader + 1246) + +#endif diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm new file mode 100644 -index 0000000..aa9e1e7 +index 0000000..aa3fe47 --- /dev/null +++ b/libavcodec/rpi_shader.qasm -@@ -0,0 +1,1098 @@ +@@ -0,0 +1,1259 @@ ++ ++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress ++# the warning that we are using rotation & ra/rb registers. r0..3 can be ++# rotated through all 16 elems ra regs can only be rotated through their ++# local 4. As it happens this is what is wanted here as we do not want the ++# constants from the other half of the calc. 
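++# An illustrative line of the same shape as the annotated ones in the
++# filter loops below:
++#
++#   nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++#
++# Without the annotation the assembler would warn about the quad-local
++# rotation of ra0; here that behaviour is exactly what is wanted.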
++
+# register allocation
+#
+# ra0...ra7 eight horizontal filter coefficients
+#
+# rb0 rx_shift2
+# rb1 rb_y2_next
+#
+# rb4...rb7
+#
+# ra8...ra11 horizontal filter coefficient copies
+#
+# ra12...ra15 shifted versions of the current unfiltered row
+#
+# rb8...rb11 eight vertical filter coefficients
+
-+# ra4 y: Fiter, UV: 0x10000
++# ra4 y: Filter, UV: part of b0 -> b stash
+
+# rb12 offset to add before shift (round + weighting offsets)
+# rb13 shift: denom + 6 + 9
+# rb14 L0 weight (U on left, V on right)
+# rb15 -- free --
+#
-+# ra16 clipped(row start address+elem_num)&~3
-+# ra17 per-channel shifts
++# ra16 width:height
++# ra17 ra_y:ra_xshift
+# ra18 L1 weight (Y)
-+# ra19 next ra17
++# ra19 ra_y_next:ra_xshift_next
+#
+# rb16 pitch
+# rb17 height + 1
-+# rb18 height + 3
-+# rb19 next ra16
++# rb18 max(height,16) + 3
++# rb19 frame_base2_next
+#
+# ra20 1
-+# ra21 ra_21
++# ra21 ra_y2_next:ra_y2 (luma); free (chroma)
+# ra22 ra_k256 256
-+# ra23 ra_y2_next ra_y2_next
++# ra23 0
+#
-+# rb20 0xffffff00
-+# rb21 vpm_setup for reading/writing 16bit results into VPM
++# rb20 -- free --
++# rb21 -- free --
+# rb22 rb_k255 255
-+# rb23 24
++# rb23 dest (Y)
+#
+# rb24 vdw_setup_1(dst_pitch)
+# rb25 frame width-1
@@ -13456,146 +16509,233 @@ index 0000000..aa9e1e7
+# rb30 frame height-1
+# rb31 used as temp to count loop iterations
+#
-+# ra24 clipped(row start address+8+elem_num)&~3
-+# ra25 per-channel shifts 2
++# ra24 src frame base
++# ra25 src frame base 2
+# ra26 next ra24
+# ra27 next ra25
-+# ra28 next y
-+# ra29 y for next texture access
-+# ra30 64
++# ra28 -- free --
++# ra29 -- free --
+#
-+# ra31 next kernel address
++# Use an even numbered register as a link register to avoid corrupting flags
++# ra30 next kernel address
++# ra31 chroma-B height+3; free otherwise
+
-+.set rb_frame_width_minus_1, rb25
-+.set rb_frame_height_minus_1, rb30
++.set rb_max_x, rb25
++.set rb_max_y, rb30
+.set rb_pitch, rb16
-+.set ra_x, ra16
++.set ra_width_height, ra16
++.set ra_width, ra16.16b
++.set ra_height, ra16.16a
+.set ra_y2, ra21.16a
+.set ra_y2_next, ra21.16b
+
-+.set rb_x_next, rb19
-+.set rx_frame_base2_next, rb19
++.set rb_base2_next, rb19
+
-+.set ra_frame_base, ra24
-+.set ra_frame_base_next, ra26
-+.set ra_xshift, ra17
++.set rb_dest, rb23
++.set ra_base, ra24
++.set ra_base_next, ra26
++.set ra_xshift, ra17.16a
+
-+.set ra_u2v_ref_offset, ra25
-+.set ra_frame_base2, ra25
++.set ra_base2, ra25
+
-+.set ra_xshift_next, ra19
-+.set rx_xshift2, rb0
-+.set rx_xshift2_next, rb1
++# Note ra_xy & ra_xy_next should have same structure!
++.set ra_xshift_next, ra19.16a
++.set rb_xshift2, rb0
++.set rb_xshift2_next, rb1
+
-+.set ra_u2v_dst_offset, ra27
-+
-+.set ra_y_next, ra28
-+.set ra_y, ra29
++.set ra_y_next, ra19.16b
++.set ra_y, ra17.16b
+
+.set ra_k1, ra20
++.set rb_xpitch, rb20
+.set rb_k255, rb22
+.set ra_k256, ra22
++.set ra_k0, ra23
++
++.set ra_link, ra30
+
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
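++# (so, for example, "shl r0, r0, i_shift16" shifts left by 16 while the
++# encoded immediate, -16, still fits the QPU's small-immediate range)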
+.set i_shift16, -16 +.set i_shift21, -11 ++.set i_shift23, -9 ++.set i_shift30, -2 ++ ++# Much of the setup code is common between Y & C ++# Macros that express this - obviously these can't be overlapped ++# so are probably unsuitable for loop code ++ ++.macro m_calc_dma_regs, r_vpm, r_dma ++ mov r2, qpu_num ++ asr r1, r2, 2 ++ shl r1, r1, 6 ++ and r0, r2, 3 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit ++ add r_vpm, r0, r1 # VPM 8bit storage ++ ++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later ++ shl r0, r0, 5 ++ add r_dma, r0, r1 # DMA out ++.endm ++ ++# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16 ++.macro m_calc_dma_regs_c, r_vpm, r_dma ++ mov r2, qpu_num ++ asr r1, r2, 1 ++ shl r1, r1, 5 ++ and r0, r2, 1 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit ++ add r_vpm, r0, r1 # VPM 8bit storage ++ ++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into ++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) ++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later ++ shl r0, r0, 6 ++ add r_dma, r0, r1 # DMA out ++.endm ++ + +################################################################################ -+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) -+::mc_setup_uv -+ -+# Read starting kernel -+mov ra31, unif ++# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) ++::mc_setup_c ++ mov tmurs, 1 ; mov -, unif # No swap TMUs ; Next fn (ignored) + +# Load first request location -+add ra_x, unif, elem_num # Store x -+mov ra_y, unif # Store y -+mov ra_frame_base, unif # Store frame u base -+nop -+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame ++ mov ra0, unif # next_x_y ++ ++ mov ra_base, unif # Store frame c base + +# Read image dimensions -+sub rb25,unif,1 -+sub rb30,unif,1 -+ -+# get source pitch -+mov rb16, unif -+ -+# get destination pitch -+mov r0, unif -+mov r1, vdw_setup_1(0) -+add rb24, r1, r0 ++ sub rb_max_x, unif, 1 # pic c width ++ sub rb_max_y, unif, 1 # pic c height + +# load constants ++ mov ra_k1, 1 ++ mov ra_k256, 256 ++ mov rb_k255, 255 ++ mov ra_k0, 0 + -+mov ra4, 0x10000 -+mov ra_k1, 1 -+mov ra_k256, 256 -+mov ra30, 64 ++# touch registers to keep simulator happy + -+mov rb20, 0xffffff00 -+mov rb_k255, 255 -+mov rb23, 24 ++ # ra/b4..7: B0 -> B stash registers ++ mov ra4, 0 ; mov rb4, 0 ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 + -+# touch vertical context to keep simulator happy ++ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base + -+mov ra8, 0 -+mov ra9, 0 -+mov ra10, 0 -+mov ra11, 0 -+mov ra12, 0 -+mov ra13, 0 -+mov ra14, 0 -+mov ra15, 0 ++# ; ra12..15: vertical scroll registers ++# get source pitch ++ mov rb_xpitch, unif ; mov ra12, 0 # stride2 ++ mov rb_pitch, unif ; mov ra13, 0 # stride1 ++ mov r0, elem_num ; mov ra14, 0 ++# get destination vdw setup ++ add rb24, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1 + +# Compute base address for first and second access -+mov r0, ra_x # Load x -+max r0, r0, 0; mov r1, ra_y # Load y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base -+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset 
-+add ra_y, r1, 1 -+add r0, r0, r3 -+and r0, r0, ~3 -+max r1, r1, 0 ; mov ra_x, r0 # y -+min r1, r1, rb_frame_height_minus_1 -+# submit texture requests for first line -+add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+add t0s, r0, r1 ; mov ra_frame_base, r2 -+add t1s, r2, r1 ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base + -+mov r2, 9 -+add rb13, r2, unif # denominator -+mov -, unif # Unused ++ add r0, r0, ra0.16b # Add elem no to x to get X for this slice ++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y ++ min r0, r0, rb_max_x ++ ++# Get shift ++ and r1, r0, 1 ++ shl ra_xshift_next, r1, 4 ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++ and r0, r0, -2 ++ add r0, r0, r0 ; v8subs r1, r1, r1 ++ sub r1, r1, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra_y ++ add ra_base, ra_base, r0 ++ ++ max r0, r1, 0 ++ min r0, r0, rb_max_y ++ ++# submit texture requests for first line ++ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++ add t0s, ra_base, r0 ++ ++# submit texture requests for 2nd line ++ ++ max r0, r1, 0 ++ min r0, r0, rb_max_y ++ ++ add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++ add t0s, ra_base, r0 ++ ++ add rb13, 9, unif # denominator ++ mov -, unif # Unused + +# Compute part of VPM to use for DMA output -+mov r2, unif -+shl r2, r2, 1 # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results) -+and r2, r2, 15 -+mov r1, r2 -+asr r1, r1, 2 -+shl r1, r1, 6 -+mov r0, r2 -+and r0, r0, 3 -+add r0, r0, r1 ++ m_calc_dma_regs_c rb28, rb27 + -+mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+add rb28, r0, r1 # VPM 8bit storage -+asr r2, r0, 1 # r0 = bc0000d -+mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit -+add rb21, r2, r1 # VPM for 16bit intermediates -+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+shl r0, r0, 5 -+add rb27, r0, r1 # DMA out ++# ----------------- ++# And again for L1, but only worrying about frame2 stuff + -+# submit texture requests for second line -+max r1, ra_y, 0 -+min r1, r1, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 -+bra -, ra31 -+nop ; mul24 r1, r1, rb_pitch -+add t0s, r1, ra_x -+add t1s, r1, ra_frame_base ++ mov ra_link, unif # Next fn + ++# Load first request location ++ mov ra0, unif # next_x_y ++ ++ mov ra_base2, unif # Store frame c base ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base ++ ++ mov ra_y2, ra0.16a # Store y ++ mov r0, ra0.16b # Load x ++ add r0, r0, elem_num # Add QPU slice ++ max r0, r0, 0 ; mov -, unif # Unused 0 ++ min r0, r0, rb_max_x ; mov -, unif # Unused 1 ++ ++# Get shift ++ and r1, r0, 1 ; mov -, unif # Unused 2 ++ shl rb_xshift2_next, r1, 4 ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++ and r0, r0, -2 ++ add r0, r0, r0 ; v8subs r1, r1, r1 ++ sub r1, r1, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra_y2 ++ add ra_base2, ra_base2, r0 ++ ++ max r0, r1, 0 ++ min r0, r0, rb_max_y ++ ++# submit texture requests for first line ++ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++ add t1s, ra_base2, r0 ; mov -, unif # Unused 3 ++ ++# submit texture requests for 2nd line ++ ++ max r0, r1, 0 ; mov -, unif # Unused 4 ++ ++ bra -, ra_link ++ ++ min r0, r0, rb_max_y ; mov -, unif # Unused 5 ++ 
add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++ add t1s, ra_base2, r0 ++ ++# >>> ra_link ++ ++ ++.macro setf_nz_if_v ++ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++.endm + + +################################################################################ @@ -13605,51 +16745,51 @@ index 0000000..aa9e1e7 +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv -+mov ra31, unif ++ mov ra_link, unif ; mov vw_setup, rb28 # ; x_y + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov r1, unif # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base -+# compute offset from frame base u to frame base v -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs -+mov ra_y_next, r1 ; mov vw_setup, rb28 -+add ra_frame_base_next, rb_x_next, r2 ++ mov ra2, unif ; mov r0, elem_num ++ ++ setf_nz_if_v # Also acts as delay slot for ra2 ++ ++ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B ++ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ ++ shl ra_xshift_next, r0, 4 ++ ++ and r0, r0, -2 ; mov ra0, unif # H filter coeffs ++ add r0, r0, r0 ; mov ra_y_next, ra2.16a ++ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 + +# set up VPM write -+# get width,height of block + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs ++ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs ++ add rb17, r1, 1 ; mov ra1, unif # ; U offset/weight ++ add rb18, r1, 3 ; mov.ifnz ra1, unif # ; V offset/weight + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++# ; unpack filter coefficients + -+# unpack filter coefficients ++ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area ++ shl r0, r0, 15 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb26, r0, rb27 ; mov r1, ra1.16b # ; r1=weight + -+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight -+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight -+nop ; mov rb10, ra3.8c -+mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ shl r1, r1, rb13 ; mov rb10, ra3.8c ++ mov r3, 0 ; mov rb11, ra3.8d # Loop count + -+shl r1, ra1.16b, rb13 -+asr rb12, r1, 1 -+shl rb14, ra1.16a, 1 # b14 = weight*2 ++ asr rb12, r1, 1 ++ shl rb14, ra1.16a, 1 # b14 = weight*2 + +# rb14 - weight L0 * 2 +# rb13 = weight denom + 6 + 9 +# rb12 = (((is P) ? 
offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) + -+# r2 is elem_num +# retrieve texture results and pick out bytes +# then submit two more texture requests + @@ -13658,123 +16798,114 @@ index 0000000..aa9e1e7 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment ++ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++ shr r1, r0, 8 ; mov.ifnz r3, ra_y + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+add t1s, ra_frame_base, r2 ++ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next ++ min r2, r2, rb_max_y ++ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ setf_nz_if_v + +# apply horizontal filter -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop -+mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++# The filter coeffs for the two halves of this are the same (unlike in the ++# Y case) so it doesn't matter which ra0 we get them from ++ ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ sub r0, r2, r3 ; mov r3, rb31 ++ sub.setf -, r3, 4 ; mov ra12, ra13 ++ brr.anyn -, r:uvloop ++ mov ra13, ra14 ; mul24 r1, ra14, rb9 ++ mov ra14, ra15 ++ mov ra15, r0 ; mul24 r0, ra12, rb8 +# >>> .anyn uvloop + +# apply vertical filter and write to VPM + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+sub r1, r1, r0 ; mov -, vw_wait -+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+asr r1, r1, 14 -+nop ; mul24 r1, r1, rb14 -+shl r1, r1, 8 ++ sub r1, r1, r0 ; mul24 r0, ra14, rb10 ++ add r1, r1, r0 ; mul24 r0, ra15, rb11 ++ sub r1, r1, r0 ++ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ++ nop ; mul24 r1, r1, rb14 ++ shl r1, r1, 8 + -+add r1, r1, rb12 -+brr.anyn -, r:uvloop -+asr r1, r1, rb13 -+min r1, r1, rb_k255 # Delay 2 -+max vpm, r1, 0 # Delay 3 ++ add r1, r1, rb12 ++ asr ra1.8as, r1, rb13 ++ nop ; mov r1, r1 << 8 ++ brr.anyn -, r:uvloop ++ asr ra1.8bs, r1, rb13 ++ mov -, vw_wait ++ mov vpm, ra1 + -+# DMA out for U -+ -+mov vw_setup, rb26 # VDW 
setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW -+ -+# DMA out for V -+# We need to wait for the U to complete first, but have nothing useful to compute while we wait. -+# Could potentially push this write into the start of the next pipeline stage. -+mov r0, 16 -+mov -, vw_wait -+ -+bra -, ra31 -+add vw_setup, rb26, r0 # VDW setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW ++# >>> + ++# DMA out for U & stash for V ++ bra -, ra_link ++ mov vw_setup, rb26 ++ mov vw_setup, rb29 ++ mov vw_addr, unif # u_dst_addr ++# >>> + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv_b0 -+mov ra31, unif ++ mov -, unif ; mov vw_setup, rb28 # next_fn ignored - always uv_b + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov r1, unif # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ; -+shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # ; H filter coeffs -+mov ra_y_next, r1 ; mov vw_setup, rb21 ++ mov ra2, unif ; mov r0, elem_num + -+add ra_frame_base_next, rb_x_next, r2 ++ setf_nz_if_v # Also acts as delay slot for ra2 + -+# Need to have unsigned coeffs to so we can just unpack in the filter -+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the -+# filter code. 
Unpack into b regs for V ++ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B ++ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height + -+# set up VPM write, we need to save 16bit precision ++ shl ra_xshift_next, r0, 4 + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 ; mov ra3, unif # ; V filter coeffs -+add rb26, r0, rb27 ++ and r0, r0, -2 ; mov ra0, unif # H filter coeffs ++ add r0, r0, r0 ; mov ra_y_next, ra2.16a ++ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 + -+mov rb8, ra3.8a -+mov rb9, ra3.8b -+mov rb10, ra3.8c -+mov rb11, ra3.8d ++# set up VPM write + -+# r2 is elem_num -+# r3 is loop counter ++ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs ++ add rb17, r1, 1 ++ add ra31, r1, 3 ; mov rb8, ra3.8a # Combine width and height of destination area + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++# ; unpack filter coefficients ++ ++ add r0, r0, r2 ; mov rb9, ra3.8b ++ shl r0, r0, 15 ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb26, r0, rb27 ++ ++ mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ ++ mov rb14, unif # U weight ++ mov.ifnz rb14, unif # V weight + -+mov rb14, unif # U weight L0 -+mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter +# rb14 unused in b0 but will hang around till the second pass + +# retrieve texture results and pick out bytes @@ -13785,108 +16916,143 @@ index 0000000..aa9e1e7 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment ++ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++ shr r1, r0, 8 ; mov.ifnz r3, ra_y + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+add t1s, ra_frame_base, r2 ++ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next ++ min r2, r2, rb_max_y ++ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 
-+brr.anyn -, r:uvloop_b0 -+mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 # Need to wait 1 cycle for rotated r1 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ sub r0, r2, r3 ; mov r3, rb31 ++ sub.setf -, r3, 4 ; mov ra12, ra13 ++ brr.anyn -, r:uvloop_b0 ++ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 ++ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 ++ mov ra15, r0 ; mul24 r0, ra12, rb8 +# >>> .anyn uvloop_b0 + -+# apply vertical filter and write to VPM ++# apply vertical filter and write to B-FIFO + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+sub.setf -, r3, rb18 -+brr.anyn -, r:uvloop_b0 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+sub r1, r1, r0 ; mov -, vw_wait -+asr vpm, r1, 6 -+# >>> .anyn uvloop_b0 ++ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes ++ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. ra15 write gap ++ sub r1, r1, r0 ; mov ra7, rb6 + -+# in pass0 we don't really need to save any results, but need to discard the uniforms -+# DMA out for U ++# FIFO goes: ++# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b ++# This arrangement optimizes the inner loop FIFOs at the expense of making the ++# bulk shift between loops quite a bit nastier ++# a8 used as temp + -+bra -, ra31 -+mov -, unif # Delay 1 -+mov -, unif # Delay 2 -+nop # Delay 3 ++ sub.setf -, r3, ra31 ++ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad ++ brr.anyn -, r:uvloop_b0 ++ mov ra5, rb4 ; mov rb4, ra4 ++ mov ra4, rb5 ; mov rb5, ra6 ++ mov ra6, rb7 ; mov rb7, ra8 ++# >>> + ++# 1st half done all results now in the a/b4..7 fifo + -+################################################################################ ++# Need to bulk rotate FIFO for heights other than 16 ++# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with ++# we are allowed 3/4 cb_size w/h :-( + -+::mc_filter_uv_b -+mov ra31, unif ++# Destination uniforms discarded ++# At the end drop through to _b - we will always do b after b0 ++ ++ sub.setf -, 15, r3 # 12 + 3 of preroll ++ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) ++ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr ++ mov r0, i_shift16 ; mov ra_link, unif ++ mov r1, 0x10000 ++# >>> ++ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially ++# If h != 16 && h != 12 then h <= 8 so ++# shift 8 with discard (.16b = .16a on all regs) ++ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 ++ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 ++ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 ++# >>> ++ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 ++ ++ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N ++# Shift 4 ++ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 ++ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 ++ # If we shifted by 4 here then the max length remaining is 4 ++ # so that is it ++ ++ brr -, r:uv_b0_post_fin ++# Shift 2 ++ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 ++ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 ++ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 ++ # 6 / 2 so need 6 outputs 
++# >>> ++ ++:uv_b0_post12 ++# this one is annoying as we need to swap halves of things that don't ++# really want to be swapped ++ ++# b7a, a6a, b5a, a4a ++# b4a, a5a, b6a, a7a ++# b7b, a6b, b5b, a4b ++# b4b, a5b, b6b, a7b ++ ++ mov r2, ra6 ; mov r3, rb7 ++ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 ++ mov ra5, r2 ; mov rb4, r3 ++ ++ mov r2, ra4 ; mov r3, rb5 ++ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 ++ mov ra7, r2 ; mov rb6, r3 ++ ++:uv_b0_post_fin ++ ++##### L1 B processing + +# per-channel shifts were calculated on the *previous* invocation + -+# set up VPM write -+mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 -+ +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov ra_y_next, unif # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base -+# compute offset from frame base u to frame base v -+sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base -+add r0, r0, r3 ; mov ra1, unif # width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs ++ mov ra2, unif ; mov r0, elem_num + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 ++ setf_nz_if_v # Also acts as delay slot for ra2 + -+add ra_frame_base_next, rb_x_next, r2 ++ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++ min r0, r0, rb_max_x ; mov -, unif # ; width_height + -+# r0 is currently height<<7 -+# For vr_setup we want height<<20 (so 20-7=13 additional bits) -+shl r3, r0, i_shift21 ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs -+shr r3, r3, 8 -+add vr_setup, r3, rb21 ++ shl rb_xshift2_next, r0, 4 + -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+add rb26, r0, rb27 ++ and r0, r0, -2 ; mov ra0, unif # H filter coeffs ++ add r0, r0, r0 ; mov ra_y2_next, ra2.16a ++ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add rb_base2_next, r3, r0 + -+# get filter coefficients ++ mov ra1, unif ; mov rb9, ra3.8b # U offset/weight ++ mov.ifnz ra1, unif ; mov rb10, ra3.8c # V offset/weight + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+# Get offset & weight stuff -+ -+# The unif read occurs unconditionally, only the write is conditional -+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ; -+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ; -+nop ; mov rb10, ra3.8c -+mov r3, 0 ; mov rb11, ra3.8d # Loop counter ; -+ -+shl r1, ra1.16b, rb13 -+asr rb12, r1, 1 ++ nop ; mov rb11, ra3.8d ++ shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 # ; r3 (loop counter) = 0 ++ asr rb12, r1, 1 + +# ra1.16a used directly in the loop + @@ -13894,125 +17060,147 @@ index 0000000..aa9e1e7 +# then submit two more texture requests + +# r3 = 0 ++ +:uvloop_b +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ 
sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 # loop counter increment ++ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next ++ shr r1, r0, 8 ; mov.ifnz r3, ra_y2 + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+add t1s, ra_frame_base, r2 ++ max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next ++ min r2, r2, rb_max_y ++ add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context + +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop_b -+mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ sub r0, r2, r3 ; mov r3, rb31 ++ sub.setf -, r3, 4 ; mov ra12, ra13 ++ brr.anyn -, r:uvloop_b ++ mov ra13, ra14 ; mul24 r1, ra14, rb9 ++ mov ra14, ra15 ; mul24 r2, ra15, rb10 ++ mov ra15, r0 ; mul24 r0, ra12, rb8 +# >>> .anyn uvloop_b + +# apply vertical filter and write to VPM + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it -+sub r1, r1, r0 ; mul24 r0, vpm, ra4 # ra4 = 0x10000 -+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+asr r1, r1, 14 # shift2=6 ++ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) ++ add r1, r1, r2 ; mul24 r0, ra15, rb11 ++ sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 ++ mov ra7, rb6 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 + -+asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a -+nop ; mul24 r0, r0, rb14 ++ mov ra5, rb4 ; mul24 r1, r1, ra1.16a ++ add r1, r1, r0 ; mov rb4, ra4 + -+add r1, r1, r0 ; mov -, vw_wait -+shl r1, r1, 8 # Lose bad top 8 bits & sign extend ++ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) + -+add r1, r1, rb12 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) ++ sub.setf -, r3, ra31 ; mov ra6, rb7 ++ asr ra3.8as, r1, rb13 ++ nop ; mov r1, r1 << 8 ++ brr.anyn -, r:uvloop_b ++ asr ra3.8bs, r1, rb13 ++ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg (annoyingly) ; Final FIFO mov ++ mov vpm, ra3 ++# >>> + -+brr.anyn -, r:uvloop_b -+asr r1, r1, rb13 # Delay 1 -+min r1, r1, rb_k255 # Delay 2 -+max vpm, r1, 0 # Delay 3 ++# DMA out + ++ bra -, ra_link ++ mov vw_setup, rb26 ++ mov vw_setup, rb29 ++ mov vw_addr, unif # c_dst_addr + -+# DMA out for U -+ -+mov vw_setup, rb26 # VDW setup 0 -+mov vw_setup, rb29 # Stride 
-+mov vw_addr, unif # start the VDW -+ -+# DMA out for V -+# We need to wait for the U to complete first, but have nothing useful to compute while we wait. -+# Could potentially push this write into the start of the next pipeline stage. -+mov r0, 16 -+mov -, vw_wait -+ -+bra -, ra31 -+add vw_setup, rb26, r0 # VDW setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW + +################################################################################ + +# mc_exit() + ++::mc_interrupt_exit8c ++ ldtmu0 ++ ldtmu1 ++ ldtmu1 ++ mov -, vw_wait ; nop ; ldtmu0 # wait on the VDW ++ ++ mov -,sacq(0) # 1 ++ mov -,sacq(0) # 2 ++ mov -,sacq(0) # 3 ++ mov -,sacq(0) # 4 ++ mov -,sacq(0) # 5 ++ mov -,sacq(0) # 6 ++ mov -,sacq(0) # 7 ++# mov -,sacq(0) # 8 ++# mov -,sacq(0) # 9 ++# mov -,sacq(0) # 10 ++# mov -,sacq(0) # 11 ++ ++ nop ; nop ; thrend ++ mov interrupt, 1; nop # delay slot 1 ++ nop ; nop # delay slot 2 ++ ++# Chroma & Luma the same now ++::mc_exit_c +::mc_exit -+mov -, vw_wait # wait on the VDW ++ ldtmu0 ++ ldtmu1 ++ ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+mov -,srel(0) ++ mov -,srel(0) + -+ldtmu0 -+ldtmu1 -+ldtmu0 -+ldtmu1 -+ -+nop ; nop ; thrend -+nop ; nop # delay slot 1 -+nop ; nop # delay slot 2 -+ -+# mc_interrupt_exit8() -+::mc_interrupt_exit8 -+mov -, vw_wait # wait on the VDW -+ -+ldtmu0 -+ldtmu1 -+ldtmu0 -+ldtmu1 -+ -+mov -,sacq(0) # 1 -+mov -,sacq(0) # 2 -+mov -,sacq(0) # 3 -+mov -,sacq(0) # 4 -+mov -,sacq(0) # 5 -+mov -,sacq(0) # 6 -+mov -,sacq(0) # 7 -+ -+nop ; nop ; thrend -+mov interrupt, 1; nop # delay slot 1 -+nop ; nop # delay slot 2 ++ nop ; nop ; thrend ++ nop ; nop # delay slot 1 ++ nop ; nop # delay slot 2 + + ++# mc_interrupt_exit12() ++::mc_interrupt_exit12 ++ ldtmu0 ++ ldtmu1 ++ ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW ++ ++ mov -,sacq(0) # 1 ++ mov -,sacq(0) # 2 ++ mov -,sacq(0) # 3 ++ mov -,sacq(0) # 4 ++ mov -,sacq(0) # 5 ++ mov -,sacq(0) # 6 ++ mov -,sacq(0) # 7 ++ mov -,sacq(0) # 8 ++ mov -,sacq(0) # 9 ++ mov -,sacq(0) # 10 ++ mov -,sacq(0) # 11 ++ ++ nop ; nop ; thrend ++ mov interrupt, 1; nop # delay slot 1 ++ nop ; nop # delay slot 2 + + ++::mc_exit1 ++ mov -, vw_wait # wait on the VDW ++ ++ ldtmu0 ++ ldtmu1 ++ ldtmu0 ++ ldtmu1 ++ nop ; nop ; thrend ++ mov interrupt, 1; nop # delay slot 1 ++ nop ; nop # delay slot 2 + +# LUMA CODE + @@ -14022,116 +17210,104 @@ index 0000000..aa9e1e7 +################################################################################ +# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) +::mc_setup -+ mov r3, 16 -+ + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov ra8, unif # y_x -+ mov ra9, unif # ref_y_base -+ mov ra10, unif # y2_x2 -+ mov ra11, unif # ref_y2_base ++ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x ++ mov ra9, unif # ref_y_base ++ mov ra10, unif # y2_x2 ++ mov ra11, unif # ref_y2_base + +# Read image dimensions -+ mov r1, unif # width_height -+ shl r0,r1,r3 -+ asr r1,r1,r3 # width -+ asr r0,r0,r3 # height -+ sub rb_frame_width_minus_1,r1,1 -+ sub rb_frame_height_minus_1,r0,1 -+ -+# get source pitch -+ mov rb_pitch, unif # src_pitch ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++ sub rb_max_x, ra3.16b, 1 ++ sub rb_max_y, ra3.16a, 1 ++ mov rb_pitch, unif # stride1 + +# get destination pitch -+ mov r0, unif # dst_pitch + mov r1, vdw_setup_1(0) -+ add rb24, r1, r0 ++ or rb24, r1, rb_pitch + +# Compute base address for 
first and second access -+ mov r1, ra8 # y_x -+ shl r0,r1,r3 # r0 is x<<16 -+ asr r1,r1,r3 # r1 is y -+ asr r0,r0,r3 # r0 is x -+ add r0, r0, elem_num # Load x ++ mov r3, elem_num ++ add r0, ra8.16a, r3 # Load x + elem_num + max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 # Load the frame base ++ min r0, r0, rb_max_x + shl ra_xshift_next, r0, 3 # Compute shifts -+ add ra_y, r1, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, r2, r0 # r2 is address for frame0 (not including y offset) -+ max r1, r1, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t0s, r2, r1 ; mov ra_frame_base, r2 + -+ mov r1, ra10 # y_x -+ shl r0,r1,r3 # r0 is x<<16 -+ asr r1,r1,r3 # r1 is y -+ asr r0,r0,r3 # r0 is x -+ add r0, r0, elem_num # Load x ++ ++# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++ ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base, ra9, r0 ++ ++ mov r1, ra8.16b # Load y ++ add ra_y, r1, 1 # Set for next ++ max r1, r1, 0 ++ min r1, r1, rb_max_y ++ ++# submit texture requests for first line ++ nop ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ++ ++ ++ # r3 still contains elem_num ++ add r0, ra10.16a, r3 # Load x + max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 # Load the frame base -+ shl rx_xshift2_next, r0, 3 # Compute shifts -+ add ra_y2, r1, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, r2, r0 # r2 is address for frame1 (not including y offset) -+ max r1, r1, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t1s, r2, r1 ; mov ra_frame_base2, r2 ++ min r0, r0, rb_max_x ++ shl rb_xshift2_next, r0, 3 # Compute shifts + ++ # r2 still contains mask ++ and r0, r0, -4 ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base2, ra11, r0 ++ ++ mov r1, ra10.16b # Load y ++ add ra_y2, r1, 1 # Set for next ++ max r1, r1, 0 ++ min r1, r1, rb_max_y ++ ++# submit texture requests for first line ++ nop ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 + +# load constants + + mov ra_k1, 1 + mov ra_k256, 256 -+ mov ra30, 64 -+ -+ mov rb20, 0xffffff00 + mov rb_k255, 255 -+ mov rb23, 24 ++ mov ra_k0, 0 + +# touch vertical context to keep simulator happy + -+ mov ra8, 0 -+ mov ra9, 0 -+ mov ra10, 0 -+ mov ra11, 0 -+ mov ra12, 0 -+ mov ra13, 0 -+ mov ra14, 0 -+ mov ra15, 0 ++ mov ra8, 0 ; mov rb8, 0 ++ mov ra9, 0 ; mov rb9, 0 ++ mov ra10, 0 ; mov rb10, 0 ++ mov ra11, 0 ; mov rb11, 0 + +# Compute part of VPM to use -+ mov r2, qpu_num -+ mov r1, r2 -+ asr r1, r1, 2 -+ shl r1, r1, 6 -+ mov r0, r2 -+ and r0, r0, 3 -+ add r0, r0, r1 -+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add rb28, r0, r1 # VPM for saving data -+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+ shl r0, r0, 5 -+ add rb27, r0, r1 # Command for dma output ++ m_calc_dma_regs rb28, rb27 + +# Weighted prediction denom -+ add rb13, unif, 9 # unif = weight denom + 6 -+ -+ mov -, unif # Unused ++ add rb13, unif, 9 # unif = weight denom + 6 + +# submit texture requests for second line + max r1, ra_y, 0 -+ min r1, r1, rb_frame_height_minus_1 ++ min r1, r1, rb_max_y + add ra_y, ra_y, 1 
-+ nop ; mul24 r1, r1, rb_pitch
-+ add t0s, r1, ra_frame_base
++ mov -, unif ; mul24 r1, r1, rb_pitch # unused ;
++ add t0s, r1, ra_base
+
+ max r1, ra_y2, 0
-+ min r1, r1, rb_frame_height_minus_1
++ min r1, r1, rb_max_y
+ add ra_y2, ra_y2, 1
-+ nop ; mul24 r1, r1, rb_pitch
-+ add t1s, r1, ra_frame_base2
++ nop ; mul24 r1, r1, rb_pitch
++ add t1s, r1, ra_base2
+
+# FALL THROUGH TO PER-BLOCK SETUP
+
+################################################################################
+# P and B blocks share the same setup code to save on Icache space
+:per_block_setup
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ mov ra31, unif
++ mov ra_link, unif
++#### We do all the setup even if we are about to exit - reading junk from unif....
+
-+ mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack??
++ mov ra1, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
+
+# per-channel shifts were calculated on the *previous* invocation
+ mov ra_xshift, ra_xshift_next
-+ mov rx_xshift2, rx_xshift2_next
++ mov rb_xshift2, rb_xshift2_next
+
+# get base addresses and per-channel shifts for *next* invocation
+
-+ add r0, ra1.16a, r1 # Load x
++ add r0, ra1.16a, r3 # Load x
+ max r0, r0, 0
-+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ mov r3, 8 ; mov ra_y_next, ra1.16b
-+ and r0, r0, ~3 ; mov ra1, unif # y2_x2
-+ add ra_frame_base_next, r2, r0
++ min r0, r0, rb_max_x
+
-+ add r0, ra1.16a, r1 # Load x
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add ra_base_next, unif, r0 # Base1
++ mov ra_y_next, ra1.16b # Load y
++ mov ra1, unif # x2_y2
++ nop # ra1 delay
++
++ add r0, ra1.16a, r3 # Load x2
+ max r0, r0, 0
-+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
-+ shl rx_xshift2_next, r0, 3 # Compute shifts
-+ add r3, r3, r3 ; mov ra_y2_next, ra1.16b # r3 = 16 ;
-+ and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate
-+ add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset)
++ min r0, r0, rb_max_x
++
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add rb_base2_next, unif, r0 # Base2
++ mov ra_y2_next, ra1.16b # Load y
++ mov ra_width_height, unif # width_height
+
+# set up VPM write
-+ mov vw_setup, rb28
++ mov vw_setup, rb28 # [ra1 delay]
+
+# get width,height of block (unif load above)
-+ sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
-+ add rb17, ra1.16a, 5
-+ add rb18, ra1.16a, 7
-+ shl r0, ra1.16a, 7
-+ add r0, r0, ra1.16b # Combine width and height of destination area
-+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
++ sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width)
++ add rb17, ra_height, 5 ; mov r0, ra_height
++ mov r1, 16
++ min r0, r0, r1
++ add rb18, r0, 7
++ shl r0, r0, 7
++ add r0, r0, ra_width # Combine width and height of destination area
++ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
-+ shl.ifz r0, r0, i_shift16 # Pick half to use
-+ shl ra8, r0, 3
++ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 
offset/weight ++ mov r2, 0x01040400 # [ra5 delay] ++ shl ra8, r0, 3 ; mov rb14, ra5.16a + +# Pack the 1st 4 filter coefs for H & V tightly + @@ -14187,9 +17379,8 @@ index 0000000..aa9e1e7 + ror ra2.8a, r1, ra8.8d + ror ra0.8a, r1, ra8.8c + -+ mov r1,0x01040400 -+ ror ra2.8b, r1, ra8.8d -+ ror ra0.8b, r1, ra8.8c ++ ror ra2.8b, r2, ra8.8d ++ ror ra0.8b, r2, ra8.8c + + mov r1,0x050b0a00 # -ve + ror ra2.8c, r1, ra8.8d @@ -14215,37 +17406,44 @@ index 0000000..aa9e1e7 + ror ra3.8c, r1, ra8.8d + ror ra1.8c, r1, ra8.8c + -+# Extract weighted prediction information in parallel -+ + mov r1,0x01010000 # -ve -+ ror ra3.8d, r1, ra8.8d ; mov r0, unif # ; weight L1 weight L1 (hi16)/weight L0 (lo16) -+ ror ra1.8d, r1, ra8.8c ; mov r1, rb13 # ; rb13 = weight denom + 6 + 9 ++ ror ra3.8d, r1, ra8.8d ++ ror ra1.8d, r1, ra8.8c + -+# r3 = 16 from (long way) above -+ shl r1, unif, r1 ; mov rb4, ra3.8a # combined offet = ((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ; -+ asr ra18, r0, r3 ; mov rb5, ra3.8b -+ bra -, ra31 -+ shl r0, r0, r3 ; mov rb6, ra3.8c -+ mov r3, 0 ; mov rb7, ra3.8d # loop count ; -+ asr rb12, r1, 9 ++# Extract weighted prediction information in parallel ++# We are annoyingly A src limited here + -+# >>> branch ra31 ++ mov rb4, ra3.8a ; mov ra18, unif ++ mov rb5, ra3.8b ++ mov rb6, ra3.8c ++ mov.ifnz ra5, ra18 ++ ++ mov rb_dest, unif # Destination address ++ ++ bra -, ra_link ++ ++ shl r0, ra5.16b, rb13 # Offset calc ++ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ mov r3, 0 ; mov rb7, ra3.8d ++# >>> branch ra_link +# +# r3 = 0 -+# ra18 = weight L1 -+# r0 = weight L0 << 16 (will be put into rb14 in filter preamble) -+# rb13 = weight denom + 6 + 9 -+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++# ra18.16a = weight L1 ++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) ++# rb12 = (((is P) ? 
offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++# rb13 = weight denom + 6 + 9 ++# rb14 = weight L0 + + +################################################################################ -+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) ++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + +::mc_filter -+# r0 = weight << 16; We want weight * 2 in rb14 -+ asr rb14, r0, 15 ++# ra5.16a = weight << 16; We want weight * 2 in rb14 ++ ++ shl rb14, ra5.16a, 1 + +# r3 = 0 + @@ -14261,20 +17459,20 @@ index 0000000..aa9e1e7 +# might be B where y != y2 so we must do full processing on both y and y2 + + sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 ++ mov.ifz ra_base, ra_base_next ; mov rb31, r3 + mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next + + max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 ++ min r2, r2, rb_max_y + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + + max r2, ra_y2, 0 # y -+ min r2, r2, rb_frame_height_minus_1 ++ min r2, r2, rb_max_y + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 ++ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14283,21 +17481,21 @@ index 0000000..aa9e1e7 + +# apply horizontal filter + nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz 
r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
+ mov ra8, ra9 ; mov rb8, rb9
+ brr.anyn -, r:yloop
+ mov ra9, ra10 ; mov rb9, rb10
+ mov ra10, ra11 ; mov rb10, rb11
+ mov ra11, r0 ; mov rb11, r1
+# >>> .anyn yloop
+
+# apply vertical filter and write to VPM
+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov r2, rb12
+
+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, rb14
+ add r1, r1, r2 ; mov -, vw_wait
+ shl r1, r1, 8
+
+ brr.anyn -, r:yloop
+ add r1, r1, rb12 # Delay 1
+ asr r1, r1, rb13 # Delay 1
+ min r1, r1, rb_k255 # Delay 2
+ max vpm, r1, 0 # Delay 3
+# >>> branch.anyn yloop
+
-+# DMA out
++# If looping again then we consumed 16 height last loop
++ # rb29 (stride) remains constant
++ # rb17 remains const (based on total height)
++ # recalc rb26, rb18 based on new segment height
++ # N.B. r3 is loop counter still
+
-+ brr -, r:per_block_setup
++ mov r1, 16
++ sub r0, ra_height, r1
++ mov ra_height, r0
++ max.setf r0, r0, 0 # Done if Z now
++
++# DMA out
++ brr.anyz -, r:per_block_setup
+ mov vw_setup, rb26 # VDW setup 0 Delay 1
+ mov vw_setup, rb29 # Stride Delay 2
-+ mov vw_addr, unif # start the VDW Delay 3
++ mov vw_addr, rb_dest # start the VDW Delay 3
++# >>> .anyz per_block_setup
++
++ min r0, r0, r1
++ add rb18, rb18, r0
++ sub r0, r0, r1
++ shl r0, r0, i_shift23
++ add rb26, rb26, r0
++
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++
++ mov vw_setup, rb28 # Reset our VPM write pointer
++
++ brr -, r:yloop
++ nop
++ nop
++ nop
++# >>>
++
+
+
+
+################################################################################
+
-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, only the first half of coefficients contain used information.
+# At this point we have already issued two pairs of texture requests for the current block
+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+# In direct mode the PU will only be 8x8-16x16 so it is tempting to special case that? Or only do 8x8 calls?
+# In the B case we need to do whole calculation twice (for L0 and L1 refs)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+
+::mc_filter_b
+ # r0 = weightL0 << 16, we want it in rb14
-+ asr rb14, r0, i_shift16
+# asr rb14, r0, i_shift16
+
+:yloopb
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# If we knew there was no clipping then this code would get simpler.
+# Perhaps we could add on the pitch and clip using larger values? 
+ + sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 ++ mov.ifz ra_base, ra_base_next ; mov rb31, r3 + mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next + + max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 ++ min r2, r2, rb_max_y + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + + max r2, ra_y2, 0 # y -+ min r2, r2, rb_frame_height_minus_1 ++ min r2, r2, rb_max_y + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 ++ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14391,21 +17619,21 @@ index 0000000..aa9e1e7 + +# apply horizontal filter + nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + sub r0, r2, r3 ; mov r3, rb31 + + sub.setf -, r3, 8 ; mov r1, ra8 @@ -14417,7 +17645,6 @@ index 0000000..aa9e1e7 + # >>> .anyn yloopb + + # apply vertical filter and write to VPM -+ + nop ; mul24 r0, rb8, ra2.8a + nop ; mul24 r1, rb9, ra2.8b + sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c @@ -14433,7 +17660,7 @@ index 0000000..aa9e1e7 + + asr r1, r1, 14 + nop ; mul24 r0, r1, rb14 -+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8 ++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 + + add r1, r1, r0 ; mov -, vw_wait + shl r1, r1, 8 @@ -14443,548 +17670,164 @@ index 0000000..aa9e1e7 + min r1, r1, rb_k255 # Delay 2 + max vpm, r1, 0 # Delay 3 + ++ ++# If looping again 
the we consumed 16 height last loop ++ # rb29 (stride) remains constant ++ # rb17 remains const (based on total height) ++ # recalc rb26, rb18 based on new segment height ++ # N.B. r3 is loop counter still ++ ++ mov r1, 16 ++ sub r0, ra_height, r1 ++ mov ra_height, r0 ++ max.setf r0, r0, 0 # Done if Z now ++ +# DMA out -+ brr -, r:per_block_setup ++ brr.anyz -, r:per_block_setup + mov vw_setup, rb26 # VDW setup 0 Delay 1 + mov vw_setup, rb29 # Stride Delay 2 -+ mov vw_addr, unif # start the VDW Delay 3 ++ mov vw_addr, rb_dest # start the VDW Delay 3 ++# >>> .anyz per_block_setup ++ ++ min r0, r0, r1 ++ add rb18, rb18, r0 ++ sub r0, r0, r1 ++ shl r0, r0, i_shift23 ++ add rb26, rb26, r0 ++ ++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 ++ add rb_dest, rb_dest, r0 ++ ++ mov vw_setup, rb28 # Reset our VDM write pointer ++ ++ brr -, r:yloopb ++ nop ++ nop ++ nop + +################################################################################ + -+# mc_interrupt_exit12() -+::mc_interrupt_exit12 -+ mov -, vw_wait # wait on the VDW -+ -+ # Dummy wait to test instructions -+# mov r3,1000000 -+#:dummy_loop -+# sub.setf r3, r3, 1 -+# nop -+# nop -+# brr.anynn -, r:dummy_loop -+# nop -+# nop -+# nop -+ -+ ldtmu0 -+ ldtmu0 -+ ldtmu1 -+ ldtmu1 -+ -+ mov -,sacq(0) # 1 -+ mov -,sacq(0) # 2 -+ mov -,sacq(0) # 3 -+ mov -,sacq(0) # 4 -+ mov -,sacq(0) # 5 -+ mov -,sacq(0) # 6 -+ mov -,sacq(0) # 7 -+ mov -,sacq(0) # 8 -+ mov -,sacq(0) # 9 -+ mov -,sacq(0) # 10 -+ mov -,sacq(0) # 11 -+ -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+ -+::mc_exit1 -+ mov -, vw_wait # wait on the VDW -+ -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ ldtmu1 -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+ +::mc_end +# Do not add code here because mc_end must appear after all other code. -diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h +diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..db41a4d +index 0000000..27cbb59 --- /dev/null -+++ b/libavcodec/rpi_user_vcsm.h -@@ -0,0 +1,459 @@ -+/***************************************************************************** -+* Copyright 2001 - 2011 Broadcom Corporation. All rights reserved. -+* -+* This program is the proprietary software of Broadcom Corporation and/or -+* its licensors, and may only be used, duplicated, modified or distributed -+* pursuant to the terms and conditions of a separate, written license -+* agreement executed between you and Broadcom (an "Authorized License"). -+* Except as set forth in an Authorized License, Broadcom grants no license -+* (express or implied), right to use, or waiver of any kind with respect to -+* the Software, and Broadcom expressly reserves all rights in and to the -+* Software and all intellectual property rights therein. IF YOU HAVE NO -+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY -+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF -+* THE SOFTWARE. -+* -+* Except as expressly set forth in the Authorized License, -+* 1. This program, including its structure, sequence and organization, -+* constitutes the valuable trade secrets of Broadcom, and you shall use -+* all reasonable efforts to protect the confidentiality thereof, and to -+* use this information only in connection with your use of Broadcom -+* integrated circuit products. -+* 2. 
TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS" -+* AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR -+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH -+* RESPECT TO THE SOFTWARE. BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL -+* IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS -+* FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, -+* QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU -+* ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE. -+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS -+* LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT, -+* OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO -+* YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN -+* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS -+* OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER -+* IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF -+* ESSENTIAL PURPOSE OF ANY LIMITED REMEDY. -+*****************************************************************************/ ++++ b/libavcodec/rpi_shader_cmd.h +@@ -0,0 +1,88 @@ ++#ifndef RPI_SHADER_CMD_H ++#define RPI_SHADER_CMD_H + -+#ifndef __USER_VCSM__H__INCLUDED__ -+#define __USER_VCSM__H__INCLUDED__ ++#pragma pack(push, 4) + -+/* VideoCore Shared Memory - user interface library. -+** -+** This library provides all the necessary abstraction for any application to -+** make use of the shared memory service which is distributed accross a kernel -+** driver and a videocore service. -+** -+** It is an application design decision to choose or not to use this service. -+** -+** The logical flow of operations that a user application needs to follow when -+** using this service is: -+** -+** 1) Initialize the service. -+** 2) Allocate shared memory blocks. -+** 3) Start using the allocated blocks. -+** - In order to gain ownership on a block, lock the allocated block, -+** locking a block returns a valid address that the user application -+** can access. -+** - When finished with using the block for the current execution cycle -+** or function, and so when giving up the ownership, unlock the block. -+** 4) A block can be locked/unlocked as many times required - within or outside -+** of - a specific execution context. -+** 5) To completely release an allocated block, free it. -+** 6) If the service is no longer required, terminate it. 
-+** -+** -+** Some generic considerations: ++typedef struct qpu_mc_pred_c_s { ++ uint32_t next_fn; ++ int16_t next_src_y; ++ int16_t next_src_x; ++ uint32_t next_src_base_c; ++ union { ++ struct { ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ uint32_t dst_addr_c; ++ } p; ++ struct { ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t weight_u; ++ uint32_t weight_v; ++ uint32_t dummy0; ++ } b0; ++ struct { ++ uint32_t dummy0; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ uint32_t dst_addr_c; ++ } b1; ++ struct { ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t dummy0; ++ } s0; ++ struct { ++ uint32_t dummy0; ++ uint32_t dummy1; ++ uint32_t dummy2; ++ uint32_t dummy3; ++ uint32_t dummy4; ++ uint32_t dummy5; ++ } s1; ++ }; ++} qpu_mc_pred_c_t; + -+** Allocating memory blocks. -+** -+** Memory blocks can be allocated in different manners depending on the cache -+** behavior desired. A given block can either be: ++typedef struct qpu_mc_pred_y_s { ++ int16_t next_src1_x; ++ int16_t next_src1_y; ++ uint32_t next_src1_base; ++ int16_t next_src2_x; ++ int16_t next_src2_y; ++ uint32_t next_src2_base; ++ union { ++ struct { ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ uint32_t dst_addr; ++ } p; ++ struct { ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t dummy0; ++ } s; ++ }; ++ uint32_t next_fn; ++} qpu_mc_pred_y_t; + -+** - Allocated in a non cached fashion all the way through host and videocore. -+** - Allocated in a cached fashion on host OR videocore. -+** - Allocated in a cached fashion on host AND videocore. -+** -+** It is an application decision to determine how to allocate a block. Evidently -+** if the application will be doing substantial read/write accesses to a given block, -+** it is recommended to allocate the block at least in a 'host cached' fashion for -+** better results. -+** -+** -+** Locking memory blocks. -+** -+** When the memory block has been allocated in a host cached fashion, locking the -+** memory block (and so taking ownership of it) will trigger a cache invalidation. -+** -+** For the above reason and when using host cached allocation, it is important that -+** an application properly implements the lock/unlock mechanism to ensure cache will -+** stay coherent, otherwise there is no guarantee it will at all be. -+** -+** It is possible to dynamically change the host cache behavior (ie cached or non -+** cached) of a given allocation without needing to free and re-allocate the block. -+** This feature can be useful for such application which requires access to the block -+** only at certain times and not otherwise. By changing the cache behavior dynamically -+** the application can optimize performances for a given duration of use. -+** Such dynamic cache behavior remapping only applies to host cache and not videocore -+** cache. If one requires to change the videocore cache behavior, then a new block -+** must be created to replace the old one. -+** -+** On successful locking, a valid pointer is returned that the application can use -+** to access to data inside the block. There is no guarantee that the pointer will -+** stay valid following the unlock action corresponding to this lock. 
-+** -+** -+** Unocking memory blocks. -+** -+** When the memory block has been allocated in a host cached fashion, unlocking the -+** memory block (and so forgiving its ownership) will trigger a cache flush unless -+** explicitely asked not to flush the cache for performances reasons. -+** -+** For the above reason and when using host cached allocation, it is important that -+** an application properly implements the lock/unlock mechanism to ensure cache will -+** stay coherent, otherwise there is no guarantee it will at all be. -+** -+** -+** A complete API is defined below. -+*/ ++#pragma pack(pop) + -+#ifdef __cplusplus -+extern "C" -+{ +#endif + -+/* Different status that can be dumped. -+*/ -+typedef enum -+{ -+ VCSM_STATUS_VC_WALK_ALLOC = 0, // Walks *all* the allocation on videocore. -+ // Result of the walk is seen in the videocore -+ // log. -+ VCSM_STATUS_HOST_WALK_MAP, // Walks the *full* mapping allocation on host -+ // driver (ie for all processes). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_HOST_WALK_PID_MAP, // Walks the per process mapping allocation on host -+ // driver (for current process). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host -+ // driver (for current process). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_VC_MAP_ALL, // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and -+ // VCSM_STATUS_HOST_WALK_MAP. -+ // -+ VCSM_STATUS_NONE, // Must be last - invalid. -+ -+} VCSM_STATUS_T; -+ -+/* Different kind of cache behavior. -+*/ -+typedef enum -+{ -+ VCSM_CACHE_TYPE_NONE = 0, // No caching applies. -+ VCSM_CACHE_TYPE_HOST, // Allocation is cached on host (user space). -+ VCSM_CACHE_TYPE_VC, // Allocation is cached on videocore. -+ VCSM_CACHE_TYPE_HOST_AND_VC, // Allocation is cached on both host and videocore. -+ -+} VCSM_CACHE_TYPE_T; -+ -+/* Initialize the vcsm processing. -+** -+** Must be called once before attempting to do anything else. -+** -+** Returns 0 on success, -1 on error. -+*/ -+int vcsm_init( void ); -+ -+ -+/* Terminates the vcsm processing. -+** -+** Must be called vcsm services are no longer needed, it will -+** take care of removing any allocation under the current process -+** control if deemed necessary. -+*/ -+void vcsm_exit( void ); -+ -+ -+/* Queries the status of the the vcsm. -+** -+** Triggers dump of various kind of information, see the -+** different variants specified in VCSM_STATUS_T. -+** -+** Pid is optional. -+*/ -+void vcsm_status( VCSM_STATUS_T status, int pid ); -+ -+ -+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory -+** allocator. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc( unsigned int size, char *name ); -+ -+ -+/* Allocates a cached block of memory of size 'size' via the vcsm memory -+** allocator, the type of caching requested is passed as argument of the -+** function call. 
-+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name ); -+ -+ -+/* Shares an allocated block of memory via the vcsm memory allocator. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc_share( unsigned int handle ); -+ -+ -+/* Resizes a block of memory allocated previously by vcsm_alloc. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** The handle must be unlocked by user prior to attempting any -+** resize action. -+** -+** On error, the original size allocated against the handle -+** remains available the same way it would be following a -+** successful vcsm_malloc. -+*/ -+int vcsm_resize( unsigned int handle, unsigned int new_size ); -+ -+ -+/* Frees a block of memory that was successfully allocated by -+** a prior call the vcms_alloc. -+** -+** The handle should be considered invalid upon return from this -+** call. -+** -+** Whether any memory is actually freed up or not as the result of -+** this call will depends on many factors, if all goes well it will -+** be freed. If something goes wrong, the memory will likely end up -+** being freed up as part of the vcsm_exit process. In the end the -+** memory is guaranteed to be freed one way or another. -+*/ -+void vcsm_free( unsigned int handle ); -+ -+ -+/* Retrieves a videocore opaque handle from a mapped user address -+** pointer. The videocore handle will correspond to the actual -+** memory mapped in videocore. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** Note: the videocore opaque handle is distinct from the user -+** opaque handle (allocated via vcsm_malloc) and it is only -+** significant for such application which knows what to do -+** with it, for the others it is just a number with little -+** use since nothing can be done with it (in particular -+** for safety reason it cannot be used to map anything). -+*/ -+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr ); -+ -+ -+/* Retrieves a videocore opaque handle from a opaque handle -+** pointer. The videocore handle will correspond to the actual -+** memory mapped in videocore. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. 
-+** -+** Note: the videocore opaque handle is distinct from the user -+** opaque handle (allocated via vcsm_malloc) and it is only -+** significant for such application which knows what to do -+** with it, for the others it is just a number with little -+** use since nothing can be done with it (in particular -+** for safety reason it cannot be used to map anything). -+*/ -+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle ); -+ -+ -+/* Retrieves a user opaque handle from a mapped user address -+** pointer. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+*/ -+unsigned int vcsm_usr_handle( void *usr_ptr ); -+ -+ -+/* Retrieves a mapped user address from an opaque user -+** handle. -+** -+** Returns: 0 on error -+** a non-zero address on success. -+** -+** On success, the address corresponds to the pointer -+** which can access the data allocated via the vcsm_malloc -+** call. -+*/ -+void *vcsm_usr_address( unsigned int handle ); -+ -+ -+/* Locks the memory associated with this opaque handle. -+** -+** Returns: NULL on error -+** a valid pointer on success. -+** -+** A user MUST lock the handle received from vcsm_malloc -+** in order to be able to use the memory associated with it. -+** -+** On success, the pointer returned is only valid within -+** the lock content (ie until a corresponding vcsm_unlock_xx -+** is invoked). -+*/ -+void *vcsm_lock( unsigned int handle ); -+ -+ -+/* Locks the memory associated with this opaque handle. The lock -+** also gives a chance to update the *host* cache behavior of the -+** allocated buffer if so desired. The *videocore* cache behavior -+** of the allocated buffer cannot be changed by this call and such -+** attempt will be ignored. -+** -+** The system will attempt to honour the cache_update mode request, -+** the cache_result mode will provide the final answer on which cache -+** mode is really in use. Failing to change the cache mode will not -+** result in a failure to lock the buffer as it is an application -+** decision to choose what to do if (cache_result != cache_update) -+** -+** The value returned in cache_result can only be considered valid if -+** the returned pointer is non NULL. The cache_result pointer may be -+** NULL if the application does not care about the actual outcome of -+** its action with regards to the cache behavior change. -+** -+** Returns: NULL on error -+** a valid pointer on success. -+** -+** A user MUST lock the handle received from vcsm_malloc -+** in order to be able to use the memory associated with it. -+** -+** On success, the pointer returned is only valid within -+** the lock content (ie until a corresponding vcsm_unlock_xx -+** is invoked). -+*/ -+void *vcsm_lock_cache( unsigned int handle, -+ VCSM_CACHE_TYPE_T cache_update, -+ VCSM_CACHE_TYPE_T *cache_result ); -+ -+ -+/* Unlocks the memory associated with this user mapped address. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking a mapped address, the user should no longer -+** attempt to reference it. -+*/ -+int vcsm_unlock_ptr( void *usr_ptr ); -+ -+ -+/* Unlocks the memory associated with this user mapped address. -+** Apply special processing that would override the otherwise -+** default behavior. -+** -+** If 'cache_no_flush' is specified: -+** Do not flush cache as the result of the unlock (if cache -+** flush was otherwise applicable in this case). -+** -+** Returns: 0 on success -+** -errno on error. 
-+** -+** After unlocking a mapped address, the user should no longer -+** attempt to reference it. -+*/ -+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush ); -+ -+ -+/* Unlocks the memory associated with this user opaque handle. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking an opaque handle, the user should no longer -+** attempt to reference the mapped addressed once associated -+** with it. -+*/ -+int vcsm_unlock_hdl( unsigned int handle ); -+ -+ -+/* Unlocks the memory associated with this user opaque handle. -+** Apply special processing that would override the otherwise -+** default behavior. -+** -+** If 'cache_no_flush' is specified: -+** Do not flush cache as the result of the unlock (if cache -+** flush was otherwise applicable in this case). -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking an opaque handle, the user should no longer -+** attempt to reference the mapped addressed once associated -+** with it. -+*/ -+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush ); -+ -+/* Clean and/or invalidate the memory associated with this user opaque handle -+** -+** Returns: non-zero on error -+** -+** structure contains a list of flush/invalidate commands. Commands are: -+** 0: nop -+** 1: invalidate given virtual range in L1/L2 -+** 2: clean given virtual range in L1/L2 -+** 3: clean+invalidate given virtual range in L1/L2 -+** 4: flush all L1/L2 -+*/ -+struct vcsm_user_clean_invalid_s { -+ struct { -+ unsigned int cmd; -+ unsigned int handle; -+ unsigned int addr; -+ unsigned int size; -+ } s[8]; -+}; -+ -+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s ); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* __USER_VCSM__H__INCLUDED__ */ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 -index 0000000..9580165 +index 0000000..b061fe0 --- /dev/null +++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,406 @@ +@@ -0,0 +1,581 @@ +#include "config.h" +#ifdef RPI +#include "rpi_qpu.h" ++#include "rpi_mailbox.h" +#include "rpi_zc.h" ++#include "libavutil/avassert.h" ++#include + +#include "libavutil/buffer_internal.h" ++#include ++ ++#define TRACE_ALLOC 0 + +struct ZcPoolEnt; + +typedef struct ZcPool +{ + int numbytes; ++ unsigned int n; + struct ZcPoolEnt * head; + pthread_mutex_t lock; +} ZcPool; @@ -14993,27 +17836,56 @@ index 0000000..9580165 +{ + // It is important that we start with gmem as other bits of code will expect to see that + GPU_MEM_PTR_T gmem; ++ unsigned int n; + struct ZcPoolEnt * next; + struct ZcPool * pool; +} ZcPoolEnt; + -+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size) ++#if 1 ++//#define ALLOC_PAD 0x1000 ++#define ALLOC_PAD 0 ++#define ALLOC_ROUND 0x1000 ++//#define ALLOC_N_OFFSET 0x100 ++#define ALLOC_N_OFFSET 0 ++#define STRIDE_ROUND 0x80 ++#define STRIDE_OR 0x80 ++#else ++#define ALLOC_PAD 0 ++#define ALLOC_ROUND 0x1000 ++#define ALLOC_N_OFFSET 0 ++#define STRIDE_ROUND 32 ++#define STRIDE_OR 0 ++#endif ++ ++#define DEBUG_ZAP0_BUFFERS 0 ++ ++ ++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size) +{ + ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt)); + ++ // Round up to 4k & add 4k ++ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); ++ + if (zp == NULL) { + av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); + goto fail0; + } + -+ if (gpu_malloc_cached(size, &zp->gmem) != 0) ++ if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0) + { -+ 
av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size); ++ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); + goto fail1; + } + ++#if TRACE_ALLOC ++ printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); ++#endif ++ ++ pool->numbytes = zp->gmem.numbytes; + zp->next = NULL; + zp->pool = pool; ++ zp->n = pool->n++; + return zp; + +fail1: @@ -15024,6 +17896,10 @@ index 0000000..9580165 + +static void zc_pool_ent_free(ZcPoolEnt * const zp) +{ ++#if TRACE_ALLOC ++ printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); ++#endif ++ + gpu_free(&zp->gmem); + av_free(zp); +} @@ -15032,6 +17908,8 @@ index 0000000..9580165 +{ + ZcPoolEnt * p = pool->head; + pool->head = NULL; ++ pool->numbytes = -1; ++ + while (p != NULL) + { + ZcPoolEnt * const zp = p; @@ -15040,15 +17918,21 @@ index 0000000..9580165 + } +} + -+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes) ++static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes) +{ + ZcPoolEnt * zp; ++ int numbytes; ++ + pthread_mutex_lock(&pool->lock); + -+ if (numbytes != pool->numbytes) ++ numbytes = pool->numbytes; ++ ++ // If size isn't close then dump the pool ++ // Close in this context means within 128k ++ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) + { + zc_pool_flush(pool); -+ pool->numbytes = numbytes; ++ numbytes = req_bytes; + } + + if (pool->head != NULL) @@ -15062,6 +17946,10 @@ index 0000000..9580165 + } + + pthread_mutex_unlock(&pool->lock); ++ ++ // Start with our buffer empty of preconceptions ++// rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE); ++ + return zp; +} + @@ -15071,6 +17959,10 @@ index 0000000..9580165 + if (zp != NULL) + { + pthread_mutex_lock(&pool->lock); ++#if TRACE_ALLOC ++ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes); ++#endif ++ + if (pool->numbytes == zp->gmem.numbytes) + { + zp->next = pool->head; @@ -15101,10 +17993,18 @@ index 0000000..9580165 + pthread_mutex_destroy(&pool->lock); +} + ++typedef struct ZcOldCtxVals ++{ ++ int thread_safe_callbacks; ++ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); ++ void * get_buffer_context; ++} ZcOldCtxVals; + +typedef struct AVZcEnv +{ ++ unsigned int refcount; + ZcPool pool; ++ ZcOldCtxVals old; +} ZcEnv; + +// Callback when buffer unrefed to zero @@ -15124,28 +18024,94 @@ index 0000000..9580165 +} + +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const unsigned int video_width, const unsigned int video_height) ++ const int format, const unsigned int video_width, const unsigned int video_height) +{ + AVRpiZcFrameGeometry geo; -+ geo.stride_y = (video_width + 32 + 31) & ~31; -+ geo.stride_c = geo.stride_y / 2; -+// geo.height_y = (video_height + 15) & ~15; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; ++ ++ switch (format) ++ { ++ case AV_PIX_FMT_YUV420P: ++ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ // geo.stride_y = ((video_width + 32 + 31) & ~31); ++ geo.stride_c = geo.stride_y / 2; ++ // geo.height_y = (video_height + 15) & ~15; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ break; ++ ++ case AV_PIX_FMT_SAND128: ++ { ++ const unsigned int stripe_w = 128; ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the 
overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ ++ pthread_mutex_unlock(&sand_lock); ++ ++ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); ++ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); ++ break; ++ } ++ ++ default: ++ memset(&geo, 0, sizeof(geo)); ++ break; ++ } + return geo; +} + ++ +static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size) +{ + ZcPoolEnt *const zp = zc_pool_alloc(pool, size); + AVBufferRef * buf; ++ intptr_t idata = (intptr_t)zp->gmem.arm; ++#if ALLOC_N_OFFSET != 0 ++ intptr_t noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1); ++#endif + + if (zp == NULL) { + av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); + goto fail0; + } + -+ if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) ++#if ALLOC_N_OFFSET != 0 ++ idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0); ++#endif ++ ++#if DEBUG_ZAP0_BUFFERS ++ memset((void*)idata, 0, size); ++#endif ++ ++ if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n"); + goto fail2; @@ -15159,13 +18125,12 @@ index 0000000..9580165 + return NULL; +} + -+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame) ++static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame) +{ -+ ZcEnv *const zc = s->get_buffer_context; -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height); ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); + const unsigned int size_y = geo.stride_y * geo.height_y; + const unsigned int size_c = geo.stride_c * geo.height_c; -+ const unsigned int size_pic = size_y + size_c * 2; ++ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; + AVBufferRef * buf; + unsigned int i; + @@ -15173,7 +18138,7 @@ index 0000000..9580165 + + if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL) + { -+ av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); ++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); + return AVERROR(ENOMEM); + } + @@ -15184,19 +18149,24 @@ index 0000000..9580165 + } + + frame->buf[0] = buf; ++ + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + frame->linesize[2] = geo.stride_c; ++ if (geo.stripes > 1) ++ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; -+ frame->data[2] = frame->data[1] + size_c; ++ if (geo.planes_c > 1) ++ frame->data[2] = 
frame->data[1] + size_c; ++ + frame->extended_data = frame->data; + // Leave extended buf alone + + return 0; +} + -+ +#define RPI_GET_BUFFER2 1 + +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) @@ -15206,21 +18176,25 @@ index 0000000..9580165 +#else + int rv; + -+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 || -+ frame->format != AV_PIX_FMT_YUV420P) ++ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) + { +// printf("Do default alloc: format=%#x\n", frame->format); + rv = avcodec_default_get_buffer2(s, frame, flags); + } ++ else if (frame->format == AV_PIX_FMT_YUV420P || ++ frame->format == AV_PIX_FMT_SAND128) ++ { ++ rv = rpi_get_display_buffer(s->get_buffer_context, frame); ++ } + else + { -+ rv = rpi_get_display_buffer(s, frame); ++ rv = avcodec_default_get_buffer2(s, frame, flags); + } + +#if 0 -+ printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, -+ frame->width, frame->height, -+ frame->linesize[0], frame->linesize[1], frame->linesize[2], ++ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, ++ frame->format, frame->width, frame->height, ++ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], + frame->data[0], frame->data[1], frame->data[2], + frame->buf[0], frame->buf[1], frame->buf[2], + av_buffer_get_opaque(frame->buf[0])); @@ -15241,7 +18215,7 @@ index 0000000..9580165 + dest->width = src->width; + dest->height = src->height; + -+ if (rpi_get_display_buffer(s, dest) != 0) ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) + { + return NULL; + } @@ -15274,14 +18248,16 @@ index 0000000..9580165 +{ + assert(s != NULL); + -+ if (frame->format != AV_PIX_FMT_YUV420P) ++ if (frame->format != AV_PIX_FMT_YUV420P && ++ frame->format != AV_PIX_FMT_SAND128) + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format); ++ av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + + if (frame->buf[1] != NULL) + { ++ av_assert0(frame->format == AV_PIX_FMT_YUV420P); + if (maycopy) + { + av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); @@ -15317,6 +18293,18 @@ index 0000000..9580165 + return p == NULL ? -1 : p->vc_handle; +} + ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? 0 : fr_ref->data - p->arm; ++} ++ ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) ++{ ++ return fr_ref == NULL ? 
0 : fr_ref->size; ++} ++ ++ +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) +{ + const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); @@ -15353,27 +18341,50 @@ index 0000000..9580165 + } +} + ++int av_rpi_zc_in_use(const struct AVCodecContext * const s) ++{ ++ return s->get_buffer2 == av_rpi_zc_get_buffer2; ++} ++ +int av_rpi_zc_init(struct AVCodecContext * const s) +{ -+ ZcEnv * const zc = av_rpi_zc_env_alloc(); -+ if (zc == NULL) ++ if (av_rpi_zc_in_use(s)) + { -+ return AVERROR(ENOMEM); ++ ZcEnv * const zc = s->get_buffer_context; ++ ++zc->refcount; + } ++ else ++ { ++ ZcEnv *const zc = av_rpi_zc_env_alloc(); ++ if (zc == NULL) ++ { ++ return AVERROR(ENOMEM); ++ } + -+ s->get_buffer_context = zc; -+ s->get_buffer2 = av_rpi_zc_get_buffer2; ++ zc->refcount = 1; ++ zc->old.get_buffer_context = s->get_buffer_context; ++ zc->old.get_buffer2 = s->get_buffer2; ++ zc->old.thread_safe_callbacks = s->thread_safe_callbacks; ++ ++ s->get_buffer_context = zc; ++ s->get_buffer2 = av_rpi_zc_get_buffer2; ++ s->thread_safe_callbacks = 1; ++ } + return 0; +} + +void av_rpi_zc_uninit(struct AVCodecContext * const s) +{ -+ if (s->get_buffer2 == av_rpi_zc_get_buffer2) ++ if (av_rpi_zc_in_use(s)) + { + ZcEnv * const zc = s->get_buffer_context; -+ s->get_buffer2 = avcodec_default_get_buffer2; -+ s->get_buffer_context = NULL; -+ av_rpi_zc_env_free(zc); ++ if (--zc->refcount == 0) ++ { ++ s->get_buffer2 = zc->old.get_buffer2; ++ s->get_buffer_context = zc->old.get_buffer_context; ++ s->thread_safe_callbacks = zc->old.thread_safe_callbacks; ++ av_rpi_zc_env_free(zc); ++ } + } +} + @@ -15381,19 +18392,19 @@ index 0000000..9580165 + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..f0109f4 +index 0000000..f4aeb78 --- /dev/null +++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,83 @@ +@@ -0,0 +1,137 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + +// Zero-Copy frame code for RPi +// RPi needs Y/U/V planes to be contiguous for display. By default +// ffmpeg will allocate separated planes so a memcpy is needed before -+// display. This code prodes a method a making ffmpeg allocate a single -+// bit of memory for the frame when can then be refrence counted until -+// display ahs finsihed with it. ++// display. This code provides a method a making ffmpeg allocate a single ++// bit of memory for the frame when can then be reference counted until ++// display has finished with it. + +#include "libavutil/frame.h" +#include "libavcodec/avcodec.h" @@ -15410,10 +18421,13 @@ index 0000000..f0109f4 + unsigned int height_y; + unsigned int stride_c; + unsigned int height_c; ++ unsigned int planes_c; ++ unsigned int stripes; +} AVRpiZcFrameGeometry; + + +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( ++ const int format, + const unsigned int video_width, const unsigned int video_height); + +// Replacement fn for avctx->get_buffer2 @@ -15422,7 +18436,7 @@ index 0000000..f0109f4 +// N.B. in addition to to setting avctx->get_buffer2, avctx->refcounted_frames +// must be set to 1 as otherwise the buffer info is killed before being returned +// by avcodec_decode_video2. Note also that this means that the AVFrame that is -+// return must be manually derefed with av_frame_unref. This should be done ++// returned must be manually derefed with av_frame_unref. This should be done +// after av_rpi_zc_ref has been called. 
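++//
++// Rough usage sketch (illustrative only - render_frame() is a placeholder
++// for whatever hands the VideoCore handle to the display side):
++//
++//   av_rpi_zc_init(avctx);
++//   avctx->refcounted_frames = 1;
++//   avcodec_decode_video2(avctx, frame, &got_frame, &pkt);
++//   if (got_frame) {
++//       AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, 1);
++//       av_frame_unref(frame);   // safe: ref now keeps the buffer alive
++//       render_frame(av_rpi_zc_vc_handle(ref), av_rpi_zc_offset(ref),
++//                    av_rpi_zc_length(ref));
++//       av_rpi_zc_unref(ref);    // only once display has finished
++//   }
++//   av_rpi_zc_uninit(avctx);
++//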
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags); + @@ -15439,6 +18453,11 @@ index 0000000..f0109f4 +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref); ++// Get offset from the start of the memory referenced ++// by the vc_handle to valid data ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref); ++// Length of buffer data ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref); +// Get the number of bytes allocated from the frame ref +// Returns 0 if ref doesn't look valid +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref); @@ -15455,6 +18474,8 @@ index 0000000..f0109f4 +// Allocate the environment used by the ZC code +void av_rpi_zc_env_free(AVZcEnvPtr); + ++// Test to see if the context is using zc (checks get_buffer2) ++int av_rpi_zc_in_use(const struct AVCodecContext * const s); + +// Init ZC into a context +// There is nothing magic in this fn - it just packages setting @@ -15466,10 +18487,54 @@ index 0000000..f0109f4 +// get_buffer2 & get_buffer_context +void av_rpi_zc_uninit(struct AVCodecContext * const s); + ++ ++ ++static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) ++{ ++ return frame->linesize[3]; ++} ++ ++static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ const unsigned int stride1 = frame->linesize[0]; ++ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y + stride2 * x2; ++} ++ ++static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) ++{ ++ const unsigned int stride1 = frame->linesize[0]; ++ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); ++ const unsigned int x = x_c * 2; ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y_c + stride2 * x2; ++} ++ ++static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); ++} ++ ++static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); ++} ++ ++static inline int rpi_sliced_frame(const AVFrame * const frame) ++{ ++ return frame->format == AV_PIX_FMT_SAND128; ++} ++ ++ +#endif + diff --git a/libavcodec/utils.c b/libavcodec/utils.c -index f7adb52..708526e 100644 +index f7adb52..3b398a3 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -26,6 +26,12 @@ @@ -15559,6 +18624,30 @@ index f7adb52..708526e 100644 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? 
NULL : +@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags + { + int ret; + ++#ifdef RPI ++ // This is going to end badly if we let it continue ++ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++#endif ++ + if ((ret = update_frame_pool(avctx, frame)) < 0) + return ret; + +diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c +index 21f8d9e..71ce7b9 100644 +--- a/libavfilter/avfilter.c ++++ b/libavfilter/avfilter.c +@@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) + "options, but options were provided: %s.\n", args); + return AVERROR(EINVAL); + } ++ printf("=== args='%s'\n", args); + + #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR + if ( !strcmp(filter->filter->name, "format") || diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c index b31d233..2767306 100644 --- a/libavformat/mpegts.c @@ -15613,6 +18702,88 @@ index 0c0ce12..82e0bc3 100644 /** * @} */ +diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +index 0dffa4d..5644176 100644 +--- a/libavutil/pixdesc.c ++++ b/libavutil/pixdesc.c +@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | + AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, + }, ++ [AV_PIX_FMT_SAND128] = { ++ .name = "sand128", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ ++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ ++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ ++ }, ++ .flags = 0, ++ } + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index 0ed01c4..4705e80 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -303,7 +303,10 @@ enum AVPixelFormat { + AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian + AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian + +- AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions ++// RPI - not on ifdef so can be got at by calling progs ++ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ ++ AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions + }; + + #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A +diff --git a/libswscale/input.c b/libswscale/input.c +index 14ab5ab..e61b67a 100644 +--- a/libswscale/input.c ++++ b/libswscale/input.c +@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV, + } + } + ++ ++static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV, ++ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, ++ int width, uint32_t *unused) ++{ ++ // NIF ++} ++ + #define input_pixel(pos) (isBE(origin) ? 
AV_RB16(pos) : AV_RL16(pos)) + + static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, +@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) + case AV_PIX_FMT_P010BE: + c->chrToYV12 = p010BEToUV_c; + break; ++ case AV_PIX_FMT_SAND128: ++ c->chrToYV12 = sand128ToUV_c; ++ break; + } + if (c->chrSrcHSubSample) { + switch (srcFormat) { +diff --git a/libswscale/utils.c b/libswscale/utils.c +index 576d8f0..d7206cc 100644 +--- a/libswscale/utils.c ++++ b/libswscale/utils.c +@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { + [AV_PIX_FMT_AYUV64LE] = { 1, 1}, + [AV_PIX_FMT_P010LE] = { 1, 0 }, + [AV_PIX_FMT_P010BE] = { 1, 0 }, ++#ifdef RPI ++ [AV_PIX_FMT_SAND128] = { 1, 0 }, ++#endif + }; + + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) diff --git a/pi-util/conf.sh b/pi-util/conf.sh new file mode 100755 index 0000000..8b596a2 @@ -15652,21 +18823,61 @@ index 0000000..8b596a2 + +# gcc option for getting asm listing +# -Wa,-ahls +diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh +new file mode 100644 +index 0000000..160e149 +--- /dev/null ++++ b/pi-util/conf1.sh +@@ -0,0 +1,34 @@ ++echo "Configure for Pi1" ++ ++RPI_BUILDROOT=`pwd`/build ++RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot ++RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=$RPI_ROOTFS/opt/vc ++#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++#RPI_DEFS="-D__VCCOREVER__=0x04000000" ++RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --cpu=arm1176jzf-s\ ++ --arch=armv\ ++ --disable-neon\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 -index 0000000..61d1399 +index 0000000..fc14f2a --- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 -+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 ++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 +1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 
++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 @@ -15688,7 +18899,7 @@ index 0000000..61d1399 +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 @@ -15728,7 +18939,7 @@ index 0000000..61d1399 +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 +1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 @@ -15742,10 +18953,10 @@ index 0000000..61d1399 +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 @@ -15774,7 +18985,7 @@ index 0000000..61d1399 +1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 +1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 +1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 +1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 @@ -15783,9 +18994,9 @@ index 0000000..61d1399 +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5 ++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 @@ -15804,10 +19015,10 @@ index 0000000..61d1399 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py new file mode 100644 -index 0000000..38f942f +index 0000000..c896bc6 --- /dev/null +++ b/pi-util/ffconf.py -@@ -0,0 +1,146 @@ +@@ -0,0 +1,154 @@ +#!/usr/bin/env python + +import os @@ -15851,16 +19062,18 @@ index 0000000..38f942f + except: + pass + -+ rv = False + if m1 and m2 and m1.group() == m2.group(): + print >> flog, "Match: " + m1.group() -+ rv = True ++ rv = 0 + elif not m1: + print >> flog, "****** Cannot find m1" ++ rv = 3 + elif not m2: + print >> flog, "****** Cannot find m2" ++ rv = 2 + else: + print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group() ++ rv = 1 + flog.close() + return rv + @@ -15906,19 +19119,25 @@ index 0000000..38f942f + print "==== ", name, + sys.stdout.flush() + -+ if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) : -+ if exp_test == 1: -+ failures.append(name) -+ print ": * FAIL *" -+ else: -+ print ": fail" -+ else: ++ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) ++ if (rv == 0): + if exp_test == 2: + print ": * OK *" + unx_success.append(name) + else: + print ": ok" -+ ++ elif exp_test > 1 and rv == 1: ++ print ": fail" ++ else: ++ failures.append(name) ++ if rv == 1: ++ print ": * FAIL *" ++ elif (rv == 2) : ++ print ": * CRASH *" ++ elif (rv == 3) : ++ print ": * MD5 MISSING *" ++ else : ++ print ": * BANG *" + + if failures or unx_success: + print "Unexpected Failures:", failures @@ -18462,6 +21681,21 @@ index 0000000..1eacc04 + +if __name__ == '__main__': + main() +diff --git a/pi-util/qem.sh b/pi-util/qem.sh +new file mode 100644 +index 0000000..47dd071 +--- /dev/null ++++ b/pi-util/qem.sh +@@ -0,0 +1,9 @@ ++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex ++QASM=python\ pi-util/qasm.py ++SRC_FILE=libavcodec/rpi_shader.qasm ++DST_BASE=shader ++ ++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR ++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c ++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h ++ diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py new file mode 100755 index 0000000..6a9a33f @@ -18554,3 +21788,137 @@ index 0000000..d8bdd91 +pi-util/rebase_liblinks.py $DST + + +diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py +new file mode 100644 +index 0000000..5935a11 +--- /dev/null ++++ b/pi-util/v3dusage.py +@@ -0,0 +1,128 @@ ++#!/usr/bin/env python ++ ++import sys ++import argparse ++import re ++ ++def do_logparse(logname): ++ ++ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ') ++ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$') ++ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$') ++ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$') ++ ++ ttotal = {'idle':0.0} ++ tstart = {} ++ qctotal = {} ++ qtstotal = {} ++ l2hits = {} ++ l2total = {} ++ time0 = None ++ idle_start = None ++ qpu_op_no = 0 ++ op_count = 0 ++ ++ with open(logname, "rt") as infile: ++ for line in infile: ++ match = rmatch.match(line) ++ if match: ++# print match.group(1), ":", match.group(2), ":", match.group(3), ":", 
match.group(7), ":" ++ time = float(match.group(1)) ++ unit = match.group(3) ++ opstart = not match.group(2) ++ optype = match.group(7) ++ hascb = match.group(8) != "0" ++ ++ if unit == 'qpu1': ++ unit = unit + "." + str(qpu_op_no) ++ if not opstart: ++ if hascb or optype == 'EXECUTE_SYNC': ++ qpu_op_no = 0 ++ else: ++ qpu_op_no += 1 ++ ++ # Ignore sync type ++ if optype == 'EXECUTE_SYNC': ++ continue ++ ++ if not time0: ++ time0 = time ++ ++ if opstart: ++ tstart[unit] = time; ++ elif unit in tstart: ++ op_count += 1 ++ if not unit in ttotal: ++ ttotal[unit] = 0.0 ++ ttotal[unit] += time - tstart[unit] ++ del tstart[unit] ++ ++ if not idle_start and not tstart: ++ idle_start = time ++ elif idle_start and tstart: ++ ttotal['idle'] += time - idle_start ++ idle_start = None ++ ++ match = rqcycle.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in qctotal: ++ qctotal[unit] = 0 ++ qctotal[unit] += int(match.group(2)) ++ ++ match = rqtscycle.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in qtstotal: ++ qtstotal[unit] = 0 ++ qtstotal[unit] += int(match.group(2)) ++ ++ match = rl2hits.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in l2total: ++ l2total[unit] = 0 ++ l2hits[unit] = 0 ++ l2total[unit] += int(match.group(3)) ++ if match.group(2) == "hits": ++ l2hits[unit] += int(match.group(3)) ++ ++ ++ if not time0: ++ print "No v3d profile records found" ++ else: ++ tlogged = time - time0 ++ ++ print "Logged time:", tlogged, " Op count:", op_count ++ for unit in sorted(ttotal): ++ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged) ++ print ++ for unit in sorted(qctotal): ++ if not unit in qtstotal: ++ qtstotal[unit] = 0; ++ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit]) ++ if unit in l2total: ++ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit]) ++ ++ ++ ++if __name__ == '__main__': ++ argp = argparse.ArgumentParser( ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ description="QPU/VPU perf summary from VC logging", ++ epilog = """ ++Will also summarise TMU stalls if logging requests set in qpu noflush param ++in the profiled code. ++ ++Example use: ++ vcgencmd set_logging level=0xc0 ++ ++ sudo vcdbg log msg >& t.log ++ v3dusage.py t.log ++""") ++ ++ argp.add_argument("logfile") ++ args = argp.parse_args() ++ ++ do_logparse(args.logfile) ++ diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1008-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1008-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch index 721a065449..5240cf58ce 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1008-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1008-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch @@ -22,4 +22,3 @@ index 2fd3f2b..7165652 100644 if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) { *poutbuf = NULL; *poutbuf_size = 0; -
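
A footnote on the SAND128 layout the patch introduces (AV_PIX_FMT_SAND128 /
MMAL_ENCODING_YUVUV128): the frame is stored as full-height vertical stripes,
128 bytes wide, with linesize[3] abused to carry the stripe stride. The
standalone C below mirrors the rpi_sliced_frame_off_y() arithmetic from
rpi_zc.h - a sketch for illustration only (sand_off_y, main and the worked
numbers are not part of the patch):

#include <stdio.h>

/* stride1 = stripe width in bytes (128 for SAND128);
 * stride2 = rows per stripe (linesize[3] in the helpers above), so
 * stride2 * 128 is the byte distance from one stripe to the next. */
static unsigned int sand_off_y(unsigned int stride1, unsigned int stride2,
                               unsigned int x, unsigned int y)
{
    unsigned int x1 = x & (stride1 - 1); /* byte column within the stripe */
    unsigned int x2 = x ^ x1;            /* stripe index * stripe width   */
    return x1 + stride1 * y + stride2 * x2;
}

int main(void)
{
    /* A 1920-wide luma plane has 15 stripes; stride2 comes from the
     * mailbox geometry (1088 would be a plausible value for 1080 rows).
     * Pixel (300, 10): x1 = 44, x2 = 256 -> stripe 2, column 44, row 10. */
    printf("offset = %u\n", sand_off_y(128, 1088, 300, 10));
    return 0;
}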