diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch index f9b7f1bd34..2786d22397 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch @@ -1,7 +1,7 @@ -diff --git b/.gitignore a/.gitignore +diff --git a/.gitignore b/.gitignore index 524fb73..305632b 100644 ---- b/.gitignore -+++ a/.gitignore +--- a/.gitignore ++++ b/.gitignore @@ -23,6 +23,7 @@ .\#* /.config @@ -10,81 +10,10 @@ index 524fb73..305632b 100644 /ffmpeg /ffplay /ffprobe -diff --git b/Changelog a/Changelog -index 6f023a9..ad53c9d 100644 ---- b/Changelog -+++ a/Changelog -@@ -1,7 +1,7 @@ - Entries are sorted chronologically from oldest to youngest within each release, - releases are sorted from youngest to oldest. - --version 3.3: -+version : - - CrystalHD decoder moved to new decode API - - add internal ebur128 library, remove external libebur128 dependency - - Pro-MPEG CoP #3-R2 FEC protocol -@@ -22,7 +22,6 @@ version 3.3: - - threshold filter - - midequalizer filter - - Optimal Huffman tables for (M)JPEG encoding --- VAAPI-accelerated MPEG-2 and VP8 encoding - - FM Screen Capture Codec decoder - - native Opus encoder - - ScreenPressor decoder -@@ -33,7 +32,6 @@ version 3.3: - - Removed the legacy X11 screen grabber, use XCB instead - - MPEG-7 Video Signature filter - - Removed asyncts filter (use af_aresample instead) --- Intel QSV-accelerated VP8 video decoding - - - version 3.2: -@@ -121,6 +119,7 @@ version 3.1: - - libutvideo wrapper removed - - YUY2 Lossless Codec decoder - - VideoToolbox H.264 encoder -+- VAAPI-accelerated MPEG-2 and VP8 encoding - - - version 3.0: -diff --git b/RELEASE_NOTES a/RELEASE_NOTES -new file mode 100644 -index 0000000..c3ec010 ---- /dev/null -+++ a/RELEASE_NOTES -@@ -0,0 +1,15 @@ -+ -+ ┌────────────────────────────────────────┐ -+ │ RELEASE NOTES for FFmpeg 3.2 "Hypatia" │ -+ └────────────────────────────────────────┘ -+ -+ The FFmpeg Project proudly presents FFmpeg 3.2 "Hypatia", about 4 -+ months after the release of FFmpeg 3.1. -+ -+ A complete Changelog is available at the root of the project, and the -+ complete Git history on http://source.ffmpeg.org. -+ -+ We hope you will like this release as much as we enjoyed working on it, and -+ as usual, if you have any questions about it, or any FFmpeg related topic, -+ feel free to join us on the #ffmpeg IRC channel (on irc.freenode.net) or ask -+ on the mailing-lists. -diff --git b/doc/Doxyfile a/doc/Doxyfile -index 0891899..8f855f8 100644 ---- b/doc/Doxyfile -+++ a/doc/Doxyfile -@@ -38,7 +38,7 @@ PROJECT_NAME = FFmpeg - # could be handy for archiving the generated documentation or if some version - # control system is used. 
- --PROJECT_NUMBER = -+PROJECT_NUMBER = 3.2 - - # Using the PROJECT_BRIEF tag one can provide an optional one line description - # for a project that appears at the top of each page and should give viewer a -diff --git b/ffmpeg.c a/ffmpeg.c -index 11faf0d..494c23d 100644 ---- b/ffmpeg.c -+++ a/ffmpeg.c +diff --git a/ffmpeg.c b/ffmpeg.c +index 4b4dae4..9a7c29c 100644 +--- a/ffmpeg.c ++++ b/ffmpeg.c @@ -23,6 +23,11 @@ * multimedia converter based on the FFmpeg libraries */ @@ -97,7 +26,7 @@ index 11faf0d..494c23d 100644 #include "config.h" #include #include -@@ -68,6 +73,25 @@ +@@ -69,6 +74,25 @@ # include "libavfilter/buffersrc.h" # include "libavfilter/buffersink.h" @@ -123,7 +52,7 @@ index 11faf0d..494c23d 100644 #if HAVE_SYS_RESOURCE_H #include #include -@@ -164,6 +188,174 @@ static int restore_tty; +@@ -165,6 +189,182 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -171,7 +100,7 @@ index 11faf0d..494c23d 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h) ++static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) +{ + MMAL_COMPONENT_T* display; + MMAL_DISPLAYREGION_T region = @@ -182,7 +111,7 @@ index 11faf0d..494c23d 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h); ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); + + bcm_host_init(); // TODO is this needed? + mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); @@ -192,7 +121,7 @@ index 11faf0d..494c23d 100644 + + { + MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = MMAL_ENCODING_I420; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; + format->es->video.height = geo.height_y; + format->es->video.crop.x = 0; @@ -209,7 +138,7 @@ index 11faf0d..494c23d 100644 + mmal_port_enable(display->input[0],display_cb_input); + mmal_port_enable(display->control,display_cb_control); + -+ printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y); ++ printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + + return display; +} @@ -249,7 +178,14 @@ index 11faf0d..494c23d 100644 + buf->offset = av_rpi_zc_offset(fr_buf); + buf->length = av_rpi_zc_length(fr_buf); + buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+ ++#if 0 ++ { ++ unsigned int n; ++ for (n = 0; n < fr->width; n += 128) { ++ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ } ++ } ++#endif + ++rpi_display_count; +} +#else @@ -284,6 +220,7 @@ index 11faf0d..494c23d 100644 + +static void display_exit(MMAL_COMPONENT_T* display) +{ ++// sleep(120); + if (display) { + mmal_component_destroy(display); + } @@ -298,7 +235,7 @@ index 11faf0d..494c23d 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. 
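
The preview path added above is driven from do_video_out() further on. As a
rough sketch of the intended calling pattern (illustrative only, reusing the
names introduced in this hunk; not itself part of the patch):

    /* Hand one decoded frame to the MMAL renderer. display_frame() wraps
     * the frame's GPU buffer in an MMAL buffer header via the av_rpi_zc_*
     * helpers, so the display is assumed to hold its own reference on the
     * buffer; display_cb_input() drops that reference when the VPU hands
     * the header back. */
    static void preview_frame(InputStream *ist, AVFrame *frame)
    {
        if (!rpi_display)   /* created lazily, sized from the first frame */
            rpi_display = display_init(frame->format, 0, 0,
                                       frame->width, frame->height);
        display_frame(ist->dec_ctx, rpi_display, frame);
    }
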
-@@ -575,6 +767,11 @@ static void ffmpeg_cleanup(int ret) +@@ -576,6 +776,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } @@ -310,7 +247,7 @@ index 11faf0d..494c23d 100644 for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -587,6 +784,9 @@ static void ffmpeg_cleanup(int ret) +@@ -588,6 +793,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->hwaccel_device); av_freep(&ist->dts_buffer); @@ -320,7 +257,7 @@ index 11faf0d..494c23d 100644 avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -617,6 +817,7 @@ static void ffmpeg_cleanup(int ret) +@@ -618,6 +826,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -328,7 +265,7 @@ index 11faf0d..494c23d 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -1050,6 +1251,15 @@ static void do_video_out(OutputFile *of, +@@ -1053,6 +1262,15 @@ static void do_video_out(OutputFile *of, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; @@ -336,7 +273,7 @@ index 11faf0d..494c23d 100644 + if (next_picture && ist != NULL) + { + if (!rpi_display) -+ rpi_display = display_init(0,0,next_picture->width,next_picture->height); ++ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); + display_frame(ist->dec_ctx, rpi_display, next_picture); + } +#endif @@ -344,7 +281,7 @@ index 11faf0d..494c23d 100644 frame_rate = av_buffersink_get_frame_rate(filter); if (frame_rate.num > 0 && frame_rate.den > 0) duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base)); -@@ -2873,6 +3083,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2884,6 +3102,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; @@ -357,23 +294,24 @@ index 11faf0d..494c23d 100644 ist->dec_ctx->thread_safe_callbacks = 1; av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); -diff --git b/libavcodec/Makefile a/libavcodec/Makefile -index 0dd0c7b..d2eb014 100644 ---- b/libavcodec/Makefile -+++ a/libavcodec/Makefile -@@ -5,6 +5,11 @@ NAME = avcodec +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 0dd0c7b..b9732c5 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -5,6 +5,12 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ + rpi_qpu.h \ + rpi_shader.h \ ++ rpi_shader_cmd.h \ + rpi_mailbox.h \ + rpi_hevc_transform.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -47,6 +52,10 @@ OBJS = allcodecs.o \ +@@ -47,6 +53,10 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ @@ -384,36 +322,26 @@ index 0dd0c7b..d2eb014 100644 vorbis_parser.o \ xiph.o \ -@@ -973,8 +982,7 @@ OBJS-$(CONFIG_AAC_ADTSTOASC_BSF) += aac_adtstoasc_bsf.o aacadtsdec.o \ - OBJS-$(CONFIG_CHOMP_BSF) += chomp_bsf.o - OBJS-$(CONFIG_DUMP_EXTRADATA_BSF) += dump_extradata_bsf.o - OBJS-$(CONFIG_DCA_CORE_BSF) += dca_core_bsf.o --OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF) += extract_extradata_bsf.o \ -- h2645_parse.o -+OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF) += extract_extradata_bsf.o - OBJS-$(CONFIG_H264_MP4TOANNEXB_BSF) += h264_mp4toannexb_bsf.o - OBJS-$(CONFIG_HEVC_MP4TOANNEXB_BSF) += hevc_mp4toannexb_bsf.o - OBJS-$(CONFIG_IMX_DUMP_HEADER_BSF) += imx_dump_header_bsf.o -@@ -1103,3 +1111,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1103,3 +1113,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: 
$(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + -+QASM := $(SUBDIR)../pi-util/qasm.py ++QASM_PY := ../local/bin/qasm.py + -+ifneq ("$(wildcard $(QASM))","") ++ifneq ("$(wildcard $(QASM_PY))","") +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +endif + -+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h -diff --git b/libavcodec/allcodecs.c a/libavcodec/allcodecs.c ++$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h +diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 4df4772..ca05158 100644 ---- b/libavcodec/allcodecs.c -+++ a/libavcodec/allcodecs.c +--- a/libavcodec/allcodecs.c ++++ b/libavcodec/allcodecs.c @@ -696,6 +696,7 @@ static void register_all(void) REGISTER_PARSER(H261, h261); REGISTER_PARSER(H263, h263); @@ -422,26 +350,29 @@ index 4df4772..ca05158 100644 REGISTER_PARSER(HEVC, hevc); REGISTER_PARSER(MJPEG, mjpeg); REGISTER_PARSER(MLP, mlp); -diff --git b/libavcodec/arm/Makefile a/libavcodec/arm/Makefile -index 1eeac54..f96f93b 100644 ---- b/libavcodec/arm/Makefile -+++ a/libavcodec/arm/Makefile -@@ -135,8 +135,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index 1eeac54..a94a240 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -134,9 +134,13 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ ++ arm/hevc_misc_neon.o \ arm/hevcdsp_deblock_neon.o \ + arm/hevcdsp_epel_neon.o \ arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_qpel_neon.o ++ arm/hevcdsp_cres_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o -diff --git b/libavcodec/arm/cabac.h a/libavcodec/arm/cabac.h +diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h index fdbf86b..0a3980a 100644 ---- b/libavcodec/arm/cabac.h -+++ a/libavcodec/arm/cabac.h +--- a/libavcodec/arm/cabac.h ++++ b/libavcodec/arm/cabac.h @@ -26,13 +26,34 @@ #include "libavutil/internal.h" #include "libavcodec/cabac.h" @@ -620,11 +551,11 @@ index fdbf86b..0a3980a 100644 #endif /* HAVE_ARMV6T2_INLINE */ #endif /* AVCODEC_ARM_CABAC_H */ -diff --git b/libavcodec/arm/hevc_cabac.h a/libavcodec/arm/hevc_cabac.h +diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h new file mode 100644 index 0000000..31d3c59 --- /dev/null -+++ a/libavcodec/arm/hevc_cabac.h ++++ b/libavcodec/arm/hevc_cabac.h @@ -0,0 +1,491 @@ +/* + * This file is part of FFmpeg. 
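
Before the assembler below, one reference point: the hot operation these
headers specialise is CABAC bin decoding, whose bypass case looks roughly
like this in portable C (cf. get_cabac_bypass() in
libavcodec/cabac_functions.h; a sketch for orientation, not code from this
patch):

    static inline int get_cabac_bypass_ref(CABACContext *c)
    {
        int range;
        c->low += c->low;            /* shift one bit into the offset  */
        if (!(c->low & CABAC_MASK))  /* bit reservoir exhausted ...    */
            refill(c);               /* ... pull in CABAC_BITS more    */
        range = c->range << (CABAC_BITS + 1);
        if (c->low < range)
            return 0;
        c->low -= range;
        return 1;
    }

The inline-asm variants aim to trim the per-bin cost of exactly this kind of
shift/compare/renormalise loop.
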
@@ -1117,18 +1048,873 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ -diff --git b/libavcodec/arm/hevcdsp_deblock_neon.S a/libavcodec/arm/hevcdsp_deblock_neon.S -index 166bddb..a088cc3 100644 ---- b/libavcodec/arm/hevcdsp_deblock_neon.S -+++ a/libavcodec/arm/hevcdsp_deblock_neon.S -@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 +diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S +new file mode 100644 +index 0000000..373576b +--- /dev/null ++++ b/libavcodec/arm/hevc_misc_neon.S +@@ -0,0 +1,62 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ rpi_zap_coeff_vals_neon( ++@ uint16_t * buf, [r0] ++@ unsigned int log_n_m2) [r1] ++ ++function rpi_zap_coeff_vals_neon, export=1 ++ vmov.i64 q8, #0 ++ adr r12, zc_tab ++ vmov.i64 q9, #0 ++ tst r0, #63 ++ vmov.i64 q10, #0 ++ add r0, #63 ++ vmov.i64 q11, #0 ++ and r0, #~63 ++ ldr pc, [r12, r1, lsl #2] ++ ++zc_tab: ++ .word zc_lc2 ++ .word zc_lc3 ++ .word zc_lc4 ++ .word zc_lc5 ++ ++@ 4*4*2: "32 bytes" 64 or 0 depending on dst address ++zc_lc2: ++ it eq ++ vstmeq r0, {q8-q11} ++ bx lr ++ ++@ 16*16*2 = 512 = 64 * 8 ++zc_lc4: ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++@ 8*8*2 = 128 ++zc_lc3: ++ vstm r0!, {q8-q11} ++ vstm r0, {q8-q11} ++ bx lr ++ ++@ 32*32*2 = 2048 = 128 * 16 ++zc_lc5: ++ vmov.i64 q12, #0 ++ vmov.i64 q13, #0 ++ vmov.i64 q14, #0 ++ vmov.i64 q15, #0 ++ mov r2, #4 ++1: ++ vstm r0!, {q8-q15} ++ subs r2, #1 ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ bne 1b ++ bx lr ++ ++endfunc ++ +diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S +new file mode 100644 +index 0000000..880b26e +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_cres_neon.S +@@ -0,0 +1,275 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ General notes: ++@ ++@ Residual is only guaranteed to be cliped to 16 bits ++@ This means that we do need to do movul, qadd, qmovun ++@ rather than addw, qmovun (if we were clipped to 15 then we could get away ++@ with this) ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_u_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q0, q1}, [r1] ++ vmov.i64 q2, #0 ++ vmov.i64 q3, #0 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_u_neon_8, export=1 ++ mov r12, #4 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! 
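++@ vld2.8 above has de-interleaved the CbCr rows: U bytes landed in
++@ d16/d18, V bytes in d17/d19. Only the U halves are widened and
++@ offset below; d17/d19 are stored back untouched by the vst2.8.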
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ sub r0, r2 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d18, q1 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_u_neon_8, export=1 ++ mov r12, #16 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q1 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_v_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q2, q3}, [r1] ++ vmov.i64 q0, #0 ++ vmov.i64 q1, #0 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_v_neon_8, export=1 ++ mov r12, #4 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d17 ++ sub r0, r2 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d17, q0 ++ vqmovun.s16 d19, q1 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_v_neon_8, export=1 ++ mov r12, #16 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! 
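++@ After vld2.8, q8 carries the U lanes and q9 the V lanes; only q9
++@ (d18/d19) is widened and offset below, U passes through unchanged.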
++ subs r12, #1 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d18, q0 ++ vqmovun.s16 d19, q1 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_c_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_c_neon_8, export=1 ++ mov r12, #8 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.8 {d16, d17}, [r0, :128] ++ vld1.16 {q0}, [r1, :128]! ++ vld1.16 {q1}, [r3, :128]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst2.8 {d0, d1}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_c_neon_8, export=1 ++ mov r12, #16 ++ add r3, r1, #(16*16*2) @ Offset to V ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ vld1.16 {q2, q3}, [r3, :256]! 
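++@ Interleaved variant: U residuals stream from r1, V residuals from
++@ r3 (set to res + 16*16*2 above), so both planes of the CbCr row
++@ are updated in one pass.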
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst2.8 {q0, q1}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ 32x32 chroma never occurs so NIF ++ ++@ ============================================================================ ++ ++ +diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S +index 166bddb..9bd0a42 100644 +--- a/libavcodec/arm/hevcdsp_deblock_neon.S ++++ b/libavcodec/arm/hevcdsp_deblock_neon.S +@@ -15,7 +15,7 @@ + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1 + */ + + +@@ -31,6 +31,9 @@ + bxeq lr + .endm + ++@ Uses: d2, d4, d18, d19 ++@ Returns: d2, d4 ++@ Modifies: d0-d7, d22-d25 + .macro hevc_loop_filter_chroma_body + vsubl.u8 q3, d4, d2 + vsubl.u8 q11, d18, d19 +@@ -49,6 +52,33 @@ + vqmovun.s16 d4, q2 + .endm + ++ ++@ Uses r2[0:7], r2[8:15] ++@ Modifies: d0-d7, d22-d25 ++.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 ++ vsubl.u8 q3, \Q0, \P0 ++ vsubl.u8 q11, \P1, \Q1 ++ vshl.i16 q3, #2 ++ vadd.i16 q11, q3 ++ ++ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) ++ vdup.16 d0, r2 ++ vmovl.u8 q0, d0 ++ vuzp.16 d0, d1 ++ ++ vrshr.s16 q11, q11, #3 ++ vneg.s16 q12, q0 ++ vmovl.u8 q2, \Q0 ++ vmin.s16 q11, q11, q0 ++ vmax.s16 q11, q11, q12 ++ vaddw.u8 q1, q11, \P0 ++ vsub.i16 q2, q11 ++ vqmovun.s16 \P0, q1 ++ vqmovun.s16 \Q0, q2 ++.endm ++ ++ ++ + .macro hevc_loop_filter_luma_start + ldr r12, [r3] + ldr r3, [r3, #4] +@@ -60,15 +90,17 @@ + lsr r3, #16 + .endm + +-.macro hevc_loop_filter_luma_body ++@ Uses: r2, r3, r12 ++@ Modifies: r5, r6, r7, r8, r9 ++function hevc_loop_filter_luma_body ++ vmovl.u8 q15, d23 ++ vmovl.u8 q14, d22 ++ vmovl.u8 q13, d21 ++ vmovl.u8 q12, d20 ++ vmovl.u8 q11, d19 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q9, d17 + vmovl.u8 q8, d16 +- vmovl.u8 q9, d18 +- vmovl.u8 q10, d20 +- vmovl.u8 q11, d22 +- vmovl.u8 q12, d24 +- vmovl.u8 q13, d26 +- vmovl.u8 q14, d28 +- vmovl.u8 q15, d30 + + vadd.i16 q7, q9, q11 + vadd.i16 q6, q14, q12 +@@ -77,7 +109,6 @@ + vabd.s16 q7, q7, q10 + vabd.s16 q6, q6, q13 + +- + vdup.16 q0, r2 + vmov q4, q7 + vmov q5, q6 +@@ -152,7 +183,7 @@ + + and r9, r8, r7 + cmp r9, #0 +- beq weakfilter_\@ ++ beq weakfilter_ + + vadd.i16 q2, q11, q12 + vadd.i16 q4, q9, q8 +@@ -210,11 +241,11 @@ + vbit q13, q3, q5 + vbit q14, q2, q5 + +-weakfilter_\@: ++weakfilter_: + mvn r8, r8 + and r9, r8, r7 + cmp r9, #0 +- beq ready_\@ ++ beq ready_ + + vdup.16 q4, r2 + +@@ -275,75 +306,345 @@ weakfilter_\@: + vbit q11, q0, q5 + vbit q12, q4, q5 + +-ready_\@: ++ready_: + vqmovun.s16 d16, q8 +- vqmovun.s16 d18, q9 +- vqmovun.s16 d20, q10 +- vqmovun.s16 d22, q11 +- vqmovun.s16 d24, q12 +- vqmovun.s16 d26, q13 +- vqmovun.s16 d28, q14 +- vqmovun.s16 d30, q15 +-.endm ++ vqmovun.s16 d17, q9 ++ vqmovun.s16 d18, q10 ++ vqmovun.s16 d19, q11 ++ vqmovun.s16 d20, q12 ++ vqmovun.s16 d21, q13 ++ vqmovun.s16 d22, q14 ++ vqmovun.s16 d23, q15 ++ mov pc, lr ++endfunc ++ ++@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) ++function ff_hevc_v_loop_filter_luma2_neon_8, export=1 ++ hevc_loop_filter_luma_start ++ push 
{r4-r10,lr} @ 8 regs = 32 bytes ++ ++ ldr r4, [sp, #40] ++ b v_loop_luma_common ++endfunc ++ + + function ff_hevc_v_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start +- push {r5-r11} ++ push {r4-r10,lr} ++ ++ sub r4, r0, #4 ++v_loop_luma_common: ++ @ Why this isn't a bitmask to start with I have no idea... ++ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 ++ ldr r5, [sp, #32] ++ ldrh r10, [r5] ++ ldr r5, [sp, #36] ++ ldrh r5, [r5] ++ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] ++ + vpush {d8-d15} +- sub r0, #4 +- vld1.8 {d16}, [r0], r1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d22}, [r0], r1 +- vld1.8 {d24}, [r0], r1 +- vld1.8 {d26}, [r0], r1 +- vld1.8 {d28}, [r0], r1 +- vld1.8 {d30}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 +- hevc_loop_filter_luma_body +- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 +- vst1.8 {d16}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d22}, [r0], r1 +- vst1.8 {d24}, [r0], r1 +- vst1.8 {d26}, [r0], r1 +- vst1.8 {d28}, [r0], r1 +- vst1.8 {d30}, [r0] ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1 ++ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1 ++ ++ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ ++ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ ++ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ ++ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ ++ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ ++ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ ++ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32] ++ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32] ++ ++ bl hevc_loop_filter_luma_body ++ ++ neg r1, r1 ++ ++ @ no_p[1] ++ tst r10, #0xff00 ++ itt ne ++ addne r4, r4, r1, lsl #2 ++ bne 1f ++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ ++1: ++ @ no_q[1] ++ tst r10, #0xff000000 ++ itt ne ++ addne r0, r0, r1, lsl #2 ++ bne 2f ++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ ++2: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 3f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] ++ ++3: ++ @ no_q[0] ++ tst r10, #0xff0000 ++ bne 4f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] ++ ++4: ++bypasswrite: + vpop {d8-d15} +- pop {r5-r11} +- bx lr ++ pop {r4-r10,pc} + endfunc + ++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] ++@ ptrdiff_t 
stride, [r1] ++@ int beta, [r2] ++@ int32_t *tc, [r3] ++@ uint8_t *no_p, sp[0] ++@ uint8_t *no_q); sp[4] ++@ ++@ Src should always be on 8 byte boundry & all in the same slice ++ + function ff_hevc_h_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start +- push {r5-r11} ++ push {r4-r10,lr} ++ + vpush {d8-d15} + sub r0, r0, r1, lsl #2 ++ + vld1.8 {d16}, [r0], r1 ++ vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 ++ vld1.8 {d19}, [r0], r1 + vld1.8 {d20}, [r0], r1 ++ vld1.8 {d21}, [r0], r1 + vld1.8 {d22}, [r0], r1 +- vld1.8 {d24}, [r0], r1 +- vld1.8 {d26}, [r0], r1 +- vld1.8 {d28}, [r0], r1 +- vld1.8 {d30}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- add r0, r1 +- hevc_loop_filter_luma_body +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d22}, [r0], r1 +- vst1.8 {d24}, [r0], r1 +- vst1.8 {d26}, [r0], r1 +- vst1.8 {d28}, [r0] +-bypasswrite: ++ vld1.8 {d23}, [r0] ++ ++ bl hevc_loop_filter_luma_body ++ + vpop {d8-d15} +- pop {r5-r11} +- bx lr ++ ++ neg r1, r1 ++ add r0, r0, r1 ++ ++ @ Why this isn't a bitmask to start with I have no idea... ++ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 ++ ldr r5, [sp, #32] ++ ldrh r10, [r5] ++ ldr r5, [sp, #36] ++ ldrh r5, [r5] ++ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] ++ bne 1f ++ ++ vst1.8 {d22}, [r0], r1 ++ vst1.8 {d21}, [r0], r1 ++ vst1.8 {d20}, [r0], r1 ++ vst1.8 {d19}, [r0], r1 ++ vst1.8 {d18}, [r0], r1 ++ vst1.8 {d17}, [r0] ++ ++ pop {r4-r10,pc} ++ ++@ Partial write ++1: ++ vmov r2, r3, d22 ++ vmov r4, r5, d21 ++ vmov r6, r7, d20 ++ ++ tst r10, #0xff0000 ++ ittt eq ++ streq r2, [r0] ++ streq r4, [r0, r1] ++ streq r6, [r0, r1, lsl # 1] ++ ++ add r0, r0, #4 ++ tst r10, #0xff000000 ++ ittt eq ++ streq r3, [r0] ++ streq r5, [r0, r1] ++ streq r7, [r0, r1, lsl # 1] ++ ++ vmov r2, r3, d19 ++ vmov r4, r5, d18 ++ vmov r6, r7, d17 ++ add r0, r0, r1 ++ add r0, r0, r1, lsl # 1 ++ ++ tst r10, #0xff00 ++ ittt eq ++ streq r3, [r0] ++ streq r5, [r0, r1] ++ streq r7, [r0, r1, lsl # 1] ++ ++ tst r10, #0xff ++ ittt eq ++ streq r2, [r0, #-4]! 
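++@ The !-writeback above stepped r0 back 4 bytes to the left half of
++@ the row; the two conditional stores below finish the remaining
++@ p-side rows at that same column offset.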
++ streq r4, [r0, r1] ++ streq r6, [r0, r1, lsl # 1] ++ ++ pop {r4-r10,pc} ++ + endfunc + ++@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++function ff_hevc_h_loop_filter_uv_neon_8, export=1 ++ sub r0, r0, r1, lsl #1 ++ vld2.8 {d16,d17}, [r0], r1 ++ vld2.8 {d18,d19}, [r0], r1 ++ vld2.8 {d26,d27}, [r0], r1 ++ vld2.8 {d28,d29}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_uv_body d16, d18, d26, d28 ++ lsr r2, r2, #16 ++ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ cmp r3, #0 ++ bne 1f ++ vst2.8 {d18,d19}, [r0], r1 ++ vst2.8 {d26,d27}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.8 d18, d19 ++ vzip.8 d26, d27 ++ sub r1, r1, #8 ++ ++ tst r3, #1 ++ bne 1f ++ vst1.8 {d18}, [r0] ++1: add r0, r0, #8 ++ tst r3, #2 ++ bne 2f ++ vst1.8 {d19}, [r0] ++2: add r0, r0, r1 ++ ++ tst r3, #4 ++ bne 1f ++ vst1.8 {d26}, [r0] ++1: add r0, r0, #8 ++ tst r3, #8 ++ it ne ++ bxne lr ++ vst1.8 {d27}, [r0] ++ bx lr ++ ++endfunc ++ ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++function ff_hevc_v_loop_filter_uv2_neon_8, export=1 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 ++ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 ++ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 ++ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 ++ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 ++ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 ++ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 ++ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] ++ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] ++ ++ hevc_loop_filter_uv_body d16, d18, d26, d28 ++ lsr r2, r2, #16 ++ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ ++ neg r1, r1 ++ ++ ldr r2, [sp, #0] ++ ++ @ p[1] ++ tst r2, #2 ++ itt ne ++ addne r3, r3, r1, lsl #2 ++ bne 1f ++ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 ++ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 ++ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 ++ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 ++ ++1: ++ @ q[1] ++ tst r2, #8 ++ itt ne ++ addne r0, r0, r1, lsl #2 ++ bne 2f ++ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 ++ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ++2: ++ @ p[0] ++ tst r2, #1 ++ bne 3f ++ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 ++ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 ++ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 ++ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] ++ ++3: ++ @ q[0] ++ tst r2, #4 ++ it ne ++ bxne lr ++ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vst4.8 {d26[0], d27[0], d28[0], 
d29[0]}, [r0] ++ ++ bx lr ++endfunc ++ ++ + function ff_hevc_v_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, #4 +@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 vst1.8 {d4}, [r0] bx lr endfunc + -+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ * int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ * MvField *curr, MvField *neigh, uint8_t *bs) ++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i ++ * int *curr_rpl0, int *curr_ ++ * MvField *curr, MvField *ne + */ +function ff_hevc_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 @@ -1249,11 +2035,12 @@ index 166bddb..a088cc3 100644 +90: mov a3, #1 + b 11b +endfunc -diff --git b/libavcodec/arm/hevcdsp_epel_neon.S a/libavcodec/arm/hevcdsp_epel_neon.S ++ +diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 index 0000000..00eab9e --- /dev/null -+++ a/libavcodec/arm/hevcdsp_epel_neon.S ++++ b/libavcodec/arm/hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi @@ -1592,11 +2379,11 @@ index 0000000..00eab9e + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 -diff --git b/libavcodec/arm/hevcdsp_init_neon.c a/libavcodec/arm/hevcdsp_init_neon.c -index 1a3912c..5c72e1d 100644 ---- b/libavcodec/arm/hevcdsp_init_neon.c -+++ a/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,6 +22,8 @@ +diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c +index 1a3912c..c87e9d3 100644 +--- a/libavcodec/arm/hevcdsp_init_neon.c ++++ b/libavcodec/arm/hevcdsp_init_neon.c +@@ -22,11 +22,26 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -1605,10 +2392,49 @@ index 1a3912c..5c72e1d 100644 void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -@@ -43,6 +45,21 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, + void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++#ifdef RPI ++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++#endif ++ + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); + void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); + void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); +@@ -43,6 +58,52 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); ++#if RPI_HEVC_SAND ++void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_u_neon_8(uint8_t 
*_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++#endif ++ +void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); @@ -1623,11 +2449,21 @@ index 1a3912c..5c72e1d 100644 +void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++ ++void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, ++ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++ ++void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ ptrdiff_t srcstride, int height, \ -@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +119,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -1643,7 +2479,7 @@ index 1a3912c..5c72e1d 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,14 +212,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } @@ -1689,6 +2525,50 @@ index 1a3912c..5c72e1d 100644 + } +} + ++static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ // Width 32 already dealt with ++ // width 16 code works in double lines ++ if (width == 16 && (height & 1) == 0) { ++ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, ++ sao_offset_val_u, sao_left_class_u, ++ sao_offset_val_v, sao_left_class_v, ++ width, 
height); ++ } ++ else ++ { ++ const int shift = 3; // BIT_DEPTH - 5 ++ int k, y, x; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int8_t offset_table_u[32] = { 0 }; ++ int8_t offset_table_v[32] = { 0 }; ++ ++ stride_src /= sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ ++ for (k = 0; k < 4; k++) ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ for (k = 0; k < 4; k++) ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width * 2; x += 2) ++ { ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ ++ } ++ } ++} ++ +#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1)) +static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + int16_t *_sao_offset_val, int eo, int width, int height) @@ -1767,6 +2647,54 @@ index 1a3912c..5c72e1d 100644 + } + } +} ++ ++ ++static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ ++ if (width == 32 && (height & 7) == 0) { ++ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); ++ } ++ else ++ { ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ int8_t sao_offset_val_u[8]; // padding of 3 for vld ++ int8_t sao_offset_val_v[8]; // padding of 3 for vld ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ++ for (x = 0; x < 5; x++) { ++ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; ++ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; ++ } ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width * 2; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++ } ++} +#undef CMP + +void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, @@ -1776,18 +2704,48 @@ index 1a3912c..5c72e1d 100644 av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { if (bit_depth == 8) { -@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + int x; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; 
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; ++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_8; ++#endif + c->idct[0] = ff_hevc_transform_4x4_neon_8; + c->idct[1] = ff_hevc_transform_8x8_neon_8; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; +@@ -160,7 +455,25 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8; c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8; c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; ++#endif c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; + for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { + c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; ++ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; + c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; ++ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; + } ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +514,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -1809,7 +2767,7 @@ index 1a3912c..5c72e1d 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +548,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } @@ -1819,12 +2777,12 @@ index 1a3912c..5c72e1d 100644 + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; } -diff --git b/libavcodec/arm/hevcdsp_sao_neon.S a/libavcodec/arm/hevcdsp_sao_neon.S +diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..9c7808d +index 0000000..08a021d --- /dev/null -+++ a/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,510 @@ ++++ b/libavcodec/arm/hevcdsp_sao_neon.S +@@ -0,0 +1,862 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -1950,24 +2908,186 @@ index 0000000..9c7808d + +function ff_hevc_sao_band_w64_neon_8, export=1 + 
init_sao_band -+1: subs r12, #1 -+ pld [r1, r3] -+ vld1.8 {q8-q9}, [r1, :128]! -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sub r1, #32 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128]! -+ vst1.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 -+ bne 1b + -+ bx lr ++ push {r4, lr} ++ subs r12, #1 ++ mov r4, r1 ++ it ne ++ addne r4, r3 ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ pld [r4] ++ vshr.u8 q12, q8, #3 ++ vshr.u8 q13, q9, #3 ++ add r1, r3 ++ vshr.u8 q14, q10, #3 ++ vshr.u8 q15, q11, #3 ++ sao_band_64 ++ it ne ++ addne r4, r3 ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} +endfunc + ++ ++@ ff_hevc_sao_band_c_w64_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++@ As this is often done in-place on the frame buffer it is worth preloading ++@ the pixel values but we want to beware of loading ouside our buffer to avoid ++@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++ ++function ff_hevc_sao_band_c_neon_8, export=1 ++ mov r12, sp ++ push {r4-r8, lr} // 24 bytes ++ ++ ldm r12, {r4-r7} ++ ++ add r4, #2 ++ add r6, #2 ++ vld1.16 {d16}, [r4] @ Unaligned ++ lsl r5, r5, #3 ++ vld1.16 {d18}, [r6] ++ pld [r1] ++ vmov.i8 d17, #0 ++ mov r4, r1 ++ vmov.i8 d19, #0 ++ lsl r7, r7, #3 ++ vdup.8 q1, r5 ++ ldr r5, [r12, #16] @ width ++ vdup.8 q2, r7 ++ ldr r12, [r12, #20] ++ vqmovn.s16 d0, q8 ++ cmp r5, #16 @ At some point we may want a table lookup ++ vqmovn.s16 d1, q9 ++ vmov.i8 q3, #128 ++ beq 16f ++ ++ @ d0 U lookup ++ @ d1 V lookup ++ @ q1 U raw offset ++ @ q2 V raw offset ++ @ q3 #128 ++ ++ @ r4 = r1 = src - Inteded for preload pointer ++ @ r12 = height ++ ++ @ Might (unlikely) be called with height == 1 ++ subs r12, #1 ++ it ne ++ addne r4, r3 ++ ++1: ++ subs r12, #1 ++ vld2.8 {q8-q9}, [r1, :128]! ++ vsub.u8 q12, q8, q1 ++ vld2.8 {q10-q11}, [r1, :128], r3 ++ vsub.u8 q14, q10, q1 ++ vsub.u8 q13, q9, q2 ++ sub r1, #32 ++ vsub.u8 q15, q11, q2 ++ pld [r4] ++ vshr.u8 q12, #3 ++ vadd.s8 q8, q3 ++ vshr.u8 q13, #3 ++ vadd.s8 q9, q3 ++ ++ vtbl.8 d24, {d0}, d24 ++ vshr.u8 q14, #3 ++ vtbl.8 d25, {d0}, d25 ++ vshr.u8 q15, #3 ++ vtbl.8 d26, {d1}, d26 ++ vadd.s8 q10, q3 ++ vtbl.8 d27, {d1}, d27 ++ vadd.s8 q11, q3 ++ vtbl.8 d28, {d0}, d28 ++ vqadd.s8 q8, q12 ++ vtbl.8 d29, {d0}, d29 ++ vqadd.s8 q9, q13 ++ vtbl.8 d30, {d1}, d30 ++ vqadd.s8 q10, q14 ++ vtbl.8 d31, {d1}, d31 ++ vsub.s8 q8, q3 ++ vqadd.s8 q11, q15 ++ vsub.s8 q9, q3 ++ vsub.s8 q10, q3 ++ vsub.s8 q11, q3 ++ ++ it ne ++ addne r4, r3 @ Do not inc on final pass ++ vst2.8 {q8-q9}, [r0, :128]! 
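++@ The vadd.s8/vqadd.s8/vsub.s8 sequence above biases the unsigned
++@ pixels by 0x80 so the signed saturating add can apply the band
++@ offset without wrapping; the bias is removed again before the
++@ stores.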
++ vst2.8 {q10-q11}, [r0, :128], r2 ++ sub r0, #32 ++ bpl 1b ++ ++ pop {r4-r8, pc} ++ ++@ -- width 16 (UV pairs) -- ++16: ++ subs r12, #2 ++ it ne ++ addne r4, r4, r3, lsl #1 ++ ++1: ++ subs r12, #2 ++ vld2.8 {q8-q9}, [r1, :128], r3 ++ vsub.u8 q12, q8, q1 ++ vld2.8 {q10-q11}, [r1, :128], r3 ++ vsub.u8 q14, q10, q1 ++ vsub.u8 q13, q9, q2 ++ pld [r4] ++ vsub.u8 q15, q11, q2 ++ pld [r4, r3] ++ vshr.u8 q12, #3 ++ vadd.s8 q8, q3 ++ vshr.u8 q13, #3 ++ vadd.s8 q9, q3 ++ ++ vtbl.8 d24, {d0}, d24 ++ vshr.u8 q14, #3 ++ vtbl.8 d25, {d0}, d25 ++ vshr.u8 q15, #3 ++ vtbl.8 d26, {d1}, d26 ++ vadd.s8 q10, q3 ++ vtbl.8 d27, {d1}, d27 ++ vadd.s8 q11, q3 ++ vtbl.8 d28, {d0}, d28 ++ vqadd.s8 q8, q12 ++ vtbl.8 d29, {d0}, d29 ++ vqadd.s8 q9, q13 ++ vtbl.8 d30, {d1}, d30 ++ vqadd.s8 q10, q14 ++ vtbl.8 d31, {d1}, d31 ++ vsub.s8 q8, q3 ++ vqadd.s8 q11, q15 ++ vsub.s8 q9, q3 ++ vsub.s8 q10, q3 ++ vsub.s8 q11, q3 ++ ++ it ne ++ addne r4, r4, r3, lsl #1 ++ vst2.8 {q8-q9}, [r0, :128], r2 ++ vst2.8 {q10-q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ +.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 + vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 + vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 @@ -1977,71 +3097,120 @@ index 0000000..9c7808d + vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 +.endm + -+.macro table64 -+ vmov.s8 q13, #2 // 2 to all elements -+ vmov.32 d24[0], r4 // load offset table from general registers -+ vmov.32 d24[1], r5 // load rest of offset table -+ -+ vadd.s8 q0, q13 -+ vadd.s8 q1, q13 -+ vadd.s8 q2, q13 -+ vadd.s8 q3, q13 -+ -+ vmov.u8 q15, #128 // s8 #-128 -+ vtbl.8 d0, {d24}, d0 -+ vadd.s8 q13, q4, q15 -+ vtbl.8 d1, {d24}, d1 -+ vadd.s8 q14, q5, q15 -+ vtbl.8 d2, {d24}, d2 -+ vqadd.s8 q0, q13 -+ vtbl.8 d3, {d24}, d3 -+ vqadd.s8 q1, q14 -+ vtbl.8 d4, {d24}, d4 -+ vadd.s8 q13, q6, q15 -+ vtbl.8 d5, {d24}, d5 -+ vadd.s8 q14, q7, q15 -+ vtbl.8 d6, {d24}, d6 -+ vqadd.s8 q2, q13 -+ vtbl.8 d7, {d24}, d7 -+ vqadd.s8 q3, q14 -+ vsub.s8 q0, q15 -+ vsub.s8 q1, q15 -+ vsub.s8 q2, q15 -+ vsub.s8 q3, q15 -+ vst1.8 {q0-q1}, [r0, :128]! 
-+ vst1.8 {q2-q3}, [r0, :128], r2 -+ sub r0, #32 -+.endm + +// input +// a in q0 - q3 +// c in q4 - q7 +// b in q8 - q11 -+// offset table in r7 and r5 ++// offset table r4,r5 and r6,r7 ++// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C +// output in q0 - q3 +// clobbers q12 - q15 -+.macro edge_w64_body -+ diff32 q12, q13, q0, q1, q0, q1, q4, q5 -+ diff32 q0, q1, q14, q15, q8, q9, q4, q5 + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 ++@ a <- c <- b ++@ ++@ It appears that Neon can stall if you try and use results too soon so we try to ++@ spread our instruction out + -+ diff32 q14, q15, q2, q3, q2, q3, q6, q7 -+ diff32 q2, q3, q12, q13, q10, q11, q6, q7 ++.macro edgeidx64 ++ ++ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 ++ ++ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q1, q5 ++ vcgt.u8 q2, q2, q6 ++ vcgt.u8 q3, q3, q7 ++ ++ vsub.s8 q0, q0, q12 // a = sign(c-a) ++ vsub.s8 q1, q1, q13 ++ vsub.s8 q2, q2, q14 ++ vsub.s8 q3, q3, q15 ++ ++ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 ++ ++ vsub.s8 q0, q0, q12 ++ vsub.s8 q1, q1, q13 ++ vsub.s8 q2, q2, q14 ++ vsub.s8 q3, q3, q15 ++ ++ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 ++ ++ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q2, q14 ++ vadd.s8 q3, q3, q15 ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ @ whilst vmov dn, rm, rn exists it is a vfp instruction ++ @ and causes a stall till neon pipe empty - so don't do that! ++ vmov d26[0], r4 ++ vmov d26[1], r5 ++ vmov d27[0], r6 ++ vmov d27[1], r7 ++ vadd.s8 q2, q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 ++ ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 ++ ++ vtbl.8 d3, {d27}, d3 ++ ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 ++ ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q0, q12 ++ vqadd.s8 q1, q1, q14 ++ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q0, q15 ++ vqadd.s8 q2, q2, q12 ++ vqadd.s8 q3, q3, q14 ++ vsub.s8 q1, q1, q15 ++ vsub.s8 q2, q2, q15 ++ vsub.s8 q3, q3, q15 + -+ vadd.s8 q2, q14 -+ vadd.s8 q3, q15 -+ table64 +.endm + ++function edge_w64_body ++ edgeidx64 ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bx lr ++endfunc ++ +.macro init_edge_64 -+ push {r4-r5} -+ ldr r12, [sp, #8] // height -+ ldr r5, [sp, #12] // sao_offset_val_table -+ ldr r4, [r5] -+ add r5, #4 -+ ldr r5, [r5] ++ push {r4-r8,lr} ++ ldr r12, [sp, #24] // height ++ ldr r5, [sp, #28] // sao_offset_val_table ++ ldrd r4, r5, [r5] ++ mov r6, r4 ++ mov r7, r5 +.endm + +function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 @@ -2064,11 +3233,10 @@ index 0000000..9c7808d + vext.8 q9, q5, q6, #1 + vext.8 q10, q6, q7, #1 + vext.8 q11, q7, q12, #1 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 @@ -2088,7 +3256,7 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1, :128]! 
+ vld1.8 {q10-q11}, [r1, :128], r3 + sub r1, #32 -+ edge_w64_body ++ bl edge_w64_body + // copy c to a + vmov.64 q0, q4 + vmov.64 q1, q5 @@ -2101,8 +3269,7 @@ index 0000000..9c7808d + vmov.64 q7, q11 + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 @@ -2126,11 +3293,10 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1]! + vld1.8 {q10-q11}, [r1] + sub r1, #33 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 @@ -2154,13 +3320,157 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1]! + vld1.8 {q10-q11}, [r1] + sub r1, #31 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + ++ ++@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( ++@ uint8_t *_dst, r0 ++@ uint8_t *_src, r1 ++@ ptrdiff_t stride_dst, r2 ++@ ptrdiff_t stride_src, r3 ++@ int height, sp[0] ++@ int16_t *sao_offset_table_u, sp[4] ++@ int16_t *sao_offset_table_v); sp[8] ++@ int eo sp[12] ++ ++function ff_hevc_sao_edge_c_w64_neon_8, export=1 ++ push {r4-r8,lr} // 6 reg = 24 ++ ldr r5, [sp, #28] // sao_offset_val_table_u ++ ldr r7, [sp, #32] // sao_offset_val_table_v ++ ++ @ Load and rearrange offsets ++ @ Also "convert" from 16bit to 8bit ++ ldrb r4, [r5, #2] ++ ldrb r8, [r5, #4] ++ ldrb r6, [r7, #2] ++ ldrb r12, [r7, #4] ++ orr r4, r4, r8, lsl #8 ++ orr r6, r6, r12, lsl #8 ++ ldrb r8, [r5, #6] ++ ldrb r12, [r7, #6] ++ orr r4, r4, r8, lsl #24 ++ orr r6, r6, r12, lsl #24 ++ ldrb r5, [r5, #8] ++ ldrb r7, [r7, #8] ++ ++ ldr r12, [sp, #36] // e0 ++ adr r8, edge_c_tbl_w64 ++ ldr r8, [r8, r12, lsl #2] ++ ++ ldr r12, [sp, #24] // height ++ vpush {d8-d15} ++ mov pc, r8 ++ ++edge_c_tbl_w64: ++ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 ++ ++ff_hevc_sao_edge_c_eo0_w64_neon_8: ++ sub r1, #8 ++1: subs r12, #1 ++ vld1.64 {d7}, [r1, :64]! ++ vld1.64 {q4-q5}, [r1, :128]! // load c ++ vld1.64 {q6-q7}, [r1, :128]! ++ vld1.64 {d24}, [r1, :64], r3 ++ sub r1, #72 ++ // load a ++ vext.8 q0, q3, q4, #14 ++ vext.8 q1, q4, q5, #14 ++ vext.8 q2, q5, q6, #14 ++ vext.8 q3, q6, q7, #14 ++ // load b ++ vext.8 q8, q4, q5, #2 ++ vext.8 q9, q5, q6, #2 ++ vext.8 q10, q6, q7, #2 ++ vext.8 q11, q7, q12, #2 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo1_w64_neon_8: ++ sub r1, r3 ++ // load a ++ vldm r1, {q0-q3} ++ add r1, r3 ++ // load c ++ vldm r1, {q4-q7} ++ add r1, r3 ++1: subs r12, #1 ++ // load b ++ vldm r1, {q8-q11} ++ add r1, r3 ++ bl edge_w64_body ++ // copy c to a ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 ++ // copy b to c ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo2_w64_neon_8: ++1: sub r1, r3 ++ // load a ++ // TODO: fix unaligned load ++ // don't reload a like in eo1 ++ sub r1, #2 ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ sub r1, #30 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++ // load b ++ add r1, #2 ++ vld1.8 {q8-q9}, [r1]! 
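++@ Chroma rows are CbCr-interleaved, so one horizontal sample step is
++@ 2 bytes: the +/-2 adjustments around these unaligned loads pick the
++@ diagonal (up-left / down-right) neighbours for this EO class.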
++ vld1.8 {q10-q11}, [r1] ++ sub r1, #34 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo3_w64_neon_8: ++1: sub r1, r3 ++ // load a ++ // TODO: fix unaligned load ++ // don't reload a like in eo1 ++ add r1, #2 ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ sub r1, #34 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++ // load b ++ sub r1, #2 ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #30 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++endfunc ++ ++ +.macro init_edge_32 + ldr r12, [sp, #4] // sao_offset_val_table + vld1.32 {d31}, [r12] @@ -2277,7 +3587,7 @@ index 0000000..9c7808d + vext.8 q7, q11, q12, #8 + vext.8 q5, q10, q11, #7 + diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++ diff32 q0, q1, q10, q11, q8, q9, q2, q3 + vadd.s8 q0, q12 //diff0 + diff1 + vadd.s8 q1, q13 + table32 @@ -2317,7 +3627,7 @@ index 0000000..9c7808d + vext.8 q14, q12, q10, #7 + + diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++ diff32 q0, q1, q10, q11, q8, q9, q2, q3 + + vadd.s8 q0, q12 //diff0 + diff1 + vadd.s8 q1, q13 @@ -2335,10 +3645,10 @@ index 0000000..9c7808d + bx lr +endfunc + -diff --git b/libavcodec/avcodec.h a/libavcodec/avcodec.h -index d780477..5807e1b 100644 ---- b/libavcodec/avcodec.h -+++ a/libavcodec/avcodec.h +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index 57334df..7648294 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h @@ -443,6 +443,8 @@ enum AVCodecID { AV_CODEC_ID_XPM, AV_CODEC_ID_AV1, @@ -2348,7 +3658,7 @@ index d780477..5807e1b 100644 /* various PCM "codecs" */ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs AV_CODEC_ID_PCM_S16LE = 0x10000, -@@ -2925,6 +2927,7 @@ typedef struct AVCodecContext { +@@ -2935,6 +2937,7 @@ typedef struct AVCodecContext { #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. #define FF_BUG_TRUNCATED 16384 #define FF_BUG_IEDGE 32768 @@ -2356,7 +3666,7 @@ index d780477..5807e1b 100644 /** * strictly follow the standard (MPEG-4, ...). -@@ -3276,6 +3279,9 @@ typedef struct AVCodecContext { +@@ -3286,6 +3289,9 @@ typedef struct AVCodecContext { #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) #define FF_PROFILE_H264_CAVLC_444 44 @@ -2366,7 +3676,7 @@ index d780477..5807e1b 100644 #define FF_PROFILE_VC1_SIMPLE 0 #define FF_PROFILE_VC1_MAIN 1 -@@ -3586,7 +3592,13 @@ typedef struct AVCodecContext { +@@ -3596,7 +3602,13 @@ typedef struct AVCodecContext { #endif /** @@ -2381,10 +3691,10 @@ index d780477..5807e1b 100644 * the end of the audio. I.e. this number of decoded samples must be * discarded by the caller from the end of the stream to get the original * audio without any trailing padding. 
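A note on the CABACContext change below: the fields added to cabac.h hold
state for the "by22" bypass decoder that the HEVC code later in this patch
relies on (see the BY22 comments in hevc_cabac.c). The idea, as a rough
sketch with hypothetical names rather than the patch's own code: decoding
one bypass bin doubles the offset and conditionally subtracts the range,
which is one step of a binary long division, so a run of k bins is just the
next k quotient bits of offset/range, and a single divide (or a 32x32->64
multiply by a precomputed reciprocal) can produce a whole run at once:

    #include <stdint.h>

    extern int next_stream_bit(void); /* hypothetical bitstream reader */

    static inline int bypass_bin(uint32_t *offset, uint32_t range)
    {
        *offset = (*offset << 1) | next_stream_bit(); /* renormalise */
        if (*offset >= range) { /* quotient bit is 1 */
            *offset -= range;
            return 1;
        }
        return 0; /* quotient bit is 0 */
    }
    /* Iterating this k times yields the top k bits of offset/range; the
     * by22 code buffers 22 stream bits so it can take them in one go. */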
-diff --git b/libavcodec/cabac.h a/libavcodec/cabac.h +diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h index 1bf1c62..ccfa991 100644 ---- b/libavcodec/cabac.h -+++ a/libavcodec/cabac.h +--- a/libavcodec/cabac.h ++++ b/libavcodec/cabac.h @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; typedef struct CABACContext{ int low; @@ -2401,10 +3711,10 @@ index 1bf1c62..ccfa991 100644 const uint8_t *bytestream_start; const uint8_t *bytestream; const uint8_t *bytestream_end; -diff --git b/libavcodec/codec_desc.c a/libavcodec/codec_desc.c +diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c index 9711019..9f99a2c 100644 ---- b/libavcodec/codec_desc.c -+++ a/libavcodec/codec_desc.c +--- a/libavcodec/codec_desc.c ++++ b/libavcodec/codec_desc.c @@ -1622,6 +1622,48 @@ static const AVCodecDescriptor codec_descriptors[] = { .props = AV_CODEC_PROP_LOSSLESS, .mime_types= MT("image/png"), @@ -2454,29 +3764,10 @@ index 9711019..9f99a2c 100644 /* various PCM "codecs" */ { -diff --git b/libavcodec/dvdsubdec.c a/libavcodec/dvdsubdec.c -index 4e9c058..22ce728 100644 ---- b/libavcodec/dvdsubdec.c -+++ a/libavcodec/dvdsubdec.c -@@ -189,12 +189,12 @@ static void guess_palette(DVDSubContext* ctx, - r = (((subtitle_color >> 16) & 0xff) * level) >> 8; - g = (((subtitle_color >> 8) & 0xff) * level) >> 8; - b = (((subtitle_color >> 0) & 0xff) * level) >> 8; -- rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24); -+ rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17) << 24); - color_used[colormap[i]] = (i + 1); - j++; - } else { - rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) | -- ((alpha[i] * 17U) << 24); -+ ((alpha[i] * 17) << 24); - } - } - } -diff --git b/libavcodec/h264.h a/libavcodec/h264.h +diff --git a/libavcodec/h264.h b/libavcodec/h264.h index 86df5eb..22c4f1d 100644 ---- b/libavcodec/h264.h -+++ a/libavcodec/h264.h +--- a/libavcodec/h264.h ++++ b/libavcodec/h264.h @@ -41,7 +41,9 @@ enum { H264_NAL_END_STREAM = 11, H264_NAL_FILLER_DATA = 12, @@ -2487,44 +3778,10 @@ index 86df5eb..22c4f1d 100644 }; #endif /* AVCODEC_H264_H */ -diff --git b/libavcodec/h264_parse.c a/libavcodec/h264_parse.c -index ea202e7..0c87319 100644 ---- b/libavcodec/h264_parse.c -+++ a/libavcodec/h264_parse.c -@@ -59,9 +59,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, - if (luma_weight_flag) { - pwt->luma_weight[i][list][0] = get_se_golomb(gb); - pwt->luma_weight[i][list][1] = get_se_golomb(gb); -- if ((int8_t)pwt->luma_weight[i][list][0] != pwt->luma_weight[i][list][0] || -- (int8_t)pwt->luma_weight[i][list][1] != pwt->luma_weight[i][list][1]) -- goto out_range_weight; - if (pwt->luma_weight[i][list][0] != luma_def || - pwt->luma_weight[i][list][1] != 0) { - pwt->use_weight = 1; -@@ -79,9 +76,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, - for (j = 0; j < 2; j++) { - pwt->chroma_weight[i][list][j][0] = get_se_golomb(gb); - pwt->chroma_weight[i][list][j][1] = get_se_golomb(gb); -- if ((int8_t)pwt->chroma_weight[i][list][j][0] != pwt->chroma_weight[i][list][j][0] || -- (int8_t)pwt->chroma_weight[i][list][j][1] != pwt->chroma_weight[i][list][j][1]) -- goto out_range_weight; - if (pwt->chroma_weight[i][list][j][0] != chroma_def || - pwt->chroma_weight[i][list][j][1] != 0) { - pwt->use_weight_chroma = 1; -@@ -110,9 +104,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, - } - pwt->use_weight = pwt->use_weight || pwt->use_weight_chroma; - return 0; --out_range_weight: -- 
avpriv_request_sample(logctx, "Out of range weight\n"); -- return AVERROR_INVALIDDATA; - } - - /** -diff --git b/libavcodec/h264_parser.c a/libavcodec/h264_parser.c +diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c index bc35a61..055828c 100644 ---- b/libavcodec/h264_parser.c -+++ a/libavcodec/h264_parser.c +--- a/libavcodec/h264_parser.c ++++ b/libavcodec/h264_parser.c @@ -60,6 +60,8 @@ typedef struct H264ParseContext { uint8_t parse_history[6]; int parse_history_count; @@ -2618,30 +3875,11 @@ index bc35a61..055828c 100644 + .parser_close = h264_close, + .split = h264_split, +}; -diff --git b/libavcodec/h264_slice.c a/libavcodec/h264_slice.c -index 44a0b9f..fa1e9ae 100644 ---- b/libavcodec/h264_slice.c -+++ a/libavcodec/h264_slice.c -@@ -1778,12 +1778,9 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - } - if ((pps->weighted_pred && sl->slice_type_nos == AV_PICTURE_TYPE_P) || - (pps->weighted_bipred_idc == 1 && -- sl->slice_type_nos == AV_PICTURE_TYPE_B)) { -- ret = ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count, -+ sl->slice_type_nos == AV_PICTURE_TYPE_B)) -+ ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count, - sl->slice_type_nos, &sl->pwt, h->avctx); -- if (ret < 0) -- return ret; -- } - - sl->explicit_ref_marking = 0; - if (nal->ref_idc) { -diff --git b/libavcodec/hevc.h a/libavcodec/hevc.h -index de77d2a..494ca48 100644 ---- b/libavcodec/hevc.h -+++ a/libavcodec/hevc.h -@@ -21,6 +21,34 @@ +diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h +index de77d2a..a63db2b 100644 +--- a/libavcodec/hevc.h ++++ b/libavcodec/hevc.h +@@ -21,6 +21,45 @@ #ifndef AVCODEC_HEVC_H #define AVCODEC_HEVC_H @@ -2649,6 +3887,8 @@ index de77d2a..494ca48 100644 +#ifndef RPI + + #define RPI_INTER 0 ++ #define RPI_TSTATS 0 ++ #define RPI_HEVC_SAND 0 + +#else + @@ -2671,15 +3911,24 @@ index de77d2a..494ca48 100644 +// #define RPI_DEBLOCK_VPU + + #define RPI_VPU_DEBLOCK_CACHED 1 ++ ++ #if HAVE_NEON ++ #define RPI_HEVC_SAND 1 ++ #else ++ // Sand bust on Pi1 currently - reasons unknown ++ #define RPI_HEVC_SAND 0 ++ #endif ++ ++ #define RPI_TSTATS 0 +#endif + /** * Table 7-3: NAL unit type codes */ -diff --git b/libavcodec/hevc_cabac.c a/libavcodec/hevc_cabac.c -index e27c54e..1dbbb16 100644 ---- b/libavcodec/hevc_cabac.c -+++ a/libavcodec/hevc_cabac.c +diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c +index e27c54e..09727d9 100644 +--- a/libavcodec/hevc_cabac.c ++++ b/libavcodec/hevc_cabac.c @@ -21,6 +21,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -2689,10 +3938,14 @@ index e27c54e..1dbbb16 100644 #include "libavutil/attributes.h" #include "libavutil/common.h" -@@ -29,8 +31,64 @@ +@@ -29,8 +31,68 @@ #include "hevc.h" #include "hevcdec.h" ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ +// BY22 is probably faster than simple bypass if the processor has +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +// x86 has fast int divide @@ -2754,7 +4007,7 @@ index e27c54e..1dbbb16 100644 /** * number of bin by SyntaxElement. 
*/ -@@ -447,6 +505,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { +@@ -447,6 +509,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { { 28, 36, 43, 49, 54, 58, 61, 63, }, }; @@ -2966,7 +4219,7 @@ index e27c54e..1dbbb16 100644 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) { if (s->ps.pps->entropy_coding_sync_enabled_flag && -@@ -865,19 +1128,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) +@@ -865,19 +1132,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); } @@ -2992,7 +4245,7 @@ index e27c54e..1dbbb16 100644 } int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { -@@ -893,14 +1156,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { +@@ -893,14 +1160,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); } @@ -3009,7 +4262,7 @@ index e27c54e..1dbbb16 100644 ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ctx_shift = (log2_size + 1) >> 2; } else { -@@ -931,22 +1194,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, +@@ -931,22 +1198,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, return value; } @@ -3035,7 +4288,7 @@ index e27c54e..1dbbb16 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -968,90 +1225,337 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -968,90 +1229,395 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -3048,7 +4301,7 @@ index e27c54e..1dbbb16 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -3089,7 +4342,7 @@ index e27c54e..1dbbb16 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -3235,7 +4488,7 @@ index e27c54e..1dbbb16 100644 +static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) +{ + return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); - } ++} +#endif + + @@ -3249,7 +4502,7 @@ index e27c54e..1dbbb16 100644 + (*stat_coeff)++; + else if (x == 0 && *stat_coeff > 0) + (*stat_coeff)--; -+} + } +#endif + + @@ -3330,6 +4583,62 @@ index e27c54e..1dbbb16 100644 + return i; +} + ++#ifdef RPI ++static void rpi_add_residual(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ unsigned int stride = frame->linesize[c_idx]; ++ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = rpi_sliced_frame(frame); ++ uint8_t * dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? 
++ rpi_sliced_frame_pos_y(frame, x, y) : ++ rpi_sliced_frame_pos_c(frame, x, y); ++ ++ if (s->enable_rpi) { ++ const unsigned int i = s->num_pred_cmds[s->pass0_job]; ++ HEVCPredCmd * const pc = s->univ_pred_cmds[s->pass0_job] + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert0(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.buf + (1 << (log2_trafo_size * 2)) && ++ pc->ta.stride == stride); ++ ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->num_pred_cmds[s->pass0_job] = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ } ++ } ++ else if (!is_sliced || c_idx == 0) { ++ s->hevcdsp.add_residual[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++#if RPI_HEVC_SAND ++ else if (c_idx == 1) { ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++ else { ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++#endif ++} ++#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int log2_trafo_size, enum ScanType scan_idx, @@ -3359,13 +4668,16 @@ index e27c54e..1dbbb16 100644 + const uint8_t *scan_x_cg, *scan_y_cg; + const xy_off_t * scan_xy_off; ++#ifndef RPI ptrdiff_t stride = s->frame->linesize[c_idx]; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; - uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ++ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ((x0 >> hshift) << s->ps.sps->pixel_shift)]; - int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); - uint8_t significant_coeff_group_flag[8][8] = {{0}}; ++#endif +#ifdef RPI + int use_vpu; +#endif @@ -3398,7 +4710,7 @@ index e27c54e..1dbbb16 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1067,9 +1571,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1067,9 +1633,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -3419,7 +4731,7 @@ index e27c54e..1dbbb16 100644 } if (c_idx == 0) { -@@ -1102,39 +1616,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1102,39 +1678,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -3510,7 +4822,7 @@ index e27c54e..1dbbb16 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1162,119 +1713,133 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1162,119 +1775,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -3578,8 +4890,9 @@ index e27c54e..1dbbb16 100644 + if (s->enable_rpi) { + use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; + coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); -+#ifndef RPI_PRECLEAR -+ // We now do the memset after transform_add while we know the data is cached. 
++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else + memset(coeffs, 0, ccount * sizeof(int16_t)); +#endif + } @@ -3708,7 +5021,7 @@ index e27c54e..1dbbb16 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1288,34 +1853,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1288,34 +1916,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -3757,12 +5070,11 @@ index e27c54e..1dbbb16 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1325,141 +1886,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1325,141 +1949,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; -- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -3810,6 +5122,9 @@ index e27c54e..1dbbb16 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -3820,9 +5135,6 @@ index e27c54e..1dbbb16 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -3833,13 +5145,18 @@ index e27c54e..1dbbb16 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -3847,11 +5164,7 @@ index e27c54e..1dbbb16 100644 - sb_type = 2 * (c_idx == 0 ? 
1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - +- - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -3936,10 +5249,6 @@ index e27c54e..1dbbb16 100644 + + sum_abs += last_coeff_abs_level_remaining + 1; + *level = trans_coeff_level; -+ -+ if (stat_coeff != NULL) -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ stat_coeff = NULL; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -3960,6 +5269,10 @@ index e27c54e..1dbbb16 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ if (stat_coeff != NULL) ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ stat_coeff = NULL; ++ + if (trans_coeff_level > (3 << c_rice_param) && + (c_rice_param < 4 || rice_adaptation_enabled)) + ++c_rice_param; @@ -4060,7 +5373,7 @@ index e27c54e..1dbbb16 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1469,7 +2074,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1469,7 +2137,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -4069,61 +5382,37 @@ index e27c54e..1dbbb16 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1490,6 +2095,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1489,7 +2157,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, + } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { s->hevcdsp.transform_4x4_luma(coeffs); - } else { +- } else { ++ } +#ifdef RPI -+ if (!use_vpu) { -+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); -+ if (max_xy == 0) { -+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -+ } else { -+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; -+ if (max_xy < 4) -+ col_limit = FFMIN(4, col_limit); -+ else if (max_xy < 8) -+ col_limit = FFMIN(8, col_limit); -+ else if (max_xy < 12) -+ col_limit = FFMIN(24, col_limit); -+ -+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); -+ } -+ } ++ else if (!use_vpu) +#else ++ else ++#endif ++ { int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); -@@ -1503,6 +2126,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - col_limit = FFMIN(24, col_limit); - s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); - } -+#endif - } - } - if (lc->tu.cross_pf) { -@@ -1512,6 +2136,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1512,7 +2186,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ if (s->enable_rpi) { -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; -+ cmd->type = RPI_PRED_TRANSFORM_ADD; -+ cmd->size = log2_trafo_size; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; -+ return; -+ } -+#endif ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++#else 
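++ // Plain build: add the residual into the frame here. The RPI build's
++ // rpi_add_residual() above either queues an RPI_PRED_ADD_RESIDUAL*
++ // command for the worker to apply later or, when s->enable_rpi is off,
++ // performs the equivalent add_residual[_u/_v] call immediately.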
s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride); ++#endif } -diff --git b/libavcodec/hevc_filter.c a/libavcodec/hevc_filter.c -index 14e7c8d..e4ffd87 100644 ---- b/libavcodec/hevc_filter.c -+++ a/libavcodec/hevc_filter.c + void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) +diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c +index 14e7c8d..0256b01 100644 +--- a/libavcodec/hevc_filter.c ++++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -4137,18 +5426,83 @@ index 14e7c8d..e4ffd87 100644 #include "libavutil/common.h" #include "libavutil/internal.h" -@@ -30,6 +36,10 @@ +@@ -30,6 +36,11 @@ #include "bit_depth_template.c" +#ifdef RPI +#include "rpi_qpu.h" ++#include "rpi_zc.h" +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -272,6 +282,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -138,6 +149,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) + return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; + } + ++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) ++{ ++#ifdef RPI ++ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#else ++ return s->ps.sps->pixel_shift; ++#endif ++} ++ + static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, + ptrdiff_t stride_dst, ptrdiff_t stride_src) + { +@@ -192,7 +212,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, + ptrdiff_t stride_src, int x, int y, int width, int height, + int c_idx, int x_ctb, int y_ctb) + { +- int sh = s->ps.sps->pixel_shift; ++ const unsigned int sh = pixel_shift(s, c_idx); + int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; + int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; + +@@ -223,13 +243,14 @@ static void restore_tqb_pixels(HEVCContext *s, + int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); + int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); + int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); +- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; ++ const unsigned int sh = pixel_shift(s, c_idx); ++ int len = (min_pu_size >> hshift) << sh; + for (y = y_min; y < y_max; y++) { + for (x = x_min; x < x_max; x++) { + if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { + int n; +- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); ++ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); ++ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); + for (n = 0; n < (min_pu_size >> vshift); n++) { + memcpy(src, dst, len); + src += stride_src; +@@ -245,7 +266,7 @@ static void restore_tqb_pixels(HEVCContext *s, + + static void sao_filter_CTB(HEVCContext *s, int x, int y) + { +- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; + HEVCLocalContext *lc = s->HEVClc; + int c_idx; + int edges[4]; // 0 
left 1 top 2 right 3 bottom +@@ -266,12 +287,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + uint8_t right_tile_edge = 0; + uint8_t up_tile_edge = 0; + uint8_t bottom_tile_edge = 0; ++#ifdef RPI ++ const int sliced = rpi_sliced_frame(s->frame); ++ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); ++#else ++ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); ++#endif + + edges[0] = x_ctb == 0; + edges[1] = y_ctb == 0; edges[2] = x_ctb == s->ps.sps->ctb_width - 1; edges[3] = y_ctb == s->ps.sps->ctb_height - 1; @@ -4159,7 +5513,300 @@ index 14e7c8d..e4ffd87 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -495,6 +509,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -303,7 +334,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } + +- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) { ++ for (c_idx = 0; c_idx < plane_count; c_idx++) { + int x0 = x >> s->ps.sps->hshift[c_idx]; + int y0 = y >> s->ps.sps->vshift[c_idx]; + ptrdiff_t stride_src = s->frame->linesize[c_idx]; +@@ -312,28 +343,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); + int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); + int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; +- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; + ptrdiff_t stride_dst; + uint8_t *dst; + ++#ifdef RPI ++ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = !sliced ? ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : ++ !sliced ? src - (1 << sh) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : ++ !sliced ? src + (width << sh) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ ++ ++ if (sliced && c_idx > 1) { ++ break; ++ } ++#else ++ const unsigned int sh = s->ps.sps->pixel_shift; ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? 
NULL : src + (width << sh); ++#endif ++ + switch (sao->type_idx[c_idx]) { + case SAO_BAND: + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); + if (s->ps.pps->transquant_bypass_enable_flag || + (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { +- dst = lc->edge_emu_buffer; +- stride_dst = 2*MAX_PB_SIZE; +- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); +- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +- sao->offset_val[c_idx], sao->band_position[c_idx], +- width, height); +- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +- x, y, width, height, c_idx); ++ dst = lc->edge_emu_buffer; ++ stride_dst = 2*MAX_PB_SIZE; ++ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); + } else { +- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +- sao->offset_val[c_idx], sao->band_position[c_idx], +- width, height); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } + } + sao->type_idx[c_idx] = SAO_APPLIED; + break; +@@ -341,108 +426,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + { + int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; + int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; +- int left_edge = edges[0]; + int top_edge = edges[1]; +- int right_edge = edges[2]; + int bottom_edge = edges[3]; +- int sh = s->ps.sps->pixel_shift; +- int left_pixels, right_pixels; + + stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; + dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; + + if (!top_edge) { +- int left = 1 - left_edge; +- int right = 1 - right_edge; +- const uint8_t *src1[2]; + uint8_t *dst1; +- int src_idx, pos; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); + +- dst1 = dst - stride_dst - (left << sh); +- src1[0] = src - stride_src - (left << sh); +- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); +- pos = 0; +- if (left) { ++ dst1 = dst - stride_dst; ++ ++ if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1, src1[src_idx], sh); +- pos += (1 << sh); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); + } ++ + src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +- if (right) { +- pos += width << sh; ++ memcpy(dst1, src_idx ? 
src_spb : src - stride_src, width << sh); ++ ++ if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); + } + } + if (!bottom_edge) { +- int left = 1 - left_edge; +- int right = 1 - right_edge; +- const uint8_t *src1[2]; +- uint8_t *dst1; +- int src_idx, pos; ++ uint8_t * const dst1 = dst + height * stride_dst; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); ++ const unsigned int hoff = height * stride_src; + +- dst1 = dst + height * stride_dst - (left << sh); +- src1[0] = src + height * stride_src - (left << sh); +- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); +- pos = 0; +- if (left) { ++ if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1, src1[src_idx], sh); +- pos += (1 << sh); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); + } ++ + src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +- if (right) { +- pos += width << sh; ++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); ++ ++ if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); + } + } +- left_pixels = 0; +- if (!left_edge) { ++ if (src_l != NULL) { + if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst - (1 << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { +- left_pixels = 1; ++ copy_vert(dst - (1 << sh), ++ src_l, ++ sh, height, stride_dst, stride_src); + } + } +- right_pixels = 0; +- if (!right_edge) { ++ if (src_r != NULL) { + if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst + (width << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { +- right_pixels = 1; ++ copy_vert(dst + (width << sh), ++ src_r, ++ sh, height, stride_dst, stride_src); + } + } + +- copy_CTB(dst - (left_pixels << sh), +- src - (left_pixels << sh), +- (width + left_pixels + right_pixels) << sh, ++ copy_CTB(dst, ++ src, ++ width << sh, + height, stride_dst, stride_src); + + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); +- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +- sao->eo_class[c_idx], width, height); +- s->hevcdsp.sao_edge_restore[restore](src, dst, +- stride_src, stride_dst, +- sao, +- edges, width, +- height, c_idx, +- vert_edge, +- horiz_edge, +- diag_edge); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ // Class always the same for both U & V (which is just as well :-)) ++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, ++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], ++ width, height); ++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++ sao->eo_class[c_idx], width, 
height); ++ s->hevcdsp.sao_edge_restore[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); + sao->type_idx[c_idx] = SAO_APPLIED; +@@ -452,6 +546,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } + ++// Returns 2 or 0. + static int get_pcm(HEVCContext *s, int x, int y) + { + int log2_min_pu_size = s->ps.sps->log2_min_pu_size; +@@ -478,7 +573,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + uint8_t *src; + int x, y; + int chroma, beta; +- int32_t c_tc[2], tc[2]; ++ int32_t c_tc[4], tc[2]; + uint8_t no_p[2] = { 0 }; + uint8_t no_q[2] = { 0 }; + +@@ -495,6 +590,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -4175,27 +5822,81 @@ index 14e7c8d..e4ffd87 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -538,6 +561,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); - } else -+#ifdef RPI_DEBLOCK_VPU -+ if (s->enable_rpi_deblock) { -+ uint8_t (*setup)[2][2][4]; -+ int num16 = (y>>4)*s->setup_width + (x>>4); -+ int a = ((y>>3) & 1) << 1; -+ int b = (x>>3) & 1; -+ setup = s->dvq->y_setup_arm[num16]; -+ setup[0][b][0][a] = beta; -+ setup[0][b][0][a + 1] = beta; -+ setup[0][b][1][a] = tc[0]; -+ setup[0][b][1][a + 1] = tc[1]; -+ } else +@@ -528,19 +632,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x - 1, y); + no_p[1] = get_pcm(s, x - 1, y + 4); + no_q[0] = get_pcm(s, x, y); + no_q[1] = get_pcm(s, x, y + 4); +- s->hevcdsp.hevc_v_loop_filter_luma_c(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); +- } else +- s->hevcdsp.hevc_v_loop_filter_luma(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); ++ } ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ ++ // This copes properly with no_p/no_q ++ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q, ++ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ } ++ else +#endif - s->hevcdsp.hevc_v_loop_filter_luma(src, - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); -@@ -570,6 +606,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ { ++ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ // Standard DSP code is broken if no_p / no_q is set ++ s->hevcdsp.hevc_v_loop_filter_luma_c(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } ++ else ++#ifdef RPI_DEBLOCK_VPU ++ if (s->enable_rpi_deblock) { ++ uint8_t (*setup)[2][2][4]; ++ int num16 = (y>>4)*s->setup_width + (x>>4); ++ int a = ((y>>3) & 1) << 1; ++ int b = (x>>3) & 1; ++ setup = s->dvq->y_setup_arm[num16]; ++ setup[0][b][0][a] = beta; ++ setup[0][b][0][a + 1] = beta; ++ setup[0][b][1][a] = tc[0]; ++ setup[0][b][1][a + 1] = tc[1]; ++ } else ++#endif ++ { ++ s->hevcdsp.hevc_v_loop_filter_luma(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } ++ } + } + } + +@@ -560,7 +696,12 @@ static void 
deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? ++ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#endif ++ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x, y - 1); + no_p[1] = get_pcm(s, x + 4, y - 1); +@@ -570,6 +711,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -4215,7 +5916,113 @@ index 14e7c8d..e4ffd87 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -604,9 +653,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -578,6 +732,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + } + + if (s->ps.sps->chroma_format_idc) { ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ const int v = 2; ++ const int h = 2; ++ ++ // vertical filtering chroma ++ for (y = y0; y < y_end; y += 8 * v) { ++ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) { ++ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; ++ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; ++ ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; ++ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; ++ unsigned int no_f = 0; ++ ++ // tc_offset here should be set to cur_tc_offset I think ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x - 1, y) ? 1 : 0) | ++ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x, y + 4 * v) ? 8 : 0); ++ if (no_f == 0xf) ++ continue; ++ } ++ ++ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->frame->linesize[1], ++ tc4, ++ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ no_f); ++ } ++ } ++ ++ if (y == 0) ++ continue; ++ ++ // horizontal filtering chroma ++ tc_offset = x0 ? left_tc_offset : cur_tc_offset; ++ x_end2 = x_end; ++ if (x_end != s->ps.sps->width) ++ x_end2 = x_end - 8 * h; ++ ++ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; ++ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; ++ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ unsigned int no_f = 0; ++ ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x, y - 1) ? 1 : 0) | ++ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x + 4 * h, y) ? 
8 : 0); ++ ++ if (no_f == 0xf) ++ continue; ++ } ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->frame->linesize[1], ++ tc4, no_f); ++ } ++ } ++ } ++ } ++ else ++#endif + for (chroma = 1; chroma <= 2; chroma++) { + int h = 1 << s->ps.sps->hshift[chroma]; + int v = 1 << s->ps.sps->vshift[chroma]; +@@ -594,7 +833,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; + c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; +- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? ++ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#endif ++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x - 1, y); + no_p[1] = get_pcm(s, x - 1, y + (4 * v)); +@@ -604,9 +848,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -4239,7 +6046,21 @@ index 14e7c8d..e4ffd87 100644 } } -@@ -637,6 +700,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -627,7 +885,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; + c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; +- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? 
++ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#endif ++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x, y - 1); + no_p[1] = get_pcm(s, x + (4 * h), y - 1); +@@ -637,6 +900,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -4259,7 +6080,7 @@ index 14e7c8d..e4ffd87 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -647,69 +723,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -647,69 +923,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -4329,7 +6150,7 @@ index 14e7c8d..e4ffd87 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -720,10 +733,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -720,10 +933,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -4339,8 +6160,9 @@ index 14e7c8d..e4ffd87 100644 - int i, j, bs; + int i, j; + RefPicList *rpl = s->ref->refPicList; -+ int min_pu_in_4pix = (1 << log2_min_pu_size) >> 2; -+ int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size; ++ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size); ++ const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup ++ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep + int y_pu = y0 >> log2_min_pu_size; + int x_pu = x0 >> log2_min_pu_size; + MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu]; @@ -4354,7 +6176,7 @@ index 14e7c8d..e4ffd87 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -735,34 +759,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -735,34 +960,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -4431,7 +6253,7 @@ index 14e7c8d..e4ffd87 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -773,64 +819,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -773,64 +1020,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -4467,22 +6289,22 @@ index 14e7c8d..e4ffd87 100644 - - if (log2_trafo_size > log2_min_pu_size && !is_intra) { - RefPicList *rpl = s->ref->refPicList; -- ++ rpl; ++ MvField *left = curr - 1; + - // bs for TU internal horizontal PU boundaries - for (j = 8; j < (1 << log2_trafo_size); j += 8) { - int yp_pu = (y0 + j - 1) >> log2_min_pu_size; - int yq_pu = (y0 + j) >> log2_min_pu_size; -+ rpl; -+ MvField *left = curr - 1; ++ if (is_intra) { ++ for (j = 0; j < (1 << log2_trafo_size); j += 4) ++ bs[j * s->bs_width >> 2] = 2; - for (i = 0; i < (1 << log2_trafo_size); i += 4) { - int x_pu = (x0 + i) >> log2_min_pu_size; - MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; - MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; -+ if (is_intra) { -+ for (j = 0; j < (1 << log2_trafo_size); j += 4) -+ bs[j * s->bs_width >> 2] = 2; - +- - bs = 
boundary_strength(s, curr, top, rpl); - s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; + } else { @@ -4534,7 +6356,7 @@ index 14e7c8d..e4ffd87 100644 } } } -@@ -839,11 +875,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -839,11 +1076,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR @@ -4544,8 +6366,8 @@ index 14e7c8d..e4ffd87 100644 +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +{ + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma); + rpi_cache_flush_finish(rfe); +} +#endif @@ -4559,10 +6381,11 @@ index 14e7c8d..e4ffd87 100644 + const int d0 = ((int *)f->progress->data)[0]; + const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 + -+ if (curr_y < (unsigned int)f->f->height) { ++ if (curr_y < (unsigned int)s->ps.sps->height) { + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y, ++ s->ps.sps->vshift[1], 1, 1); + rpi_cache_flush_finish(rfe); + } + } @@ -4639,7 +6462,7 @@ index 14e7c8d..e4ffd87 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -852,16 +981,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -852,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); @@ -4660,8 +6483,7 @@ index 14e7c8d..e4ffd87 100644 +#endif ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + } - } -- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++ } + } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; @@ -4677,7 +6499,8 @@ index 14e7c8d..e4ffd87 100644 + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif + ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); -+ } + } +- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +#else +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); @@ -4689,11 +6512,24 @@ index 14e7c8d..e4ffd87 100644 } void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) -diff --git b/libavcodec/hevc_ps.c a/libavcodec/hevc_ps.c -index acd55cc..0a465d4 100644 ---- b/libavcodec/hevc_ps.c -+++ a/libavcodec/hevc_ps.c -@@ -1001,6 +1001,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, +diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c +index acd55cc..c1716c2 100644 +--- a/libavcodec/hevc_ps.c ++++ b/libavcodec/hevc_ps.c +@@ -780,7 +780,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + switch (sps->bit_depth) { + case 8: + if (sps->chroma_format_idc == 0) sps->pix_fmt = 
AV_PIX_FMT_GRAY8; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P; ++#else + if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P; ++#endif + if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; + if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; + break; +@@ -1001,6 +1006,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, sps->amp_enabled_flag = get_bits1(gb); sps->sao_enabled = get_bits1(gb); @@ -4702,17 +6538,69 @@ index acd55cc..0a465d4 100644 sps->pcm_enabled_flag = get_bits1(gb); if (sps->pcm_enabled_flag) { sps->pcm.bit_depth = get_bits(gb, 4) + 1; -diff --git b/libavcodec/hevcdec.c a/libavcodec/hevcdec.c -index ef21595..b36e840 100644 ---- b/libavcodec/hevcdec.c -+++ a/libavcodec/hevcdec.c -@@ -42,8 +42,233 @@ +diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c +index 9103c84..eb26e7d 100644 +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -206,7 +206,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) + HEVCFrame *frame = &s->DPB[min_idx]; + AVFrame *dst = out; + AVFrame *src = frame->frame; +- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format); ++ const int fmt = src->format; ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + int pixel_shift = !!(desc->comp[0].depth > 8); + + ret = av_frame_ref(out, src); +@@ -217,12 +218,29 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) + if (ret < 0) + return ret; + +- for (i = 0; i < 3; i++) { +- int hshift = (i > 0) ? desc->log2_chroma_w : 0; +- int vshift = (i > 0) ? desc->log2_chroma_h : 0; +- int off = ((frame->window.left_offset >> hshift) << pixel_shift) + +- (frame->window.top_offset >> vshift) * dst->linesize[i]; +- dst->data[i] += off; ++ if (fmt == AV_PIX_FMT_SAND128) ++ { ++ // Sand cannot be windowed by offset so add side data if we have an offset ++ const HEVCWindow * const window = &frame->window; ++ if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0) ++ { ++ AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVPanScan)); ++ AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ si->left_offset = window->left_offset; ++ si->top_offset = window->top_offset; ++ si->pic_width = s->ps.sps->width; ++ si->pic_height = s->ps.sps->height; ++ } ++ } ++ else ++ { ++ for (i = 0; i < 3; i++) { ++ int hshift = (i > 0) ? desc->log2_chroma_w : 0; ++ int vshift = (i > 0) ? 
desc->log2_chroma_h : 0; ++ int off = ((frame->window.left_offset >> hshift) << pixel_shift) + ++ (frame->window.top_offset >> vshift) * dst->linesize[i]; ++ dst->data[i] += off; ++ } + } + av_log(s->avctx, AV_LOG_DEBUG, + "Output frame with POC %d.\n", frame->poc); +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index f9e8ff0..8a3d874 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -42,8 +42,207 @@ #include "hevcdec.h" #include "profiles.h" +#ifdef RPI + #include "rpi_qpu.h" + #include "rpi_shader.h" ++ #include "rpi_shader_cmd.h" ++ #include "rpi_zc.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory + #define RPI_CACHE_UNIF_MVS 1 @@ -4726,10 +6614,15 @@ index ef21595..b36e840 100644 + #include "libavutil/atomic.h" + + static void worker_core(HEVCContext * const s); ++ ++ // We can pred any block height, but caching may make some heights better than others ++ // Currently it doesn't seem to make a lot of difference ++ // 0 => any height ++ #define Y_P_MAX_H 0 ++ #define Y_B_MAX_H 0 +#endif + -+// #define DISABLE_MC -+ ++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards + +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) + @@ -4740,78 +6633,31 @@ index ef21595..b36e840 100644 +} +# define av_mod_uintp2 av_mod_uintp2_c +#endif -+ -+#define Y_B_ONLY 1 + const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; + +#if RPI_INTER + -+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks -+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks -+// For each block of 64*64 the smallest block size is 8x4 -+// We also need an extra command for the setup information ++#define MC_DUMMY_X (-32) ++#define MC_DUMMY_Y (-32) ++ ++// UV still has min 4x4 pred ++// Allow for even spread +1 for setup, +1 for rounding ++// If we have load sharingw e will want different (bigger) numbers and/or a non-constant chunk size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) ++#define QPU_C_CMD_PER_CTU_MAX (4 * 4) ++ ++#define UV_COMMANDS_PER_QPU (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 / QPU_N_UV + 2) ++#define Y_COMMANDS_PER_QPU (((RPI_MAX_WIDTH * 64) / (4 * 4)) / QPU_N_Y + 2) + -+#define RPI_CHROMA_COMMAND_WORDS 11 -+#define UV_COMMANDS_PER_QPU ((1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS) +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 + -+typedef struct qpu_mc_pred_c_s { -+ uint32_t next_fn; -+ int16_t next_src_y; -+ int16_t next_src_x; -+ uint32_t next_src_base_u; -+ uint32_t next_src_base_v; -+ union { -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_u; -+ uint32_t dst_addr_v; -+ } p; -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t weight_u; -+ uint32_t weight_v; -+ uint32_t dummy0; -+ uint32_t dummy1; -+ } b0; -+ struct { -+ uint32_t dummy0; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_u; -+ uint32_t dst_addr_v; -+ } b1; -+ struct { -+ uint32_t pic_w; -+ uint32_t pic_h; -+ uint32_t src_stride; -+ uint32_t dst_stride; -+ uint32_t wdenom; -+ uint32_t dummy0; -+ uint32_t dummy1; -+ } s; -+ }; -+} qpu_mc_pred_c_t; -+ -+ -+static const char static_assert_qpu_mc_pred[sizeof(qpu_mc_pred_c_t) != 
RPI_CHROMA_COMMAND_WORDS * 4 ? -1 : 1] = {0}; -+ +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + -+// TODO Chroma only needs 4 taps + +// Actual filter goes -ve, +ve, +ve, -ve using these values +static const uint32_t rpi_filter_coefs[8] = { @@ -4825,30 +6671,44 @@ index ef21595..b36e840 100644 + ENCODE_COEFFS( 2, 10, 58, 2) +}; + -+#define RPI_LUMA_COMMAND_WORDS 10 -+#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS) ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_interrupt_exit12c, mc_exit_c, mc_exit_c, mc_exit_c, ++ mc_exit_c, mc_exit_c, mc_exit_c, mc_exit_c, ++ mc_exit_c, mc_exit_c, mc_exit_c, mc_exit_c ++}; ++ ++static const int * const inter_pred_exit_y_qpu[12] = { ++ mc_interrupt_exit12, mc_exit, mc_exit, mc_exit, ++ mc_exit, mc_exit, mc_exit, mc_exit, ++ mc_exit, mc_exit, mc_exit, mc_exit ++}; ++ ++ +#endif + + +#ifdef RPI_WORKER + -+typedef struct worker_global_env_s -+{ -+ volatile int arm_load; -+ pthread_mutex_t lock; -+ -+ unsigned int arm_y; -+ unsigned int arm_c; -+ unsigned int gpu_y; -+ unsigned int gpu_c; -+} worker_global_env_t; -+ -+static worker_global_env_t worker_global_env = -+{ -+ .lock = PTHREAD_MUTEX_INITIALIZER -+}; -+ -+ +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); + @@ -4940,7 +6800,7 @@ index ef21595..b36e840 100644 /** * NOTE: Each function hls_foo correspond to the function foo in the * specification (HLS stands for High Level Syntax). 
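
The rpi_filter_coefs[] table above packs the four chroma tap magnitudes for each fractional position into a single uint32_t with ENCODE_COEFFS; as the comment notes, the QPU kernel applies them with a fixed -ve, +ve, +ve, -ve sign pattern, so only magnitudes are stored. A minimal host-side sketch of that convention (the coeff() and filter4() helpers and the rounding term are illustrative assumptions, not code from this patch):

#include <stdint.h>
#include <stdio.h>

#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)

/* Tap i of a packed coefficient word; taps 0 and 3 are the -ve ones. */
static int coeff(const uint32_t packed, const unsigned int i)
{
    const int c = (packed >> (8 * i)) & 0xff;
    return (i == 0 || i == 3) ? -c : c;
}

/* One packed 4-tap filter applied horizontally; >> 6 because each tap set sums to 64. */
static int filter4(const uint8_t * const src, const uint32_t packed)
{
    int sum = 0;
    for (unsigned int i = 0; i != 4; ++i)
        sum += coeff(packed, i) * src[i];
    return (sum + 32) >> 6;   /* rounding term assumed for illustration */
}

int main(void)
{
    static const uint8_t px[4] = { 100, 100, 100, 100 };
    /* Taps -2, 58, 10, -2 sum to 64, so a flat input comes back unchanged. */
    printf("%d\n", filter4(px, ENCODE_COEFFS(2, 58, 10, 2)));
    return 0;
}

Because every tap set sums to 64, the >> 6 normalisation maps a flat input back to itself, which is a convenient sanity check on the packing.
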
-@@ -56,6 +281,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -56,6 +255,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { @@ -4973,7 +6833,7 @@ index ef21595..b36e840 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -92,6 +343,88 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -92,6 +317,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; @@ -4993,19 +6853,20 @@ index ef21595..b36e840 100644 + s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; + + for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. -+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } ++ for(job=0;jobcoeffs_buf_default[job]); ++ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; ++ if (!s->coeffs_buf_arm[job][0]) ++ goto fail; ++ ++ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data ++ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; ++ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; ++ if (!s->coeffs_buf_arm[job][2]) ++ goto fail; ++ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. 
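++ // NB: arm[] is the CPU-side view and vc[] the VideoCore bus address of
++ // the same cached allocation; both [3] entries mark the same
++ // one-past-the-end position (in int16_t units and in bytes respectively).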
++ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; ++ } + } +#endif +#ifdef RPI_DEBLOCK_VPU @@ -5062,7 +6923,7 @@ index ef21595..b36e840 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -138,6 +471,29 @@ fail: +@@ -138,6 +446,29 @@ fail: return AVERROR(ENOMEM); } @@ -5092,7 +6953,52 @@ index ef21595..b36e840 100644 static void pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -678,6 +1034,11 @@ static int hls_slice_header(HEVCContext *s) +@@ -332,7 +663,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, + static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) + { + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) +- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; ++ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; + int ret, i; + + pic_arrays_free(s); +@@ -351,6 +682,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND128; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -384,6 +721,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ret = ff_thread_get_format(s->avctx, pix_fmts); + if (ret < 0) + goto fail; ++ + s->avctx->pix_fmt = ret; + } + else { +@@ -406,11 +744,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + for(c_idx = 0; c_idx < c_count; c_idx++) { + int w = sps->width >> sps->hshift[c_idx]; + int h = sps->height >> sps->vshift[c_idx]; ++ // ******** Very very nasty allocation kludge for plaited Chroma + s->sao_pixel_buffer_h[c_idx] = +- av_malloc((w * 2 * sps->ctb_height) << ++ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << + sps->pixel_shift); + s->sao_pixel_buffer_v[c_idx] = +- av_malloc((h * 2 * sps->ctb_width) << ++ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << + sps->pixel_shift); + } + } +@@ -678,6 +1017,11 @@ static int hls_slice_header(HEVCContext *s) (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) { pred_weight_table(s, gb); } @@ -5104,13 +7010,17 @@ index ef21595..b36e840 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -@@ -933,6 +1294,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -933,6 +1277,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ ++ // U & V done on U call in the case of sliced frames ++ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ return; ++ + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; + HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; @@ -5121,16 +7031,21 @@ index ef21595..b36e840 100644 + cmd->i_pred.x = x0; + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ } else { ++ } ++ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); ++ } ++ else { + s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); + } ++ +} +#endif + static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -945,8 +1325,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -945,8 +1317,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -5143,7 +7058,7 @@ index ef21595..b36e840 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1032,7 +1415,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1032,7 +1407,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -5155,7 +7070,7 @@ index ef21595..b36e840 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1061,7 +1448,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1061,7 +1440,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -5167,7 +7082,7 @@ index ef21595..b36e840 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1090,7 +1481,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1090,7 +1473,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -5179,7 +7094,7 @@ index ef21595..b36e840 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1100,7 +1495,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1100,7 +1487,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -5191,7 +7106,7 @@ index ef21595..b36e840 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1112,26 +1511,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1112,26 +1503,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -5238,7 +7153,7 @@ index ef21595..b36e840 100644 } } } -@@ -1277,33 +1696,23 @@ do { +@@ -1277,47 +1688,120 @@ do { return 0; } @@ -5249,12 +7164,12 @@ index ef21595..b36e840 100644 - HEVCLocalContext *lc = s->HEVClc; GetBitContext gb; - int cb_size = 1 << log2_cb_size; - ptrdiff_t stride0 = s->frame->linesize[0]; - ptrdiff_t stride1 = s->frame->linesize[1]; - ptrdiff_t stride2 = 
s->frame->linesize[2]; - uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; - uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; - uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +- ptrdiff_t stride0 = s->frame->linesize[0]; +- ptrdiff_t stride1 = s->frame->linesize[1]; +- ptrdiff_t stride2 = s->frame->linesize[2]; +- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; - - int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + - (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + @@ -5271,11 +7186,47 @@ index ef21595..b36e840 100644 return ret; - s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); -+ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); - if (s->ps.sps->chroma_format_idc) { - s->hevcdsp.put_pcm(dst1, stride1, +- if (s->ps.sps->chroma_format_idc) { +- s->hevcdsp.put_pcm(dst1, stride1, ++#if RPI_HEVC_SAND ++ if (rpi_sliced_frame(s->frame)) { ++ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), ++ s->frame->linesize[0], ++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ ++ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), ++ s->frame->linesize[1], cb_size >> s->ps.sps->hshift[1], -@@ -1318,6 +1727,59 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) + cb_size >> s->ps.sps->vshift[1], + &gb, s->ps.sps->pcm.bit_depth_chroma); +- s->hevcdsp.put_pcm(dst2, stride2, +- cb_size >> s->ps.sps->hshift[2], +- cb_size >> s->ps.sps->vshift[2], +- &gb, s->ps.sps->pcm.bit_depth_chroma); + } ++ else ++#endif ++ { ++ const int stride0 = s->frame->linesize[0]; ++ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; ++ const int stride1 = s->frame->linesize[1]; ++ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ const int stride2 = s->frame->linesize[2]; ++ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; ++ ++ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ if (s->ps.sps->chroma_format_idc) { ++ s->hevcdsp.put_pcm(dst1, stride1, ++ cb_size >> s->ps.sps->hshift[1], ++ cb_size >> s->ps.sps->vshift[1], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ s->hevcdsp.put_pcm(dst2, stride2, ++ cb_size >> s->ps.sps->hshift[2], ++ cb_size >> s->ps.sps->vshift[2], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ } + ++ } return 0; } @@ -5312,9 +7263,22 @@ index ef21595..b36e840 100644 + if (s->enable_rpi) { + // Copy coeffs + const int blen = (length + 7) >> 3; -+ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, (blen + 1) >> 1); ++ // Round allocated bytes up to nearest 32 to avoid alignment confusion ++ // Allocation is in int16_t s ++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per ++ // sample this rounding doesn't affect the 
total size we need to allocate for ++ // the coeff buffer ++ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1); + memcpy(coeffs, pcm, blen); + ++ // Our coeff stash assumes that any partially allocated 64byte lump ++ // is zeroed so make that true. ++ { ++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; ++ if ((-(intptr_t)eopcm & 63) != 0) ++ memset(eopcm, 0, -(intptr_t)eopcm & 63); ++ } ++ + // Add command + { + HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; @@ -5335,99 +7299,7 @@ index ef21595..b36e840 100644 /** * 8.5.3.2.2.1 Luma sample unidirectional interpolation process * -@@ -1334,6 +1796,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) - * @param luma_offset additive offset applied to the luma prediction value - */ - -+#if RPI_INTER -+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref, const Mv *mv, int x_off, int y_off, -+ int block_w, int block_h, int luma_weight, int luma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_UNI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref->data[0]; -+ cmd->srcstride = ref->linesize[0]; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = luma_weight; -+ cmd->offset = luma_offset; -+} -+ -+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, -+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, -+ const struct MvField * const current_mv) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_BI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[0]; -+ cmd->srcstride = ref0->linesize[0]; -+ cmd->mv = *mv0; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[0]; -+ cmd->srcstride1 = ref1->linesize[0]; -+ cmd->mv1 = *mv1; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, -+ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_UNI; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = src0; -+ cmd->srcstride = srcstride; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = chroma_weight; -+ cmd->offset = chroma_offset; -+} -+ -+static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[cidx+1]; -+ cmd->srcstride = ref0->linesize[cidx+1]; -+ cmd->mv = current_mv->mv[0]; -+ cmd->mv1 = current_mv->mv[1]; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ 
cmd->block_h = block_h; -+ cmd->src1 = ref1->data[cidx+1]; -+ cmd->srcstride1 = ref1->linesize[cidx+1]; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+#endif -+ - static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref, const Mv *mv, int x_off, int y_off, - int block_w, int block_h, int luma_weight, int luma_offset) -@@ -1349,6 +1896,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1349,6 +1833,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -5438,7 +7310,7 @@ index ef21595..b36e840 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1395,7 +1946,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1395,7 +1883,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -5447,7 +7319,7 @@ index ef21595..b36e840 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1419,6 +1970,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1419,6 +1907,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -5458,7 +7330,7 @@ index ef21595..b36e840 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1504,6 +2059,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1504,6 +1996,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -5469,7 +7341,7 @@ index ef21595..b36e840 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1568,6 +2127,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1568,6 +2064,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -5480,7 +7352,7 @@ index ef21595..b36e840 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1695,14 +2258,312 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1695,14 +2195,582 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, } } @@ -5489,6 +7361,106 @@ index ef21595..b36e840 100644 - int log2_cb_size, int partIdx, int idx) + +#if RPI_INTER ++ ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) ++{ ++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; ++ HEVCRpiInterPredQ * ypt = yp + 1; ++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { ++ if (ypt->load < yp->load) ++ yp 
= ypt; ++ } ++ ++ yp->load += load_val; ++ ipe->used_grp = 1; ++ ((uint32_t *)yp->qpu_mc_curr)[-1] = fn; // Link is always last el of previous cmd ++ ++ return yp; ++} ++ ++ ++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) ++{ ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ ((uint32_t *)q->qpu_mc_curr)[-1] = q->code_sync; ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)((uint32_t *)q->qpu_mc_curr + 1); ++ q->load = 0; ++ } ++} ++ ++// Returns 0 on success, -1 if Q is dangerously full ++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) ++{ ++ if (!ipe->used_grp) ++ return 0; ++ ++ if ((ipe->curr += ipe->n_grp) >= ipe->n) ++ { ++ ipe->curr = 0; ++ rpi_inter_pred_sync(ipe); ++ } ++ ipe->used = 1; ++ ipe->used_grp = 0; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; ++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ ipe->curr = 0; ++ ipe->used = 0; ++ ipe->used_grp = 0; ++ for (i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base; ++ q->load = 0; ++ q->last_l0 = NULL; ++ q->last_l1 = NULL; ++ } ++} ++ ++static void rpi_alloc_inter_pred(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n, const unsigned int n_grp, ++ const unsigned int q1_size, const unsigned int min_gap, ++ const int * const * const setup_fns, ++ const int * const * const sync_fns, ++ const int * const * const exit_fns) ++{ ++ unsigned int i; ++ ++ memset(ipe, 0, sizeof(*ipe)); ++ av_assert0((ipe->q = av_mallocz(n * sizeof(*ipe->q))) != NULL); ++ ipe->n = n; ++ ipe->n_grp = n_grp; ++ ipe->q1_size = q1_size; ++ ipe->max_fill = ipe->q1_size - min_gap; ++ ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(n * q1_size, &ipe->gptr); ++#else ++ gpu_malloc_uncached(n * q1_size, &ipe->gptr); ++#endif ++ ++ for(i = 0; i < n; i++) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(setup_fns[i]); ++ q->code_sync = qpu_fn(sync_fns[i]); ++ q->code_exit = qpu_fn(exit_fns[i]); ++ } ++} ++ ++ +static void +rpi_pred_y(HEVCContext *const s, const int x0, const int y0, + const int nPbW, const int nPbH, @@ -5497,69 +7469,175 @@ index ef21595..b36e840 100644 + const int weight_offset, + AVFrame *const src_frame) +{ -+ const unsigned int y_off = x0 + y0 * s->frame->linesize[0]; -+ -+ rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, -+ mv, x0, y0, nPbW, nPbH, -+ weight_mul, weight_offset); ++ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); ++ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].luma_ip; + ++ if (my_mx == 0) ++ { ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ ++#if Y_P_MAX_H == 0 ++ const int bh = nPbH; ++ const int start_y = 0; ++#else ++ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += 
s->frame->linesize[0] * Y_P_MAX_H) ++ { ++ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++#endif ++ ++ for (int start_x = 0; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu_filter_y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; ++ ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred1_x0y0; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ ++ src1->x = x1 + start_x; ++ src1->y = y1 + start_y; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + start_x; ++ yp->last_l0 = &cmd_y->next_src1; ++ *(qpu_mc_pred_y_p00_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ } ++#if Y_P_MAX_H != 0 ++ } ++#endif ++ } ++ else + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; + const int x1_m3 = x0 + (mv->x >> 2) - 3; + const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); -+ uint32_t *y = s->curr_y_mvs; -+ uint32_t dst_base = get_vc_address_y(s->frame) + y_off; -+ const uint32_t wo_0 = PACK2(weight_offset * 2 + 1, weight_mul); + -+ // Potentially we could change the assembly code to support taller sizes in one go -+ for (int start_y = 0; start_y < nPbH; start_y += 16, dst_base += s->frame->linesize[0] * 16) { ++#if Y_P_MAX_H == 0 ++ const int bh = nPbH; ++ const int start_y = 0; ++#else ++ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * Y_P_MAX_H) ++ { ++ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++#endif + const uint32_t src_yx_y = y1_m3 + start_y; + int start_x = 0; -+ const int bh = FFMIN(nPbH - start_y, 16); -+ uint32_t *const py = y - RPI_LUMA_COMMAND_WORDS; -+ uint32_t *const ppy = y - RPI_LUMA_COMMAND_WORDS * 2; + ++#if 1 + // As Y-pred operates on two independant 8-wide src blocks we can merge + // this pred with the previous one if it the previous one is 8 pel wide, + // the same height as the current block, immediately to the left of our + // current dest block and mono-pred. -+ // -+ // In the init (1st) block w/h is pic width height so given -+ // that no pic will ever be 8 pixels wide the first test here -+ // should fail if this is the first pred (i.e. 
after that test -+ // ppy is valid) -+ if (py[4] == ((8 << 16) | bh) && py[8] + 8 == dst_base && ppy[9] == s->qpu_filter) { ++ ++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + 8 == dst_addr) ++ { + const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; + -+ ppy[2] = PACK2(src_yx_y, x1_m3); -+ ppy[3] = src_vc_address_y; -+ py[4] += bw << 16; -+ py[5] = PACK2(my2_mx2_my_mx, py[5]); -+ // py[6] stays the same -+ py[7] = wo_0; ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = src_yx_y; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; + ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; + start_x = bw; ++#if RPI_TSTATS ++ ++s->tstats.y_pred1_y8_merge; ++#endif + } ++#endif + -+ for (; start_x < nPbW; start_x += 16) { -+ const int bw = FFMIN(nPbW - start_x, 16);; -+ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(src_yx_y, x1_m3 + start_x); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(src_yx_y, x1_m3 + 8 + start_x); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ *y++ = PACK2(bw, bh); -+ *y++ = my2_mx2_my_mx; -+ *y++ = wo_0; -+ *y++ = wo_0; -+ *y++ = dst_base + start_x; -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->qpu_filter; ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu_filter); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = src_yx_y; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++ src2->base = s->qpu_dummy_frame; ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = src_yx_y; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + start_x; ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ ++ if (bw == 8) { ++ s->last_y8_l1 = src2; ++ s->last_y8_p = cmd_y; ++ } + } ++#if Y_P_MAX_H != 0 + } -+ s->curr_y_mvs = y; ++#endif + } +} + @@ -5571,58 +7649,146 @@ index ef21595..b36e840 100644 + AVFrame *const src_frame, + AVFrame *const src_frame2) +{ -+ const unsigned int y_off = x0 + y0 * s->frame->linesize[0]; ++ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + -+ rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, -+ mv, x0, y0, nPbW, nPbH, -+ src_frame2, mv2, mv_field); -+#if !Y_B_ONLY ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; 
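++ // Both fractional parts are packed into one uniform: the low halfword
++ // is (my << 8) | mx for L0 and the high halfword the same for L1; this
++ // becomes the mymx21 field consumed by the QPU luma filter.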
++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + ++ s->sh.luma_offset_l1[ref_idx1] + 1; ++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ uint32_t dst = get_vc_address_y(s->frame) + y_off; ++ const uint32_t src1_base = get_vc_address_y(src_frame); ++ const uint32_t src2_base = get_vc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].luma_ip; ++ ++ if (my2_mx2_my_mx == 0) + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = mv2->x & 3; -+ const unsigned int my2 = mv2->y & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int x2 = x0 + (mv2->x >> 2); ++ const int y2 = y0 + (mv2->y >> 2); ++ ++#if Y_B_MAX_H == 0 ++ const int bh = nPbH; ++ const int start_y = 0; ++#else ++ for (int start_y = 0; start_y < nPbH; start_y += Y_B_MAX_H, dst += s->frame->linesize[0] * Y_B_MAX_H) ++ { ++ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); ++#endif ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu_filter_y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1 + start_y; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2 + start_y; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + start_x; ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ } ++#if Y_P_MAX_H != 0 ++ } ++#endif ++ } ++ else ++ { ++ // Filter requires a run-up of 3 + const int x1 = x0 + (mv->x >> 2) - 3; + const int y1 = y0 + (mv->y >> 2) - 3; + const int x2 = x0 + (mv2->x >> 2) - 3; + const int y2 = y0 + (mv2->y >> 2) - 3; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + -+ s->sh.luma_offset_l1[ref_idx1] + 1; -+ const uint32_t wo_0 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo_1 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); + -+ uint32_t * y = s->curr_y_mvs; -+ uint32_t dst = get_vc_address_y(s->frame) + y_off; -+ -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time -+ int bw = nPbW-start_x; -+ int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(y1 + start_y, x1 + start_x); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(src_frame); -+ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(y2 + start_y, x2 + 
start_x); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(src_frame2); -+ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ -+ *y++ = wo_0; -+ *y++ = wo_1; -+ -+ *y++ = dst + start_x; -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->qpu_filter_b; -+ } -+ dst += s->frame->linesize[0] * 16; -+ } -+ s->curr_y_mvs = y; -+ } ++#if Y_B_MAX_H == 0 ++ const int bh = nPbH; ++ const int start_y = 0; ++#else ++ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H, dst += s->frame->linesize[0] * Y_B_MAX_H) ++ { ++ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); +#endif ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu_filter_b); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1 + start_y; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2 + start_y; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + start_x; ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ } ++#if Y_B_MAX_H != 0 ++ } ++#endif ++ } +} + + @@ -5634,65 +7800,48 @@ index ef21595..b36e840 100644 + const int16_t * const c_offsets, + AVFrame * const src_frame) +{ ++ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; + -+ const unsigned int c_off = x0_c + y0_c * s->frame->linesize[1]; -+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); -+ -+ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[0], c_offsets[0]); -+ -+ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[1], c_offsets[1]); ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const uint32_t src_base_u = get_vc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].chroma_ip; + ++ for(int start_y=0;start_y < nPbH_c;start_y+=16) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ const int bh = FFMIN(nPbH_c-start_y, 16); + -+ const int x1_c = x0_c + 
(mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; -+ const uint32_t src_base_u = get_vc_address_u(src_frame); -+ const uint32_t src_base_v = get_vc_address_v(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ uint32_t dst_base_v = get_vc_address_v(s->frame) + c_off; -+ -+ qpu_mc_pred_c_t * u = (qpu_mc_pred_c_t *)s->curr_u_mvs; -+ -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) + { -+ const int bh = FFMIN(nPbH_c-start_y, 16); -+ // We are allowed 3/4 powers of two as well as powers of 2 -+ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, s->qpu_filter_uv); ++ qpu_mc_pred_c_p_t * const u = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t * const last_l0 = cp->last_l0; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH, ++u) -+ { -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ u[-1].next_fn = s->qpu_filter_uv; -+ u[-1].next_src_x = x1_c + start_x; -+ u[-1].next_src_y = y1_c + start_y; -+ u[-1].next_src_base_u = src_base_u; -+ u[-1].next_src_base_v = src_base_v; -+ u[0].p.h = bh; -+ u[0].p.w = bw; -+ u[0].p.coeffs_x = x_coeffs; -+ u[0].p.coeffs_y = y_coeffs; -+ u[0].p.wo_u = wo_u; -+ u[0].p.wo_v = wo_v; -+ u[0].p.dst_addr_u = dst_base_u + start_x; -+ u[0].p.dst_addr_v = dst_base_v + start_x; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ dst_base_v += s->frame->linesize[2] * 16; ++ last_l0->x = x1_c + start_x; ++ last_l0->y = y1_c + start_y; ++ last_l0->base = src_base_u; ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x = x_coeffs; ++ u[0].coeffs_y = y_coeffs; ++ u[0].wo_u = wo_u; ++ u[0].wo_v = wo_v; ++ u[0].dst_addr_c = dst_base_u + start_x * 2; ++ cp->last_l0 = &u->next_src; ++ *(qpu_mc_pred_c_p_t **)&cp->qpu_mc_curr = u + 1; + } -+ s->curr_u_mvs = (uint32_t *)u; ++ ++ dst_base_u += s->frame->linesize[1] * 16; + } -+ return; ++ return; +} + +static void @@ -5706,81 +7855,74 @@ index ef21595..b36e840 100644 + AVFrame * const src_frame, + AVFrame * const src_frame2) +{ -+ const unsigned int c_off = x0_c + y0_c * s->frame->linesize[1]; -+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; + -+ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0); ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; + -+ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, 
s->frame->linesize[2], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector + ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ const uint32_t src1_base = get_vc_address_u(src_frame); ++ const uint32_t src2_base = get_vc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].chroma_ip; ++ ++ for (int start_y = 0; start_y < nPbH_c; start_y += 16) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); + -+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) ++ { ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu_filter_uv_b0); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; + -+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c + start_y; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c + start_y; ++ src_l1->base = src2_base; + -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ uint32_t dst_base_v = get_vc_address_v(s->frame) + c_off; -+ qpu_mc_pred_c_t * u = (qpu_mc_pred_c_t *)s->curr_u_mvs; ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); ++ u[0].wo_v2 = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); ++ u[0].dst_addr_c = dst_base_u + start_x * 2; + -+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH, u += 2) { -+ int bw = nPbW_c-start_x; -+ int bh = nPbH_c-start_y; -+ u[-1].next_fn = s->qpu_filter_uv_b0; // In fact ignored -+ u[-1].next_src_x = x1_c + start_x; -+ u[-1].next_src_y = y1_c + start_y; -+ u[-1].next_src_base_u = get_vc_address_u(src_frame); -+ u[-1].next_src_base_v = get_vc_address_v(src_frame); -+ -+ u[0].next_fn = 
s->qpu_filter_uv_b; -+ u[0].next_src_x = x2_c + start_x; -+ u[0].next_src_y = y2_c + start_y; -+ u[0].next_src_base_u = get_vc_address_u(src_frame2); -+ u[0].next_src_base_v = get_vc_address_v(src_frame2); -+ -+ u[0].b0.h = (bh<16 ? bh : 16); -+ u[0].b0.w = (bwframe->linesize[1] * 16; -+ dst_base_v += s->frame->linesize[2] * 16; ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ *(qpu_mc_pred_c_b_t **)&cp->qpu_mc_curr = u + 1; + } + -+ s->curr_u_mvs = (uint32_t *)u; ++ dst_base_u += s->frame->linesize[1] * 16; + } +} ++ ++ +#endif + + @@ -5797,7 +7939,7 @@ index ef21595..b36e840 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1720,8 +2581,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1720,8 +2788,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -5807,7 +7949,7 @@ index ef21595..b36e840 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1765,12 +2625,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1765,12 +2832,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -5838,7 +7980,7 @@ index ef21595..b36e840 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -@@ -1784,12 +2661,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1784,12 +2868,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -5869,7 +8011,7 @@ index ef21595..b36e840 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); -@@ -1804,11 +2698,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1804,11 +2905,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -5902,7 +8044,7 @@ index ef21595..b36e840 100644 chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -@@ -2083,7 +2997,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -2083,7 +3204,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ret = hls_pcm_sample(s, x0, y0, log2_cb_size); if (s->ps.sps->pcm.loop_filter_disable_flag) @@ -5912,7 +8054,7 @@ index ef21595..b36e840 100644 if (ret < 0) return ret; -@@ -2306,6 +3222,741 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2306,6 +3429,373 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } @@ -5958,9 +8100,16 @@ index ef21595..b36e840 100644 +} +#endif + ++ +// I-pred, transform_and_add for all blocks types done here +// All ARM ++#define 
RPI_OPT_SEP_PRED 0 ++ ++#if RPI_OPT_SEP_PRED ++static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) ++#else +static void rpi_execute_pred_cmds(HEVCContext * const s) ++#endif +{ + int i; + int job = s->pass1_job; @@ -5972,7 +8121,12 @@ index ef21595..b36e840 100644 +#endif + + for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { -+ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++#if RPI_OPT_SEP_PRED ++ if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) { ++ continue; ++ } ++#endif + + switch (cmd->type) + { @@ -5983,16 +8137,26 @@ index ef21595..b36e840 100644 + lc->na.cand_up_left = (cmd->na >> 2) & 1; + lc->na.cand_up = (cmd->na >> 1) & 1; + lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ else ++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); + break; + -+ case RPI_PRED_TRANSFORM_ADD: ++ case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+#ifdef RPI_PRECLEAR -+ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache -+#endif + break; -+ ++#if RPI_HEVC_SAND ++ case RPI_PRED_ADD_RESIDUAL_U: ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_V: ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_C: ++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++#endif + case RPI_PRED_I_PCM: + pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); + break; @@ -6002,463 +8166,141 @@ index ef21595..b36e840 100644 + abort(); + } + } -+ s->num_pred_cmds[job] = 0; ++#if RPI_OPT_SEP_PRED ++ if (do_luma) ++#endif ++ { ++ s->num_pred_cmds[job] = 0; ++ } +} + -+// Do any inter-pred that we want to do in software -+// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here -+// All ARM -+static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) -+{ -+ unsigned int cidx; -+ AVFrame myref; -+ AVFrame myref1; -+ struct MvField mymv; -+ -+ for(; n>0 ; n--, cmd++) { -+ switch(cmd->cmd) { -+ case RPI_CMD_LUMA_UNI: -+ if (b_only) -+ break; -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_LUMA_BI: -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ myref1.data[0] = cmd->src1; -+ myref1.linesize[0] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ luma_mc_bi(s, cmd->dst, cmd->dststride, -+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, -+ &myref1, &cmd->mv1, &mymv); -+ break; -+ case RPI_CMD_CHROMA_UNI: -+ if (b_only) -+ break; -+ mymv.mv[0] = cmd->mv; -+ chroma_mc_uni(s, cmd->dst, -+ cmd->dststride, cmd->src, cmd->srcstride, 0, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, 
cmd->offset); -+ break; -+ case RPI_CMD_CHROMA_BI: -+ case RPI_CMD_CHROMA_BI+1: -+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI; -+ myref.data[cidx+1] = cmd->src; -+ myref.linesize[cidx+1] = cmd->srcstride; -+ myref1.data[cidx+1] = cmd->src1; -+ myref1.linesize[cidx+1] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ mymv.mv[0] = cmd->mv; -+ mymv.mv[1] = cmd->mv1; -+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); -+ break; -+ } -+ } -+} -+ -+static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) -+{ -+ const int job = s->pass1_job; -+ -+ if (!qpu_luma || luma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); -+ s->num_mv_cmds_y[job] = 0; -+ if (!qpu_chroma || chroma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); -+ s->num_mv_cmds_c[job] = 0; -+} + +#endif + +#ifdef RPI ++ +// Set initial uniform job values & zero ctu_count +static void rpi_begin(HEVCContext *s) +{ +#if RPI_INTER + int job = s->pass0_job; + int i; ++ HEVCRpiJob * const jb = s->jobs + job; ++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; ++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; + -+ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1]; ++ const uint16_t pic_width_y = s->ps.sps->width; ++ const uint16_t pic_height_y = s->ps.sps->height; + ++ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; ++ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; ++ ++ rpi_inter_pred_reset(cipe); + for(i=0; i < QPU_N_UV;i++) { -+ qpu_mc_pred_c_t * const u = (qpu_mc_pred_c_t *)s->mvs_base[job][i]; ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; ++ ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = rpi_sliced_frame_stride2(s->frame); ++ u->stride1 = s->frame->linesize[1]; ++ u->wdenom = s->sh.chroma_log2_weight_denom + 6; ++ cp->last_l0 = &u->next_src1; + + u->next_fn = 0; -+ u->next_src_x = 0; -+ u->next_src_y = 0; -+ u->next_src_base_u = 0; -+ u->next_src_base_v = 0; -+ u->s.pic_w = pic_width; -+ u->s.pic_h = pic_height; -+ u->s.src_stride = s->frame->linesize[1]; -+ u->s.dst_stride = s->frame->linesize[1]; -+ u->s.wdenom = s->sh.chroma_log2_weight_denom + 6; -+ u->s.dummy0 = 0; -+ u->s.dummy1 = 0; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; + -+ s->u_mvs[job][i] = (uint32_t *)(u + 1); ++ *(qpu_mc_pred_c_s_t **)&cp->qpu_mc_curr = u + 1; + } -+ s->curr_u_mvs = s->u_mvs[job][0]; + ++ rpi_inter_pred_reset(yipe); + for(i=0;i < QPU_N_Y;i++) { -+ // This needs to have a generally similar structure to the -+ // actual filter code as various pipelined bits need to land correctly -+ // when inserted by the filter requests -+ s->y_mvs[job][i] = s->y_mvs_base[job][i]; -+ *s->y_mvs[job][i]++ = 0; // y_x -+ *s->y_mvs[job][i]++ = 0; // ref_y_base -+ *s->y_mvs[job][i]++ = 0; // y2_x2 -+ *s->y_mvs[job][i]++ = 0; // ref_y2_base -+ *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height; -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch -+ *s->y_mvs[job][i]++ = 
s->sh.luma_log2_weight_denom + 6; // weight demon + 6 -+ *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block -+ *s->y_mvs[job][i]++ = 0; // Next kernel ++ HEVCRpiInterPredQ * const yp = s->jobs[job].luma_ip.q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; ++ ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = rpi_sliced_frame_stride2(s->frame); ++ y->stride1 = s->frame->linesize[0]; ++ y->wdenom = s->sh.luma_log2_weight_denom + 6; ++ y->next_fn = 0; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; ++ ++ *(qpu_mc_pred_y_s_t **)&yp->qpu_mc_curr = y + 1; + } -+ s->curr_y_mvs = s->y_mvs[job][0]; ++ ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; +#endif + s->ctu_count = 0; +} +#endif + -+#ifdef RPI_SIMULATE_QPUS -+#error Rotted -+ -+static int32_t clipx(int x,int FRAME_WIDTH) -+{ -+ if (x<=0) return 0; -+ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1; -+ return x; -+} -+ -+static int32_t clipy(int y,int FRAME_HEIGHT) -+{ -+ if (y<=0) return 0; -+ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1; -+ return y; -+} -+ -+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch]; -+ -+ vsum += lumaFilter[my][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+round)>>denom)+offset; -+ -+ return av_clip_uint8( vsum ); -+}*/ -+ -+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int chromaFilterH[4]; -+ int chromaFilterV[4]; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ for(i=0;i<4;i++) { -+ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24; -+ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24; -+ } -+ -+ for (y = 0; y < 4; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 4; x++) -+ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += chromaFilterV[y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} }; -+ -+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx) -+{ -+ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx]; -+ int pitch = frame->linesize[cIdx]; -+ uint32_t base = cIdx == 0 ? 
get_vc_address_y(frame) :
-+ cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
-+ if (p>=base && p<base+pic_height*pitch) {
-+ return frame->data[cIdx] + (p-base);
-+ }
-+ return NULL;
-+}
-+
-+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
-+{
-+ SliceHeader *sh = &s->sh;
-+ uint8_t *arm = test_frame(s,p,s->frame,cIdx);
-+ int i;
-+ if (arm) return arm;
-+ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
-+ {
-+ for(i=0;i<sh->nb_refs[L0];i++) {
-+ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
-+ if (arm) return arm;
-+ }
-+ }
-+ if (sh->slice_type == B_SLICE) {
-+ for(i=0;i<sh->nb_refs[L1];i++) {
-+ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
-+ if (arm) return arm;
-+ }
-+ }
-+ printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
-+ exit(-1);
-+ return NULL;
-+}
-+
-+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
-+{
-+ uint32_t next_kernel;
-+ uint32_t x0;
-+ uint32_t y0;
-+ uint8_t *ref_u_base;
-+ uint8_t *ref_v_base;
-+ uint32_t frame_width = p[5];
-+ uint32_t frame_height = p[6];
-+ uint32_t pitch = p[7];
-+ uint32_t dst_pitch = p[8];
-+ int32_t offset_before = p[9];
-+ int32_t denom = p[10];
-+ uint32_t vpm_id = p[11];
-+ uint32_t tmp_u_dst[256];
-+ uint32_t tmp_v_dst[256];
-+ while(1) {
-+ p += 12;
-+ next_kernel = p[0-12];
-+ x0 = p[1-12];
-+ y0 = p[2-12];
-+ if (next_kernel==s->qpu_filter_uv || next_kernel==s->qpu_filter_uv_b0 || next_kernel==s->qpu_filter_uv_b) {
-+ int x,y;
-+ uint32_t width_height = p[5];
-+ uint32_t hcoeffs = p[6];
-+ uint32_t vcoeffs = p[7];
-+ uint32_t offset_weight_u = p[8];
-+ uint32_t offset_weight_v = p[9];
-+ uint8_t *this_u_dst;
-+ uint8_t *this_v_dst;
-+ uint32_t width = width_height >> 16;
-+ uint32_t height = (width_height << 16) >> 16;
-+ ref_u_base = compute_arm_addr(s,p[3-12],1);
-+ ref_v_base = compute_arm_addr(s,p[4-12],2);
-+ if (next_kernel!=s->qpu_filter_uv_b0)
-+ {
-+ this_u_dst = compute_arm_addr(s,p[10],1);
-+ this_v_dst = compute_arm_addr(s,p[11],2);
-+ }
-+ for (y=0; y<height; y++) {
-+ for (x=0; x<width; x++) {
-+ if (next_kernel==s->qpu_filter_uv) {
-+ int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
-+ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
-+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+ } else if (next_kernel==s->qpu_filter_uv_b0) {
-+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-+ tmp_u_dst[x+y*16] = refa;
-+ tmp_v_dst[x+y*16] = refb;
-+ } else {
-+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
-+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
-+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+ }
-+ }
-+ }
-+ } else {
-+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-+ break;
-+ }
-+ }
-+}
-+
-+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
-+{
-+ uint32_t next_kernel;
-+ int 
y_x,y2_x2;
-+ int x0;
-+ int y0;
-+ int x2;
-+ int y2;
-+ uint32_t *p0 = p;
-+ uint8_t *ref_y_base;
-+ uint8_t *ref_y2_base;
-+ uint32_t frame_width_height = p[4];
-+ uint32_t frame_width = frame_width_height>>16;
-+ uint32_t frame_height = (frame_width_height<<16)>>16;
-+ uint32_t pitch = p[5];
-+ uint32_t dst_pitch = p[6];
-+ int offset_shift = p[7];
-+ int32_t offset_before = offset_shift>>16;
-+ int32_t denom = (offset_shift<<16)>>16;
-+ while(1) {
-+ p += 9;
-+ next_kernel = p[8-9];
-+ y_x = p[0-9];
-+ x0 = (y_x<<16)>>16;
-+ y0 = y_x>>16;
-+ y2_x2 = p[2-9];
-+ x2 = (y2_x2<<16)>>16;
-+ y2 = y2_x2>>16;
-+
-+ if (next_kernel==s->qpu_filter || next_kernel==s->qpu_filter_b) {
-+ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+ int x,y;
-+ uint32_t width_height = p[4];
-+ uint32_t my2_mx2_my_mx = p[5];
-+ uint32_t offset_weight = p[6];
-+ uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-+ uint32_t width = width_height >> 16;
-+ uint32_t height = (width_height << 16) >> 16;
-+ uint8_t *dst_base = s->frame->data[0];
-+ ref_y_base = compute_arm_addr(s,p[1-9],0);
-+ ref_y2_base = compute_arm_addr(s,p[3-9],0);
-+ for (y=0; y<height; y++) {
-+ for (x=0; x<width; x++) {
-+ if (next_kernel==s->qpu_filter) {
-+ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-+ refa = av_clip_uint8(refa);
-+ this_dst[x+y*dst_pitch] = refa;
-+ }
-+ else {
-+ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-+ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
-+ this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+ }
-+ }
-+ }
-+ } else {
-+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-+ break;
-+ }
-+ }
-+}
-+
-+static void rpi_simulate_inter_qpu(HEVCContext *s)
-+{
-+ // First run the transform as normal
-+ int i;
-+ rpi_execute_transform(s);
-+ for(i=0;i<8;i++)
-+ {
-+ rpi_simulate_inter_chroma(s,s->mvs_base[i]);
-+ }
-+ for(i=0;i<12;i++)
-+ {
-+ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
-+ }
-+}
-+
-+#endif
-+
+
+#if RPI_INTER
-+static unsigned int mc_terminate_y(HEVCContext * const s, const int job)
++static unsigned int mc_terminate_add(HEVCContext * const s,
++ const vpu_qpu_job_h vqj,
++ rpi_cache_flush_env_t * const rfe,
++ HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
-+ const uint32_t exit_fn = qpu_fn(mc_exit);
-+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12);
-+ const uint32_t dummy_texture = qpu_fn(mc_setup_uv);
-+ unsigned int tc = 0;
++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
+
-+ // Add final commands to Q
-+ for(i = 0; i != QPU_N_Y; ++i) {
-+ uint32_t * const pu = s->y_mvs[job][i] - RPI_LUMA_COMMAND_WORDS;
-+ const int cmd_count = pu - s->y_mvs_base[job][i];
-+ tc += cmd_count;
-+
-+ av_assert0(cmd_count < Y_COMMANDS_PER_QPU - 1);
-+
-+ // We use this code as a dummy texture - safe?
-+ pu[0] = 0; // x,y
-+ pu[1] = dummy_texture;
-+ pu[2] = 0;
-+ pu[3] = dummy_texture;
-+ pu[RPI_LUMA_COMMAND_WORDS - 1] = (i != QPU_N_Y - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr ++ if (!ipe->used) { ++ return 0; + } + -+ return tc; -+} ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } + -+static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) -+{ -+ unsigned int i; -+ const uint32_t exit_fn = qpu_fn(mc_exit_c); -+#if QPU_N_UV == 8 -+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit8c); -+#elif QPU_N_UV == 12 -+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12c); -+#else -+#error Need appropriate exit code ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ ((uint32_t *)yp->qpu_mc_curr)[-1] = yp->code_exit; ++ ++ av_assert0((char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base <= ipe->q1_size); ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++#if RPI_CACHE_UNIF_MVS ++ rpi_cache_flush_add_gm_ptr(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +#endif -+ const uint32_t dummy_texture = qpu_fn(mc_setup_uv); -+ unsigned int tc = 0; ++ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, (uint32_t *)mail); + -+ // Add final commands to Q -+ for(i = 0; i != QPU_N_UV; ++i) { -+ qpu_mc_pred_c_t * const pu = (qpu_mc_pred_c_t *)s->u_mvs[job][i] - 1; -+ const int cmd_count = (uint32_t *)pu - s->mvs_base[job][i]; -+ tc += cmd_count; -+ -+ pu->next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2; // Actual fn ptr -+ // Need to set the src to something that can be (pointlessly) prefetched -+ pu->next_src_x = 0; -+ pu->next_src_y = 0; -+ // We use this code as a dummy texture - safe? 
-+ pu->next_src_base_u = dummy_texture; -+ pu->next_src_base_v = dummy_texture; -+ } -+ -+ return tc; ++ return 1; +} ++ +#endif + +#ifdef RPI @@ -6475,17 +8317,10 @@ index ef21595..b36e840 100644 +// Core execution tasks +static void worker_core(HEVCContext * const s) +{ -+ worker_global_env_t * const wg = &worker_global_env; -+ int arm_cost = 0; -+// vpu_qpu_wait_h sync_c; ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_wait_h sync_c; ++#endif + vpu_qpu_wait_h sync_y; -+ int qpu_luma = 0; -+ int qpu_chroma = 0; -+ int gpu_load; -+ int arm_load; -+ static const int arm_const_cost = 2; -+ -+// static int z = 0; + + const int job = s->pass1_job; + unsigned int flush_start = 0; @@ -6509,36 +8344,6 @@ index ef21595..b36e840 100644 + + +#if RPI_INTER -+ pthread_mutex_lock(&wg->lock); -+ -+// ++z; -+ gpu_load = vpu_qpu_current_load(); -+ arm_load = avpriv_atomic_int_get(&wg->arm_load); -+#if !Y_B_ONLY -+ qpu_luma = gpu_load + 2 < arm_load; -+ qpu_chroma = gpu_load < arm_load + 8; -+#elif 1 -+ qpu_luma = gpu_load < arm_load + 2; -+ qpu_chroma = gpu_load < arm_load + 8; -+#else -+ qpu_chroma = 1; -+ qpu_luma = 1; -+#endif -+ -+ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); -+ -+ wg->gpu_c += qpu_chroma; -+ wg->gpu_y += qpu_luma; -+ wg->arm_c += !qpu_chroma; -+ wg->arm_y += !qpu_luma; -+ -+ -+// if ((z & 511) == 0) { -+// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); -+// } -+ -+ + { + int (*d)[2] = s->dblk_cmds[job]; + unsigned int high=(*d)[1]; @@ -6550,60 +8355,26 @@ index ef21595..b36e840 100644 + flush_start = FFMIN(flush_start, y); + high=FFMAX(high,y); + } -+ // Avoid flushing past end of frame -+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start; ++ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->ps.sps->height) - flush_start; + } + -+ if (qpu_chroma && mc_terminate_uv(s, job) != 0) ++ if (mc_terminate_add(s, vqj, rfe, &s->jobs[job].chroma_ip) != 0) + { -+ uint32_t * const unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc; -+ const uint32_t code = qpu_fn(mc_setup_uv); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { -+ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, s->unif_mvs_ptr + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, flush_start, s->ps.sps->width, flush_count, s->ps.sps->vshift[1], 0, 1); + } + +// We can take a sync here and try to locally overlap QPU processing with ARM +// but testing showed a slightly negative benefit with noticable extra complexity -+// vpu_qpu_job_add_sync_this(vqj, &sync_c); -+ -+ if (qpu_luma && mc_terminate_y(s, job) != 0) -+ { -+ uint32_t * const y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc; -+ const uint32_t code = qpu_fn(mc_setup); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { -+ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - 
(uint32_t*)s->y_unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, s->y_unif_mvs_ptr + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_job_add_sync_this(vqj, &sync_c); +#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0); ++ ++ if (mc_terminate_add(s, vqj, rfe, &s->jobs[job].luma_ip) != 0) ++ { ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, flush_start, s->ps.sps->width, flush_count, s->ps.sps->vshift[1], 1, 0); + } -+ -+ pthread_mutex_unlock(&wg->lock); -+ +#endif + + vpu_qpu_job_add_sync_this(vqj, &sync_y); @@ -6612,31 +8383,35 @@ index ef21595..b36e840 100644 + rpi_cache_flush_finish(rfe); + vpu_qpu_job_finish(vqj); + -+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //???? Surely we haven't done the smaller ++ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); + -+#if Y_B_ONLY -+ if (qpu_luma) -+ vpu_qpu_wait(&sync_y); -+#endif -+ // Perform inter prediction -+ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); ++ // We would do ARM inter prediction here but no longer ++ // Look back in git if you find you want it back - As we have ++ // no arm/neon sand pred code there doesn't seem a lot of point ++ // keeping it around + ++#if RPI_OPT_SEP_PRED + // Wait for transform completion ++ vpu_qpu_wait(&sync_c); + + // Perform intra prediction and residual reconstruction -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); -+#if Y_B_ONLY -+ if (!qpu_luma) -+ vpu_qpu_wait(&sync_y); -+#else ++ rpi_execute_pred_cmds(s, 0, 1); ++ ++ // Wait for transform completion + vpu_qpu_wait(&sync_y); -+#endif ++ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, 1, 0); ++#else ++ // Wait for transform completion ++ vpu_qpu_wait(&sync_y); ++ ++ // Perform intra prediction and residual reconstruction + rpi_execute_pred_cmds(s); ++#endif + + // Perform deblocking for CTBs in this row + rpi_execute_dblk_cmds(s); -+ -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +} + +static void rpi_do_all_passes(HEVCContext *s) @@ -6648,19 +8423,19 @@ index ef21595..b36e840 100644 +} + + -+ +#endif + static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2315,6 +3966,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2315,6 +3805,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 -+ && !s->ps.pps->cross_component_prediction_enabled_flag; ++ s->enable_rpi = s->ps.sps->bit_depth == 8 && ++ s->frame->format == AV_PIX_FMT_SAND128 && ++ !s->ps.pps->cross_component_prediction_enabled_flag; + + if (!s->enable_rpi) { + if (s->ps.pps->cross_component_prediction_enabled_flag) @@ -6672,7 +8447,7 @@ index ef21595..b36e840 100644 if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2328,6 +3990,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2328,6 +3830,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } @@ -6687,7 +8462,7 @@ index 
ef21595..b36e840 100644 while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2335,6 +4005,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2335,6 +3845,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); @@ -6695,56 +8470,48 @@ index ef21595..b36e840 100644 ff_hevc_cabac_init(s, ctb_addr_ts); hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -@@ -2343,7 +4014,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; +@@ -2344,6 +3855,49 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+#if RPI_INTER -+ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % QPU_N_UV]; -+ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % QPU_N_Y]; -+#endif -+ more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + +#ifdef RPI -+#if RPI_INTER -+ s->u_mvs[s->pass0_job][s->ctu_count % QPU_N_UV]= s->curr_u_mvs; -+ s->y_mvs[s->pass0_job][s->ctu_count % QPU_N_Y] = s->curr_y_mvs; -+#endif -+ + if (s->enable_rpi) { -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]pass0_jobpass0_job>=0); -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; -+ s->ctu_count++; ++ int q_full = (s->ctu_count >= s->max_ctu_count); + -+ if ( s->ctu_count >= s->max_ctu_count ) { ++ if (rpi_inter_pred_next_ctu(&s->jobs[s->pass0_job].luma_ip) != 0) ++ q_full = 1; ++ if (rpi_inter_pred_next_ctu(&s->jobs[s->pass0_job].chroma_ip) != 0) ++ q_full = 1; ++ ++ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; ++ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; ++ s->ctu_count++; ++ ++ if (q_full) { +#ifdef RPI_WORKER -+ if (s->used_for_ref) -+ { -+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); ++ if (s->used_for_ref) ++ { ++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); + -+// worker_wait(s); -+ // Split work load onto separate threads so we make as rapid progress as possible with this frame -+ // Pass on this job to worker thread -+ worker_submit_job(s); ++// worker_wait(s); ++ // Split work load onto separate threads so we make as rapid progress as possible with this frame ++ // Pass on this job to worker thread ++ worker_submit_job(s); + -+ // Make sure we have space to prepare the next job -+ worker_pass0_ready(s); ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s); + -+ // Prepare the next batch of commands -+ rpi_begin(s); -+ } else { -+ // Non-ref frame so do it all on this thread -+ rpi_do_all_passes(s); -+ } ++ // Prepare the next batch of commands ++ rpi_begin(s); ++ } else { ++ // Non-ref frame so do it all on this thread ++ rpi_do_all_passes(s); ++ } +#else -+ rpi_do_all_passes(s); ++ rpi_do_all_passes(s); +#endif -+ } ++ } + + } +#endif @@ -6753,7 +8520,7 @@ index ef21595..b36e840 100644 if 
(more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2352,9 +4073,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2352,9 +3906,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -6778,12 +8545,25 @@ index ef21595..b36e840 100644 + rpi_do_all_passes(s); + } + ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", ++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, ++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, ++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, ++ ts->y_pred2_hgt16, ts->y_pred2_hle16); ++ memset(ts, 0, sizeof(*ts)); ++ } ++#endif ++ +#endif + if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2389,6 +4130,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2389,6 +3976,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -6795,7 +8575,7 @@ index ef21595..b36e840 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2771,6 +4517,20 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2771,6 +4363,33 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; @@ -6809,14 +8589,27 @@ index ef21595..b36e840 100644 + s->nal_unit_type == HEVC_NAL_RADL_N || + s->nal_unit_type == HEVC_NAL_RASL_N); + ++#if DEBUG_DECODE_N ++ { ++ static int z = 0; ++ if (IS_IDR(s)) { ++ z = 1; ++ } ++ if (z != 0 && z++ > DEBUG_DECODE_N) { ++ s->is_decoded = 0; ++ break; ++ } ++ } ++#endif + if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { + s->is_decoded = 0; + break; + } - if (s->max_ra == INT_MAX) { - if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { - s->max_ra = s->poc; -@@ -2894,10 +4654,18 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) ++ + if (s->sh.first_slice_in_pic_flag) { + if (s->max_ra == INT_MAX) { + if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { +@@ -2894,10 +4513,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) } } @@ -6829,16 +8622,17 @@ index ef21595..b36e840 100644 +#endif ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } else if (s->ref) { ++ } +#if RPI_INTER ++ else if (s->ref && s->enable_rpi) { + // When running single threaded we need to flush the whole frame + flush_frame(s,s->frame); -+#endif + } ++#endif return ret; } -@@ -3150,6 +4918,41 @@ fail: +@@ -3150,6 +4778,48 @@ fail: return AVERROR(ENOMEM); } @@ -6875,12 +8669,19 @@ index ef21595..b36e840 100644 + s->worker_head=0; + s->kill_worker=0; +} ++ ++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) ++{ ++ av_freep(&ipe->q); ++ gpu_free(&ipe->gptr); ++} ++ +#endif + static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; -@@ -3161,6 +4964,33 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3161,6 +4831,27 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); @@ -6891,44 +8692,27 @@ index ef21595..b36e840 100644 +#endif + + 
for(i=0;iunif_mv_cmds_y[i]); -+ av_freep(&s->unif_mv_cmds_c[i]); -+ av_freep(&s->univ_pred_cmds[i]); ++ ++ av_freep(&s->univ_pred_cmds[i]); + +#if RPI_INTER -+ if (s->unif_mvs[i]) { -+ gpu_free( &s->unif_mvs_ptr[i] ); -+ s->unif_mvs[i] = 0; -+ } -+ if (s->y_unif_mvs[i]) { -+ gpu_free( &s->y_unif_mvs_ptr[i] ); -+ s->y_unif_mvs[i] = 0; -+ } ++ rpi_free_inter_pred(&s->jobs[i].chroma_ip); ++ rpi_free_inter_pred(&s->jobs[i].luma_ip); +#endif + } + + vpu_qpu_term(); + ++ av_rpi_zc_uninit(avctx); +#endif + for (i = 0; i < 3; i++) { av_freep(&s->sao_pixel_buffer_h[i]); av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3202,10 +5032,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3202,10 +4893,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } -+#ifdef RPI -+#ifdef RPI_PRECLEAR -+static av_cold void memclear16(int16_t *p, int n) -+{ -+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1); -+ //int i; -+ //for(i=0;iavctx = avctx; -@@ -3215,6 +5060,82 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3215,6 +4910,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; @@ -6949,65 +8733,42 @@ index ef21595..b36e840 100644 + // many times as we have threads (init_thread_copy is called for the + // threads). So to match init & term put the init here where it will be + // called by both init & copy ++ av_rpi_zc_init(avctx); ++ + if (vpu_qpu_init() != 0) + goto fail; + + for(job = 0; job < RPI_MAX_JOBS; job++) { -+ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); -+ if (!s->unif_mv_cmds_y[job]) -+ goto fail; -+ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); -+ if (!s->unif_mv_cmds_c[job]) -+ goto fail; + s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); + if (!s->univ_pred_cmds[job]) + goto fail; + } + +#if RPI_INTER -+ // We divide the image into blocks 256 wide and 64 high -+ // We support up to 2048 widths -+ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted -+ // Also add space for the startup command for each stream. 
+ + for (job = 0; job < RPI_MAX_JOBS; job++) { -+ uint32_t *p; -+#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); -+#else -+ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); -+#endif -+ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm; ++ HEVCRpiJob * const jb = s->jobs + job; ++ // ** Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too samll) + -+ // Set up initial locations for uniform streams -+ p = s->unif_mvs[job]; -+ for(i = 0; i < QPU_N_UV; i++) { -+ s->mvs_base[job][i] = p; -+ p += UV_COMMANDS_PER_QPU; -+ } ++ rpi_alloc_inter_pred(&jb->chroma_ip, ++ QPU_N_UV, QPU_N_GRP_UV, ++ UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), ++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t), ++ inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu); ++ rpi_alloc_inter_pred(&jb->luma_ip, ++ QPU_N_Y, QPU_N_GRP_Y, ++ Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), ++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t), ++ inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu); + } ++ + s->qpu_filter_uv = qpu_fn(mc_filter_uv); + s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); -+ s->qpu_filter_uv_b = qpu_fn(mc_filter_uv_b); -+ -+ for (job=0; job < RPI_MAX_JOBS; job++) -+ { -+ uint32_t *p; -+#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); -+#else -+ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); -+#endif -+ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->y_unif_mvs[job]; -+ for(i = 0; i < QPU_N_Y; i++) { -+ s->y_mvs_base[job][i] = p; -+ p += Y_COMMANDS_PER_QPU; -+ } -+ } ++ s->qpu_dummy_frame = qpu_fn(mc_start); // Use our code as a dummy frame + s->qpu_filter = qpu_fn(mc_filter); ++ s->qpu_filter_y_p00 = qpu_fn(mc_filter_y_p00); ++ s->qpu_filter_y_b00 = qpu_fn(mc_filter_y_b00); + s->qpu_filter_b = qpu_fn(mc_filter_b); +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); @@ -7023,7 +8784,7 @@ index ef21595..b36e840 100644 s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; -@@ -3357,9 +5278,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) +@@ -3357,9 +5105,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) } if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) @@ -7036,7 +8797,7 @@ index ef21595..b36e840 100644 return 0; } -@@ -3418,6 +5339,8 @@ AVCodec ff_hevc_decoder = { +@@ -3418,6 +5166,8 @@ AVCodec ff_hevc_decoder = { .update_thread_context = hevc_update_thread_context, .init_thread_copy = hevc_init_thread_copy, .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | @@ -7045,10 +8806,10 @@ index ef21595..b36e840 100644 AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE, .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), -diff --git b/libavcodec/hevcdec.h a/libavcodec/hevcdec.h -index 0c78812..e068936 100644 ---- b/libavcodec/hevcdec.h -+++ a/libavcodec/hevcdec.h +diff --git a/libavcodec/hevcdec.h b/libavcodec/hevcdec.h +index 0c78812..c268d39 100644 +--- a/libavcodec/hevcdec.h ++++ b/libavcodec/hevcdec.h @@ -334,17 +334,6 @@ typedef struct CodingUnit { uint8_t cu_transquant_bypass_flag; } CodingUnit; @@ -7102,15 +8863,13 @@ index 0c78812..e068936 100644 #define 
BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -464,6 +460,89 @@ typedef struct HEVCLocalContext { +@@ -464,6 +460,149 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; +#ifdef RPI + +// The processing is done in chunks -+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) -+// This is a distance of 1536 pixels across the screen +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +// but allocate more memory and increase the latency before data in the next frame can be processed +#define RPI_NUM_CHUNKS 4 @@ -7133,9 +8892,6 @@ index 0c78812..e068936 100644 +#define RPI_CMD_CHROMA_BI 3 +#define RPI_CMD_V_BI 4 + -+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed? -+// #define RPI_PRECLEAR -+ +// Command for inter prediction +typedef struct HEVCMvCmd { + uint8_t cmd; @@ -7158,9 +8914,16 @@ index 0c78812..e068936 100644 + + +// Command for intra prediction and transform_add of predictions to coefficients -+#define RPI_PRED_TRANSFORM_ADD 0 -+#define RPI_PRED_INTRA 1 -+#define RPI_PRED_I_PCM 2 ++enum rpi_pred_cmd_e ++{ ++ RPI_PRED_ADD_RESIDUAL, ++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_INTRA, ++ RPI_PRED_I_PCM, ++ RPI_PRED_CMD_MAX ++}; + +typedef struct HEVCPredCmd { + uint8_t type; @@ -7188,11 +8951,69 @@ index 0c78812..e068936 100644 +} HEVCPredCmd; + +#endif ++ ++#ifdef RPI ++ ++union qpu_mc_pred_cmd_s; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; ++ ++typedef struct HEVCRpiInterPredQ ++{ ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; ++ unsigned int load; ++ uint32_t code_setup; ++ uint32_t code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; ++ ++typedef struct HEVCRpiInterPredEnv ++{ ++ HEVCRpiInterPredQ * q; ++ unsigned int n; // Number of Qs ++ unsigned int n_grp; // Number of Q in a group ++ unsigned int curr; // Current Q number (0..n-1) ++ int used; // 0 if nothing in any Q, 1 otherwise ++ int used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ GPU_MEM_PTR_T gptr; ++ unsigned int q1_size; // size of 1 uniform Q ++} HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiJob { ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++} HEVCRpiJob; ++ ++#if RPI_TSTATS ++typedef struct HEVCRpiStats { ++ int y_pred1_y8_merge; ++ int y_pred1_xy; ++ int y_pred1_x0; ++ int y_pred1_y0; ++ int y_pred1_x0y0; ++ int y_pred1_wle8; ++ int y_pred1_wgt8; ++ int y_pred1_hle16; ++ int y_pred1_hgt16; ++ int y_pred2_xy; ++ int y_pred2_x0; ++ int y_pred2_y0; ++ int y_pred2_x0y0; ++ int y_pred2_hle16; ++ int y_pred2_hgt16; ++} HEVCRpiStats; ++#endif ++ ++#endif + typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -472,6 +551,9 @@ typedef struct HEVCContext { +@@ -472,6 +611,9 @@ typedef struct HEVCContext { HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; HEVCLocalContext *HEVClc; @@ -7202,15 +9023,13 @@ index 0c78812..e068936 100644 uint8_t threads_type; uint8_t threads_number; -@@ -479,6 +561,98 @@ typedef struct HEVCContext { +@@ -479,6 +621,90 @@ typedef struct HEVCContext { int width; int height; + int used_for_ref; // rpi +#ifdef RPI + int enable_rpi; -+ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; -+ HEVCMvCmd 
*unif_mv_cmds_c[RPI_MAX_JOBS]; + HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; + int buf_width; + GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; @@ -7231,28 +9050,22 @@ index 0c78812..e068936 100644 + int ctu_per_y_chan; // Number of CTUs per luma QPU + int ctu_per_uv_chan; // Number of CTUs per chroma QPU + ++ HEVCRpiJob jobs[RPI_MAX_JOBS]; ++#if RPI_TSTATS ++ HEVCRpiStats tstats; ++#endif +#if RPI_INTER -+ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands ++ struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; + -+ // _base pointers are to the start of the row -+ uint32_t *mvs_base[RPI_MAX_JOBS][QPU_N_UV]; -+ // these pointers are to the next free space -+ uint32_t *u_mvs[RPI_MAX_JOBS][QPU_N_UV]; -+ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma + // Function pointers + uint32_t qpu_filter_uv; + uint32_t qpu_filter_uv_b0; -+ uint32_t qpu_filter_uv_b; -+ -+ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands -+ uint32_t *y_mvs_base[RPI_MAX_JOBS][QPU_N_Y]; -+ uint32_t *y_mvs[RPI_MAX_JOBS][QPU_N_Y]; -+ uint32_t *curr_y_mvs; // Current uniform stream for luma -+ // Function pointers ++ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory + uint32_t qpu_filter; + uint32_t qpu_filter_b; ++ uint32_t qpu_filter_y_p00; ++ uint32_t qpu_filter_y_b00; +#endif + +#ifdef RPI_WORKER @@ -7301,7 +9114,7 @@ index 0c78812..e068936 100644 uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -596,6 +770,9 @@ typedef struct HEVCContext { +@@ -596,6 +822,9 @@ typedef struct HEVCContext { uint32_t max_mastering_luminance; uint32_t min_mastering_luminance; @@ -7311,7 +9124,7 @@ index 0c78812..e068936 100644 } HEVCContext; int ff_hevc_decode_nal_sei(HEVCContext *s); -@@ -703,6 +880,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -703,6 +932,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); @@ -7323,19 +9136,26 @@ index 0c78812..e068936 100644 /** * Reset SEI values that are stored on the Context. * e.g. 
Caption data that was extracted during NAL -@@ -716,4 +898,8 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; +@@ -716,4 +950,15 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; extern const uint8_t ff_hevc_qpel_extra_after[4]; extern const uint8_t ff_hevc_qpel_extra[4]; +#ifdef RPI +int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n); ++ ++// arm/hevc_misc_neon.S ++// Neon coeff zap fn ++#if HAVE_NEON ++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); ++#endif ++ +#endif + #endif /* AVCODEC_HEVCDEC_H */ -diff --git b/libavcodec/hevcdsp.c a/libavcodec/hevcdsp.c -index 23e923f..a985f02 100644 ---- b/libavcodec/hevcdsp.c -+++ a/libavcodec/hevcdsp.c +diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c +index 23e923f..c4f1a6c 100644 +--- a/libavcodec/hevcdsp.c ++++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { #include "hevcdsp_template.c" #undef BIT_DEPTH @@ -7457,7 +9277,74 @@ index 23e923f..a985f02 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -257,6 +371,8 @@ int i = 0; +@@ -193,12 +307,38 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ + PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) + ++#if !RPI_HEVC_SAND ++#define SLICED_LOOP_FILTERS(depth) ++#define SLICED_ADD_RESIDUAL(depth) ++#else ++#define SLICED_ADD_RESIDUAL(depth)\ ++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ ++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ ++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ ++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ ++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ ++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ ++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ ++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#endif ++ ++ + #define HEVC_DSP(depth) \ + hevcdsp->put_pcm = FUNC(put_pcm, depth); \ + hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ + hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ + hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ + hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ ++ SLICED_ADD_RESIDUAL(depth); \ + hevcdsp->dequant = FUNC(dequant, depth); \ + hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ + hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ +@@ -225,6 +365,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ + hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ + \ ++ hevcdsp->sao_band_filter_c[0] = \ ++ hevcdsp->sao_band_filter_c[1] = \ ++ hevcdsp->sao_band_filter_c[2] = \ ++ hevcdsp->sao_band_filter_c[3] 
= \ ++ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[0] = \ ++ hevcdsp->sao_edge_filter_c[1] = \ ++ hevcdsp->sao_edge_filter_c[2] = \ ++ hevcdsp->sao_edge_filter_c[3] = \ ++ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ ++ \ + QPEL_FUNCS(depth); \ + QPEL_UNI_FUNCS(depth); \ + QPEL_BI_FUNCS(depth); \ +@@ -232,6 +385,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + EPEL_UNI_FUNCS(depth); \ + EPEL_BI_FUNCS(depth); \ + \ ++ SLICED_LOOP_FILTERS(depth); \ + hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ + hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ + hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ +@@ -257,6 +411,8 @@ int i = 0; break; } @@ -7466,11 +9353,19 @@ index 23e923f..a985f02 100644 if (ARCH_X86) ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) -diff --git b/libavcodec/hevcdsp.h a/libavcodec/hevcdsp.h -index eefb3cd..a41aa09 100644 ---- b/libavcodec/hevcdsp.h -+++ a/libavcodec/hevcdsp.h -@@ -42,6 +42,17 @@ typedef struct SAOParams { +diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h +index eefb3cd..9e44e7f 100644 +--- a/libavcodec/hevcdsp.h ++++ b/libavcodec/hevcdsp.h +@@ -25,6 +25,7 @@ + #ifndef AVCODEC_HEVCDSP_H + #define AVCODEC_HEVCDSP_H + ++#include "hevc.h" + #include "get_bits.h" + + #define MAX_PB_SIZE 64 +@@ -42,11 +43,30 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -7488,21 +9383,795 @@ index eefb3cd..a41aa09 100644 typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int pcm_bit_depth); -@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext { + + void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++#if RPI_HEVC_SAND ++ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ ++ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, ++ struct GetBitContext *gb, int pcm_bit_depth); ++#endif + + void (*dequant)(int16_t *coeffs, int16_t log2_size); + +@@ -60,14 +80,23 @@ typedef struct HEVCDSPContext { + + void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); + + /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ + void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int sao_eo_class, int width, int height); ++ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); + + void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t 
_stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, ++ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); + + void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width); +@@ -120,6 +149,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); ++#ifdef RPI ++ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++#endif ++ + void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, + int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, + MvField *curr, MvField *neigh, uint8_t *bs); } HEVCDSPContext; void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); -diff --git b/libavcodec/hevcpred_template.c a/libavcodec/hevcpred_template.c -index 6ae87cc..28d2653 100644 ---- b/libavcodec/hevcpred_template.c -+++ a/libavcodec/hevcpred_template.c -@@ -20,6 +20,8 @@ +diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c +index 25f1a81..d475b3d 100644 +--- a/libavcodec/hevcdsp_template.c ++++ b/libavcodec/hevcdsp_template.c +@@ -26,6 +26,10 @@ + #include "bit_depth_template.c" + #include "hevcdsp.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ + static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) + { +@@ -41,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height + } + } + ++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, ++ GetBitContext *gb, int pcm_bit_depth) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); ++ dst += stride; ++ } ++ ++ dst = (pixel *)_dst + 1; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); ++ dst += stride; ++ } ++} ++ ++ + static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, + ptrdiff_t stride, int size) + { +@@ -58,6 +85,44 @@ static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, + } + } + ++#if RPI_HEVC_SAND ++static av_always_inline void FUNC(add_residual_u_v)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + *res); ++ res++; ++ } ++ dst += stride; ++ } ++} ++ ++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, unsigned int size) ++{ ++ unsigned int x, y; ++ pixel *dst = 
(pixel *)_dst; ++ const int16_t * ru = res; ++ const int16_t * rv = res + size * size; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++); ++ } ++ dst += stride; ++ } ++} ++#endif ++ + static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, + ptrdiff_t stride) + { +@@ -82,6 +147,90 @@ static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res, + FUNC(add_residual)(_dst, res, stride, 32); + } + ++#if RPI_HEVC_SAND ++// -- U -- (plaited) ++ ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- V -- (plaited) ++ ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst + 1, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst + 1, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst + 1, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++#endif ++ ++ + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) + { + int16_t *coeffs = (int16_t *) _coeffs; +@@ -361,7 +510,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; +- int16_t *sao_offset_val = sao->offset_val[c_idx]; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, width = _width, height = _height; + +@@ -370,33 +518,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { +- int offset_val = sao_offset_val[0]; + for (y = 0; y < height; y++) { +- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); ++ dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { +- int offset_val = sao_offset_val[0]; + int offset = width 
- 1; + for (x = 0; x < height; x++) { +- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { +- int offset_val = sao_offset_val[0]; + for (x = init_x; x < width; x++) +- dst[x] = av_clip_pixel(src[x] + offset_val); ++ dst[x] = src[x]; + } + if (borders[3]) { +- int offset_val = sao_offset_val[0]; + ptrdiff_t y_stride_dst = stride_dst * (height - 1); + ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) +- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); ++ dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +@@ -411,7 +555,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; +- int16_t *sao_offset_val = sao->offset_val[c_idx]; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, init_y = 0, width = _width, height = _height; + +@@ -420,34 +563,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { +- int offset_val = sao_offset_val[0]; + for (y = 0; y < height; y++) { +- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); ++ dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { +- int offset_val = sao_offset_val[0]; + int offset = width - 1; + for (x = 0; x < height; x++) { +- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { +- int offset_val = sao_offset_val[0]; + for (x = init_x; x < width; x++) +- dst[x] = av_clip_pixel(src[x] + offset_val); ++ dst[x] = src[x]; + init_y = 1; + } + if (borders[3]) { +- int offset_val = sao_offset_val[0]; + ptrdiff_t y_stride_dst = stride_dst * (height - 1); + ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) +- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); ++ dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +@@ -488,6 +627,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } + } + ++ ++// --- Plaited chroma versions ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table_u[32] = { 0 }; ++ int offset_table_v[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ width *= 2; ++ ++ for (k = 0; k < 4; k++) ++ { ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ } ++ for (y 
= 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) ++ { ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++#endif ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++ ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ width *= 2; ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int offset_valu = edge_idx[2 + diff0u + diff1u]; ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ int offset_valv = edge_idx[2 + diff0v + diff1v]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++#endif ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++// Any old 2 byte 'normal' restore will work for these ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#endif ++ ++ + #undef CMP + + //////////////////////////////////////////////////////////////////////////////// +@@ -1690,3 +1950,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, + #undef TQ1 + #undef TQ2 + #undef TQ3 ++ ++#ifdef RPI ++ ++// line zero ++#define P3 pix_l[0 * xstride] ++#define P2 pix_l[1 * xstride] ++#define P1 pix_l[2 * xstride] ++#define P0 pix_l[3 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++#define Q2 pix_r[2 * xstride] ++#define Q3 pix_r[3 * xstride] ++ ++// line three. 
used only for deblocking decision ++#define TP3 pix_l[0 * xstride + 3 * ystride] ++#define TP2 pix_l[1 * xstride + 3 * ystride] ++#define TP1 pix_l[2 * xstride + 3 * ystride] ++#define TP0 pix_l[3 * xstride + 3 * ystride] ++#define TQ0 pix_r[0 * xstride + 3 * ystride] ++#define TQ1 pix_r[1 * xstride + 3 * ystride] ++#define TQ2 pix_r[2 * xstride + 3 * ystride] ++#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ ++// This is identical to hevc_loop_filter_luma except that the P/Q ++// components are on separate pointers ++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t _tc[2], ++ const uint8_t _no_p[2], const uint8_t _no_q[2], ++ uint8_t * _pix_l) ++{ ++ int d, j; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ const ptrdiff_t xstride = 1; ++ const ptrdiff_t ystride = _stride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; ++ ++ if (d0 + d3 >= beta) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++ } ++ } ++} ++ ++#undef TP3 ++#undef TP2 
++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#define P1 pix_l[0 * xstride] ++#define P0 pix_l[1 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++ ++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, const int32_t *_tc, ++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); ++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); ++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); ++} ++ ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++ ++ ++#endif ++ +diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c +index 7a86ed3..7d32c4a 100644 +--- a/libavcodec/hevcpred.c ++++ b/libavcodec/hevcpred.c +@@ -24,6 +24,7 @@ + + #include "hevcpred.h" + ++#define PRED_C 0 + #define BIT_DEPTH 8 + #include "hevcpred_template.c" + #undef BIT_DEPTH +@@ -39,13 +40,37 @@ + #define BIT_DEPTH 12 + #include "hevcpred_template.c" + #undef BIT_DEPTH ++#undef PRED_C ++ ++#ifdef RPI ++#define PRED_C 1 ++#define BIT_DEPTH 8 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++#endif + + void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) + { + #undef FUNC + #define FUNC(a, depth) a ## _ ## depth + +-#define HEVC_PRED(depth) \ ++#undef FUNCC ++#define FUNCC(a, depth) a ## _ ## depth ## _c ++ ++#define HEVC_PRED_Y(depth) \ + hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ + hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ + hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ +@@ -60,6 +85,30 @@ void 
ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) + hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ + hpc->pred_angular[3] = FUNC(pred_angular_3, depth); + ++#define HEVC_PRED_C(depth) \ ++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ ++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ ++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ ++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ ++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ ++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ ++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ ++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ ++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ ++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); ++ ++#ifdef RPI ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); \ ++ HEVC_PRED_C(depth); ++#else ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); ++#endif ++ + switch (bit_depth) { + case 9: + HEVC_PRED(9); +diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h +index eb17663..00ba3f9 100644 +--- a/libavcodec/hevcpred.h ++++ b/libavcodec/hevcpred.h +@@ -38,6 +38,17 @@ typedef struct HEVCPredContext { + void (*pred_angular[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx, int mode); ++#ifdef RPI ++ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx); ++ ++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int c_idx, int mode); ++#endif + } HEVCPredContext; + + void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); +diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c +index 6ae87cc..c14dddd 100644 +--- a/libavcodec/hevcpred_template.c ++++ b/libavcodec/hevcpred_template.c +@@ -20,13 +20,55 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -7511,7 +10180,54 @@ index 6ae87cc..28d2653 100644 #include "libavutil/pixdesc.h" #include "bit_depth_template.c" -@@ -69,8 +71,11 @@ do { \ + #include "hevcpred.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ ++#define DUMP_PRED 0 ++ + #define POS(x, y) src[(x) + stride * (y)] + ++#if PRED_C ++ ++typedef uint8_t (* c8_dst_ptr_t)[2]; ++typedef const uint8_t (* c8_src_ptr_t)[2]; ++ ++#if BIT_DEPTH == 8 ++#undef BIT_DEPTH ++#define BIT_DEPTH 16 ++#include "bit_depth_template.c" ++#undef FUNC ++#define FUNC(a) FUNC3(a, 8, _c) ++#else ++#undef FUNC ++#define FUNC FUNCC ++#endif ++ ++#endif ++ ++#if DUMP_PRED ++#ifndef DEBUG_ONCE ++#define DEBUG_ONCE ++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) ++{ ++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { ++ for (unsigned int x = 0; x != size; x++) { ++ printf("%4d", data[x * 2]); ++ } ++ printf("\n"); ++ } ++ printf("\n"); ++} ++#endif ++#endif ++ + static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, + int log2_size, int c_idx) + { +@@ -69,8 +111,11 @@ do { \ AV_WN4P(&ptr[i], a); \ else \ a = PIXEL_SPLAT_X4(ptr[i + 3]) @@ -7524,54 +10240,403 @@ index 6ae87cc..28d2653 100644 int 
i; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; -@@ -114,6 +119,10 @@ do { \ +@@ -79,15 +124,23 @@ do { \ + int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; + int size_in_luma_v = size << vshift; + int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; +- int x = x0 >> hshift; +- int y = y0 >> vshift; ++ const int x = x0 >> hshift; ++ const int y = y0 >> vshift; + int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + + int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); + +- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); ++ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); ++#if defined(RPI) ++ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? ++ (pixel*)s->frame->data[c_idx] + x + y * stride : ++ c_idx == 0 ? ++ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : ++ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++#else + pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; ++#endif + + int min_pu_width = s->ps.sps->min_pu_width; + +@@ -95,14 +148,20 @@ do { \ + lc->tu.intra_pred_mode; + pixel4 a; + pixel left_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C + pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; ++#endif + pixel top_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C + pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; ++#endif + + pixel *left = left_array + 1; + pixel *top = top_array + 1; ++#if !PRED_C + pixel *filtered_left = filtered_left_array + 1; + pixel *filtered_top = filtered_top_array + 1; ++#endif + int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); + int cand_left = lc->na.cand_left; + int cand_up_left = lc->na.cand_up_left; +@@ -114,6 +173,26 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; ++ pixel * src_l = src - 1; ++ pixel * src_u = src - stride; ++ pixel * src_ur = src_u + size; ++ +#ifdef DISABLE_INTRA + return; +#endif ++ ++#if defined(RPI) ++ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ const AVFrame * const frame = s->frame; ++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 ++ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ if ((x & mask) == 0) ++ src_l -= stripe_adj; ++ if (((x + size) & mask) == 0) ++ src_ur += stripe_adj; ++ } ++#endif + if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); -diff --git b/libavcodec/mjpegenc_common.c a/libavcodec/mjpegenc_common.c -index 6d9c982..83a9e95 100644 ---- b/libavcodec/mjpegenc_common.c -+++ a/libavcodec/mjpegenc_common.c -@@ -91,17 +91,13 @@ static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p, - { - int i, j, size; - uint8_t *ptr; -- MpegEncContext *s = NULL; +@@ -163,23 +242,24 @@ do { \ + top[-1] = 128; + } + if (cand_up_left) { +- left[-1] = POS(-1, -1); ++ left[-1] = src_l[-stride]; + top[-1] = left[-1]; + } + if (cand_up) +- memcpy(top, src - stride, size * sizeof(pixel)); ++ // Always good - even with sand ++ memcpy(top, src_u, size * sizeof(pixel)); + if (cand_up_right) { +- memcpy(top + size, src - stride + size, size * sizeof(pixel)); +- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), ++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + 
top_right_size, top[size + top_right_size - 1], + size - top_right_size); + } + if (cand_left) + for (i = 0; i < size; i++) +- left[i] = POS(-1, i); ++ left[i] = src_l[stride * i]; + if (cand_bottom_left) { + for (i = size; i < size + bottom_left_size; i++) +- left[i] = POS(-1, i); +- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), ++ left[i] = src_l[stride * i]; ++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], + size - bottom_left_size); + } + +@@ -268,7 +348,11 @@ do { \ + cand_up_left = 1; + cand_left = 1; + } else { // No samples available ++#if PRED_C && BIT_DEPTH == 16 ++ left[-1] = 0x8080; ++#else + left[-1] = (1 << (BIT_DEPTH - 1)); ++#endif + EXTEND(top, left[-1], 2 * size); + EXTEND(left, left[-1], 2 * size); + } +@@ -287,6 +371,9 @@ do { \ + top[-1] = left[-1]; + + // Filtering process ++ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // worry about chroma smoothing for that case ++#if !PRED_C + if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { + if (mode != INTRA_DC && size != 4){ + int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; +@@ -342,13 +429,46 @@ do { \ + mode); + break; + } ++#else ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, log2_size, c_idx); ++ break; ++ default: ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, c_idx, ++ mode); ++ break; ++ } ++ ++#if DUMP_PRED ++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); ++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); ++#endif ++#endif + } + ++#if !PRED_C || BIT_DEPTH == 16 + #define INTRA_PRED(size) \ + static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ + { \ + FUNC(intra_pred)(s, x0, y0, size, c_idx); \ + } ++#else ++#define INTRA_PRED(size) \ ++static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ ++{ \ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#endif + + INTRA_PRED(2) + INTRA_PRED(3) +@@ -357,6 +477,7 @@ INTRA_PRED(5) + + #undef INTRA_PRED + ++#if !PRED_C + static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, ptrdiff_t stride, + int trafo_size) +@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to + POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); + } ++#else ++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, ++ const uint8_t * _left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ int size = 1 << trafo_size; ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ const c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + ++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); ++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + ++ (size - 1 - y) * top[x][1] + (y 
+ 1) * left[size][1] + size) >> (trafo_size + 1); ++ } ++ } ++} ++#endif + ++#if !PRED_C || BIT_DEPTH == 16 + #define PRED_PLANAR(size)\ + static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride) \ + { \ + FUNC(pred_planar)(src, top, left, stride, size + 2); \ + } ++#else ++#define PRED_PLANAR(size)\ ++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ ++ abort(); \ ++} ++#endif + + PRED_PLANAR(0) + PRED_PLANAR(1) +@@ -386,6 +540,7 @@ PRED_PLANAR(3) + + #undef PRED_PLANAR + ++#if !PRED_C + static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, + ptrdiff_t stride, int log2_size, int c_idx) +@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + POS(0, y) = (left[y] + 3 * dc + 2) >> 2; + } + } ++#else ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size, int c_idx) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ const c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ unsigned int dc0 = size; ++ unsigned int dc1 = size; ++ ++ for (i = 0; i < size; i++) ++ { ++ dc0 += left[i][0] + top[i][0]; ++ dc1 += left[i][1] + top[i][1]; ++ } ++ ++ dc0 >>= log2_size + 1; ++ dc1 >>= log2_size + 1; ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = dc0; ++ src[j][1] = dc1; + ++ } ++ } ++} ++#endif ++ ++#ifndef ANGLE_CONSTS ++#define ANGLE_CONSTS ++static const int intra_pred_angle[] = { ++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++}; ++static const int inv_angle[] = { ++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++ -630, -910, -1638, -4096 ++}; ++#endif ++ ++#if !PRED_C + static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, +@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const pixel *top = (const pixel *)_top; + const pixel *left = (const pixel *)_left; + +- static const int intra_pred_angle[] = { +- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +- }; +- static const int inv_angle[] = { +- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +- -630, -910, -1638, -4096 +- }; - -- /* Since avctx->priv_data will point to LJpegEncContext in this case */ -- if (avctx->codec_id != AV_CODEC_ID_LJPEG) -- s = avctx->priv_data; -+ MpegEncContext *s = avctx->priv_data; + int angle = intra_pred_angle[mode - 2]; + pixel ref_array[3 * MAX_TB_SIZE + 4]; + pixel *ref_tmp = ref_array + size; +@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + } + } + } ++#else ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int c_idx, ++ int mode, int size) ++{ ++ int x, y; ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ ++ const int angle = intra_pred_angle[mode - 2]; ++ uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c8_dst_ptr_t ref_tmp = ref_array + 
size; ++ c8_src_ptr_t ref; ++ const int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c8_src_ptr_t)ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++, src += stride) { ++ const int idx = ((y + 1) * angle) >> 5; ++ const int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; ++x) { ++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + ++ fact * ref[x + idx + 2][0] + 16) >> 5; ++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + ++ fact * ref[x + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ memcpy(src, ref + idx + 1, size * 2); ++ } ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c8_src_ptr_t)ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++, src++) { ++ const int idx = ((x + 1) * angle) >> 5; ++ const int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + ++ fact * ref[y + idx + 2][0] + 16) >> 5; ++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + ++ fact * ref[y + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ { ++ src[y * stride][0] = ref[y + idx + 1][0]; ++ src[y * stride][1] = ref[y + idx + 1][1]; ++ } ++ } ++ } ++ } ++} ++#endif - if (avctx->codec_id != AV_CODEC_ID_LJPEG) { - int matrix_count = 1 + !!memcmp(luma_intra_matrix, - chroma_intra_matrix, - sizeof(luma_intra_matrix[0]) * 64); -- if (s && s->force_duplicated_matrix) -+ if (s->force_duplicated_matrix) - matrix_count = 2; - /* quant matrixes */ - put_marker(p, DQT); -@@ -138,7 +134,7 @@ static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p, - - // Only MJPEG can have a variable Huffman variable. All other - // formats use the default Huffman table. 
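/* NB (annotation, not patch content): the mjpegenc_common.c hunks in this
 * region are being dropped from the downstream patch. The guard they had
 * replaced exists because avctx->priv_data is only an MpegEncContext for the
 * MPEG-based encoders; for AV_CODEC_ID_LJPEG it points at an LJpegEncContext,
 * so upstream leaves s as NULL in that case and every later use of s must be
 * NULL-checked, while the downstream variant keyed the optimal-Huffman test
 * off s->out_format instead. A sketch of the upstream pattern, reconstructed
 * from the hunk itself:
 *
 *     MpegEncContext *s = NULL;
 *     if (avctx->codec_id != AV_CODEC_ID_LJPEG)
 *         s = avctx->priv_data;            // safe: not an LJpegEncContext
 *     if (s && s->huffman == HUFFMAN_TABLE_OPTIMAL)
 *         ...                              // MJPEG-only table emission
 */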
-- if (s && s->huffman == HUFFMAN_TABLE_OPTIMAL) { -+ if (s->out_format == FMT_MJPEG && s->huffman == HUFFMAN_TABLE_OPTIMAL) { - size += put_huffman_table(p, 0, 0, s->mjpeg_ctx->bits_dc_luminance, - s->mjpeg_ctx->val_dc_luminance); - size += put_huffman_table(p, 0, 1, s->mjpeg_ctx->bits_dc_chrominance, -diff --git b/libavcodec/mmaldec.c a/libavcodec/mmaldec.c + static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, + const uint8_t *left, +diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c index 81fcebc..7858478 100644 ---- b/libavcodec/mmaldec.c -+++ a/libavcodec/mmaldec.c +--- a/libavcodec/mmaldec.c ++++ b/libavcodec/mmaldec.c @@ -24,6 +24,9 @@ * MMAL Video Decoder */ @@ -7590,11 +10655,11 @@ index 81fcebc..7858478 100644 #include #include "avcodec.h" -diff --git b/libavcodec/mpeg4videodec.c a/libavcodec/mpeg4videodec.c -index 791a07b..502c21f 100644 ---- b/libavcodec/mpeg4videodec.c -+++ a/libavcodec/mpeg4videodec.c -@@ -2249,6 +2249,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c +index 54b7be1..894dcdc 100644 +--- a/libavcodec/mpeg4videodec.c ++++ b/libavcodec/mpeg4videodec.c +@@ -2247,6 +2247,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) if (ctx->divx_version >= 0) s->workaround_bugs |= FF_BUG_HPEL_CHROMA; @@ -7604,7 +10669,7 @@ index 791a07b..502c21f 100644 } if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2273,6 +2276,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +@@ -2271,6 +2274,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : ""); @@ -7612,25 +10677,154 @@ index 791a07b..502c21f 100644 if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && s->codec_id == AV_CODEC_ID_MPEG4 && avctx->idct_algo == FF_IDCT_AUTO) { -diff --git b/libavcodec/mpegvideo_enc.c a/libavcodec/mpegvideo_enc.c -index 882cf09..71a858f 100644 ---- b/libavcodec/mpegvideo_enc.c -+++ a/libavcodec/mpegvideo_enc.c -@@ -399,9 +399,6 @@ FF_ENABLE_DEPRECATION_WARNINGS - return AVERROR(EINVAL); - } +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index 7146e3a..240b274 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -273,6 +273,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { + { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, + { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, -- if (s->huffman && avctx->codec_id == AV_CODEC_ID_AMV) -- s->huffman = 0; -- - if (s->intra_dc_precision > (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO ? 
3 : 0)) { - av_log(avctx, AV_LOG_ERROR, "intra dc precision too large\n"); - return AVERROR(EINVAL); -diff --git b/libavcodec/rpi_hevc_transform.h a/libavcodec/rpi_hevc_transform.h ++ /* RPI */ ++#ifdef RPI ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++#endif ++ + /* special */ + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index d181b74..84f8e8c 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -31,6 +31,7 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -49,6 +50,101 @@ FF_ENABLE_DEPRECATION_WARNINGS + return 0; + } + ++// x0 & width in luma units (so chroma * 2) ++// x0 odd for v ++static uint8_t * sand_copy_line_u(uint8_t * dst, const uint8_t * src, ++ unsigned int x0, const unsigned int width, ++ const unsigned int stride1, const unsigned int stride2) ++{ ++ unsigned int xend; ++ ++ // Skip any empty slices ++ src += (x0 & ~(stride1 - 1)) * stride2; ++ x0 &= (stride1 - 1); ++ ++ xend = x0 + width; ++ for (unsigned int x = 0; x < xend; x += stride1) ++ { ++ const unsigned int w = FFMIN(stride1, xend - x) - x0; ++ for (unsigned int i = 0; i < w; i += 2) ++ *dst++ = src[x0 + i]; ++ src += stride1 * stride2; ++ x0 &= 1; ++ } ++ ++ return dst; ++} ++ ++static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int width, const unsigned int height) ++{ ++ for (unsigned int y = y0; y < height + y0; ++y) { ++ dst = sand_copy_line_u(dst, frame->data[1] + y * frame->linesize[1], x0, width, frame->linesize[1], frame->linesize[3]); ++ } ++ return dst; ++} ++ ++static uint8_t * sand_copy_line_y(uint8_t * dst, const uint8_t * src, ++ unsigned int x0, const unsigned int width, ++ const unsigned int stride1, const unsigned int stride2) ++{ ++ unsigned int xend; ++ ++ // Skip any empty slices ++ src += (x0 & ~(stride1 - 1)) * stride2; ++ x0 &= (stride1 - 1); ++ ++ xend = x0 + width; ++ for (unsigned int x = 0; x < xend; x += stride1) ++ { ++ const unsigned int w = FFMIN(stride1, xend - x) - x0; ++ memcpy(dst, src + x0, w); ++ dst += w; ++ src += stride1 * stride2; ++ x0 = 0; ++ } ++ return dst; ++} ++ ++static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; ++ uint8_t * dst; ++ int ret; ++ ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++// printf("PScan: h/w=%d/%d, off=%d,%d\n", pscan->height, pscan->width, pscan->position[0][0], pscan->position[0][0]); ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3 / 2; ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ // Luma is "easy" ++ for (int y = y0; y < height + y0; ++y) { ++ dst = sand_copy_line_y(dst, frame->data[0] + y * frame->linesize[0], x0, width, frame->linesize[0], frame->linesize[3]); ++ } ++ ++ // Chroma is dull ++ dst = cpy_sand_c(dst, frame, x0 & ~1, y0 / 2, width, height / 2); ++ dst = cpy_sand_c(dst, frame, x0 | 1, y0 / 2, width, height / 2); 
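/* NB (annotation, not patch content): the two cpy_sand_c() calls above rely
 * on SAND chroma being stored as interleaved U/V byte pairs (NV12-style)
 * within vertical stripes: frame->linesize[1] is the stripe width in bytes
 * and frame->linesize[3] the stripe height in lines, matching their use in
 * sand_copy_line_u(). Passing an even x (x0 & ~1) therefore gathers the U
 * bytes and an odd x (x0 | 1) the V bytes, each call stepping two bytes at a
 * time to de-interleave one plane of the packed pairs into the planar YUV420
 * packet. The NV12-style pairing is inferred from this even/odd split rather
 * than stated explicitly anywhere in the patch. */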
++ return 0; ++} ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame, int *got_packet) + { +@@ -58,6 +154,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + if (ret < 0) + return ret; + ++ if (frame->format == AV_PIX_FMT_SAND128) { ++ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++ *got_packet = (ret == 0); ++ return ret; ++ } ++ + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) + return ret; + if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, +diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h new file mode 100644 index 0000000..4309f1c --- /dev/null -+++ a/libavcodec/rpi_hevc_transform.h ++++ b/libavcodec/rpi_hevc_transform.h @@ -0,0 +1,3070 @@ +unsigned char rpi_hevc_transform [] = { +21, @@ -10702,11 +13896,11 @@ index 0000000..4309f1c +33, +3, +}; -diff --git b/libavcodec/rpi_hevc_transform.s a/libavcodec/rpi_hevc_transform.s +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s new file mode 100644 index 0000000..5543093 --- /dev/null -+++ a/libavcodec/rpi_hevc_transform.s ++++ b/libavcodec/rpi_hevc_transform.s @@ -0,0 +1,917 @@ +# ****************************************************************************** +# Argon Design Ltd. @@ -11625,12 +14819,12 @@ index 0000000..5543093 + bgt loop_cmds + + pop r6-r7, pc -diff --git b/libavcodec/rpi_mailbox.c a/libavcodec/rpi_mailbox.c +diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..8d8a20d +index 0000000..0255f5d --- /dev/null -+++ a/libavcodec/rpi_mailbox.c -@@ -0,0 +1,118 @@ ++++ b/libavcodec/rpi_mailbox.c +@@ -0,0 +1,149 @@ +/* +Copyright (c) 2012, Broadcom Europe Ltd. +All rights reserved. @@ -11658,6 +14852,8 @@ index 0000000..8d8a20d +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + ++#ifdef RPI ++ +#include +#include +#include @@ -11674,6 +14870,7 @@ index 0000000..8d8a20d +#define DEVICE_FILE_NAME "/dev/vcio" + +#include "rpi_mailbox.h" ++//#include + +/* + * use ioctl to send mbox property message @@ -11733,6 +14930,31 @@ index 0000000..8d8a20d + return p[5]; +} + ++#define GET_VCIMAGE_PARAMS 0x30044 ++ ++int mbox_get_image_params(int fd, VC_IMAGE_T * img) ++{ ++ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32]; ++ uint32_t * p = buf; ++ void * rimg; ++ int rv; ++ ++ *p++ = 0; // size ++ *p++ = 0; // process request ++ *p++ = GET_VCIMAGE_PARAMS; ++ *p++ = sizeof(*img); ++ *p++ = sizeof(*img); ++ rimg = p; ++ memcpy(p, img, sizeof(*img)); ++ p += sizeof(*img) / sizeof(*p); ++ *p++ = 0; // End tag ++ buf[0] = (p - buf) * sizeof(*p); ++ ++ rv = mbox_property(fd, buf); ++ memcpy(img, rimg, sizeof(*img)); ++ ++ return rv; ++} + +int mbox_open() { + int file_desc; @@ -11749,28 +14971,79 @@ index 0000000..8d8a20d +void mbox_close(int file_desc) { + close(file_desc); +} -diff --git b/libavcodec/rpi_mailbox.h a/libavcodec/rpi_mailbox.h ++ ++#endif ++ +diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..b51303b +index 0000000..b316878 --- /dev/null -+++ a/libavcodec/rpi_mailbox.h -@@ -0,0 +1,10 @@ ++++ b/libavcodec/rpi_mailbox.h +@@ -0,0 +1,58 @@ +#ifndef RPI_MAILBOX_H +#define RPI_MAILBOX_H + ++/* The image structure. 
*/ ++typedef struct vc_image_extra_uv_s { ++ void *u, *v; ++ int vpitch; ++} VC_IMAGE_EXTRA_UV_T; ++ ++typedef union { ++ VC_IMAGE_EXTRA_UV_T uv; ++// VC_IMAGE_EXTRA_RGBA_T rgba; ++// VC_IMAGE_EXTRA_PAL_T pal; ++// VC_IMAGE_EXTRA_TF_T tf; ++// VC_IMAGE_EXTRA_BAYER_T bayer; ++// VC_IMAGE_EXTRA_MSBAYER_T msbayer; ++// VC_IMAGE_EXTRA_CODEC_T codec; ++// VC_IMAGE_EXTRA_OPENGL_T opengl; ++} VC_IMAGE_EXTRA_T; ++ ++ ++typedef struct VC_IMAGE_T { ++ unsigned short type; /* should restrict to 16 bits */ ++ unsigned short info; /* format-specific info; zero for VC02 behaviour */ ++ unsigned short width; /* width in pixels */ ++ unsigned short height; /* height in pixels */ ++ int pitch; /* pitch of image_data array in bytes */ ++ int size; /* number of bytes available in image_data array */ ++ void *image_data; /* pixel data */ ++ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */ ++ void *metadata; /* metadata header for the image */ ++ void *pool_object; /* nonNULL if image was allocated from a vc_pool */ ++ int mem_handle; /* the mem handle for relocatable memory storage */ ++ int metadata_size; /* size of metadata of each channel in bytes */ ++ int channel_offset; /* offset of consecutive channels in bytes */ ++ uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */ ++ uint8_t num_channels; /* number of channels (2 for stereo) */ ++ uint8_t current_channel;/* the channel this header is currently pointing to */ ++ uint8_t linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/ ++ uint8_t is_channel_linked; /* Track if the above structure is been used to link the header ++ into a linked-mulitchannel image */ ++ uint8_t channel_index; /* index of the channel this header represents while ++ it is being linked. 
*/ ++ uint8_t _dummy[3]; /* pad struct to 64 bytes */ ++} VC_IMAGE_T; ++ ++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; ++ ++ +extern int mbox_open(void); +extern void mbox_close(int file_desc); + +extern unsigned mbox_mem_lock(int file_desc, unsigned handle); +extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); + ++int mbox_get_image_params(int fd, VC_IMAGE_T * img); ++ +#endif -diff --git b/libavcodec/rpi_qpu.c a/libavcodec/rpi_qpu.c +diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..be58458 +index 0000000..36c8ab6 --- /dev/null -+++ a/libavcodec/rpi_qpu.c -@@ -0,0 +1,827 @@ ++++ b/libavcodec/rpi_qpu.c +@@ -0,0 +1,878 @@ +#ifdef RPI +#include +#include @@ -11784,10 +15057,13 @@ index 0000000..be58458 +#include +#include + ++#include ++ +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" +#include "rpi_hevc_transform.h" ++#include "rpi_zc.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files @@ -11798,6 +15074,10 @@ index 0000000..be58458 +// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) +#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 + ++// Add profile flags to all QPU requests - generates output in "vcdbg log msg" ++// Beware this is expensive and will probably throw off all other timing by >10% ++#define RPI_TRACE_QPU_PROFILE_ALL 0 ++ +// QPU "noflush" flags +// a mixture of flushing & profiling + @@ -11807,26 +15087,13 @@ index 0000000..be58458 +#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling +#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + -+// On Pi2 there is no way to access the VPU L2 cache -+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache) -+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly -+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug. 
-+#define GPU_MEM_FLG 0x4 -+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache) -+#define GPU_MEM_MAP 0x0 -+ +#define vcos_verify_ge0(x) ((x)>=0) + -+/*static const unsigned code[] = -+{ -+ #include "rpi_shader.hex" -+};*/ -+ +// Size in 32bit words +#define QPU_CODE_SIZE 2048 +#define VPU_CODE_SIZE 2048 + -+const short rpi_transMatrix2even[32][16] = { // Even rows first ++static const short rpi_transMatrix2even[32][16] = { // Even rows first +{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, +{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, +{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, @@ -11870,6 +15137,17 @@ index 0000000..be58458 + short transMatrix2even[16*16*2]; +}; + ++#define CFE_ENTS_PER_A 8 ++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices ++// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70 ++// allow 128 ++#define CFE_ENT_COUNT 128 ++#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) ++ ++struct rpi_cache_flush_env_s { ++ unsigned int n; ++ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++}; + +#define WAIT_COUNT_MAX 16 + @@ -11892,7 +15170,6 @@ index 0000000..be58458 +typedef struct vq_wait_s +{ + sem_t sem; -+ unsigned int cost; + struct vq_wait_s * next; +} vq_wait_t; + @@ -11911,7 +15188,6 @@ index 0000000..be58458 + int open_count; + int init_count; + int mb; -+ unsigned int current_load; + GPU_MEM_PTR_T code_gm_ptr; + vq_wait_pool_t wait_pool; +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -12183,6 +15459,18 @@ index 0000000..be58458 + return gpu->mb; +} + ++void gpu_ref(void) ++{ ++ gpu_lock_ref(); ++ gpu_unlock(); ++} ++ ++void gpu_unref(void) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_unlock_unref(ge); ++} ++ +// ---------------------------------------------------------------------------- +// +// Cache flush functions @@ -12190,10 +15478,11 @@ index 0000000..be58458 + +rpi_cache_flush_env_t * rpi_cache_flush_init() +{ -+ rpi_cache_flush_env_t * const rfe = calloc(1, sizeof(rpi_cache_flush_env_t)); ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); + if (rfe == NULL) + return NULL; + ++ rfe->n = 0; + return rfe; +} + @@ -12205,7 +15494,19 @@ index 0000000..be58458 + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) +{ -+ int rc = (rfe->n == 0) ? 
0 : vcsm_clean_invalid(&rfe->a); ++ int rc = 0; ++ unsigned int na; ++ unsigned int nr; ++ ++ // Clear any reamaining ents in the final block ++ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) ++ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ ++ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) ++ { ++ if (vcsm_clean_invalid(rfe->a + na) != 0) ++ rc = -1; ++ } + + free(rfe); + @@ -12218,17 +15519,22 @@ index 0000000..be58458 + +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) +{ -+ av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0])); -+ + // Deal with empty pointer trivially + if (gm == NULL || gm->numbytes == 0) + return; + -+ rfe->a.s[rfe->n].cmd = mode; -+ rfe->a.s[rfe->n].handle = gm->vcsm_handle; -+ rfe->a.s[rfe->n].addr = (unsigned int)gm->arm; -+ rfe->a.s[rfe->n].size = gm->numbytes; -+ ++rfe->n; ++ { ++ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); ++ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ ++ av_assert0(rfe->n < CFE_ENT_COUNT); ++ ++ a->s[n].cmd = mode; ++ a->s[n].handle = gm->vcsm_handle; ++ a->s[n].addr = (unsigned int)gm->arm; ++ a->s[n].size = gm->numbytes; ++ ++rfe->n; ++ } +} + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, @@ -12238,16 +15544,24 @@ index 0000000..be58458 + if (gm == NULL || size == 0) + return; + -+ av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0])); ++// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ + av_assert0(offset <= gm->numbytes); + av_assert0(size <= gm->numbytes); + av_assert0(offset + size <= gm->numbytes); + -+ rfe->a.s[rfe->n].cmd = mode; -+ rfe->a.s[rfe->n].handle = gm->vcsm_handle; -+ rfe->a.s[rfe->n].addr = (unsigned int)gm->arm + offset; -+ rfe->a.s[rfe->n].size = size; -+ ++rfe->n; ++ { ++ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); ++ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ ++ av_assert0(rfe->n < CFE_ENT_COUNT); ++ ++ a->s[n].cmd = mode; ++ a->s[n].handle = gm->vcsm_handle; ++ a->s[n].addr = (unsigned int)gm->arm + offset; ++ a->s[n].size = size; ++ ++rfe->n; ++ } +} + +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) @@ -12266,23 +15580,38 @@ index 0000000..be58458 + } +} + -+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) +{ -+ const unsigned int y_offset = frame->linesize[0] * start_line; -+ const unsigned int y_size = frame->linesize[0] * n; ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; + // Round UV up/down to get everything + const unsigned int uv_rnd = (1U << uv_shift) >> 1; -+ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); -+ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - 
uv_offset; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; + ++#if 0 ++ // *** frame->height is cropped height so not good + // As all unsigned they will also reject -ve + // Test individually as well as added to reject overflow -+ av_assert0(start_line <= (unsigned int)frame->height); ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped + av_assert0(n <= (unsigned int)frame->height); + av_assert0(start_line + n <= (unsigned int)frame->height); ++#endif + -+ if (gpu_is_buf1(frame)) { ++ if (!gpu_is_buf1(frame)) ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++ else if (!rpi_sliced_frame(frame)) ++ { + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + if (do_luma) { + rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); @@ -12294,12 +15623,17 @@ index 0000000..be58458 + } + else + { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); ++ // **** Use x0! ++ for (int x = 0; x < x0 + width; x += frame->linesize[0]) { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, y0), y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, ++ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, y0 >> 1), uv_size); ++ } + } + } +} @@ -12340,13 +15674,11 @@ index 0000000..be58458 + + +// If sem_init actually takes time then maybe we want a pool... 
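/* NB (annotation, not patch content): despite the musing above, the code
 * below does keep a pool: wait objects with pre-initialised semaphores sit
 * on a singly linked free list inside the gpu env, so vq_wait_new() is a pop
 * taken under the gpu lock, and releasing the object after the wait is the
 * corresponding push, keeping sem_init()/sem_destroy() off the per-job path.
 * A minimal free-standing sketch of the same pattern, with hypothetical
 * names (POOL_SIZE, wait_get, wait_put): */
#if 0 /* illustrative only */
#include <pthread.h>
#include <semaphore.h>
#include <stddef.h>

#define POOL_SIZE 16

typedef struct wait_s {
    sem_t sem;                 /* initialised once, reused across jobs */
    struct wait_s *next;
} wait_t;

static wait_t pool_mem[POOL_SIZE];
static wait_t *pool_head;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

static void pool_init(void)    /* once, at start-up */
{
    for (int i = 0; i != POOL_SIZE; ++i) {
        sem_init(&pool_mem[i].sem, 0, 0);
        pool_mem[i].next = pool_head;
        pool_head = pool_mem + i;
    }
}

static wait_t *wait_get(void)  /* pop; NULL when exhausted */
{
    pthread_mutex_lock(&pool_lock);
    wait_t *const w = pool_head;
    if (w != NULL)
        pool_head = w->next;
    pthread_mutex_unlock(&pool_lock);
    return w;
}

static void wait_put(wait_t *const w) /* push back after sem_wait/sem_post */
{
    pthread_mutex_lock(&pool_lock);
    w->next = pool_head;
    pool_head = w;
    pthread_mutex_unlock(&pool_lock);
}
#endif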
-+static vq_wait_t * vq_wait_new(const unsigned int cost) ++static vq_wait_t * vq_wait_new(void) +{ + gpu_env_t * const ge = gpu_lock_ref(); + vq_wait_t * const wait = ge->wait_pool.head; + ge->wait_pool.head = wait->next; -+ ge->current_load += cost; -+ wait->cost = cost; + wait->next = NULL; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -12402,17 +15734,13 @@ index 0000000..be58458 + +static void vq_wait_post(vq_wait_t * const wait) +{ -+#if !RPI_TRACE_TIME_VPU_QPU_WAIT -+ if (wait->cost != 0) -+#endif ++#if RPI_TRACE_TIME_VPU_QPU_WAIT + { + gpu_env_t *const ge = gpu_lock(); -+ ge->current_load -= wait->cost; -+#if RPI_TRACE_TIME_VPU_QPU_WAIT + tto_end(&ge->ttw.active, ns_time()); -+#endif + gpu_unlock(); + } ++#endif + + sem_post(&wait->sem); +} @@ -12428,7 +15756,6 @@ index 0000000..be58458 +{ + unsigned int n; + unsigned int mask; -+ unsigned int cost; + struct gpu_job_s j[VPU_QPU_JOB_MAX]; +}; + @@ -12472,16 +15799,19 @@ index 0000000..be58458 +} + +// flags are QPU_FLAGS_xxx -+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail) ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) +{ + if (n != 0) { + struct gpu_job_s *const j = new_job(vqj); + vqj->mask |= VPU_QPU_MASK_QPU; -+ vqj->cost += cost; + + j->command = EXECUTE_QPU; + j->u.q.jobs = n; ++#if RPI_TRACE_QPU_PROFILE_ALL ++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; ++#else + j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; ++#endif + j->u.q.timeout = 5000; + memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); + } @@ -12503,7 +15833,7 @@ index 0000000..be58458 + } + + // We are going to want a sync object -+ wait = vq_wait_new(vqj->cost); ++ wait = vq_wait_new(); + + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync + // If we only posted one thing or only QPU jobs @@ -12525,7 +15855,6 @@ index 0000000..be58458 + j->callback.cookie = wait; + } + -+ vqj->cost = 0; + vqj->mask = 0; + *wait_h = wait; +} @@ -12544,11 +15873,6 @@ index 0000000..be58458 + return rv; +} + -+unsigned int vpu_qpu_current_load(void) -+{ -+ return gpu_ptr()->current_load; -+} -+ +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) +{ + if (wait_h != NULL) @@ -12598,17 +15922,15 @@ index 0000000..be58458 +} + +#endif // RPI -diff --git b/libavcodec/rpi_qpu.h a/libavcodec/rpi_qpu.h +diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..bcde316 +index 0000000..636e420 --- /dev/null -+++ a/libavcodec/rpi_qpu.h -@@ -0,0 +1,204 @@ ++++ b/libavcodec/rpi_qpu.h +@@ -0,0 +1,201 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + -+#include -+ +#define RPI_ONE_BUF 1 + +typedef struct gpu_mem_ptr_s { @@ -12731,10 +16053,8 @@ index 0000000..bcde316 + +// Cache flush stuff + -+typedef struct rpi_flush_envss { -+ unsigned int n; -+ struct vcsm_user_clean_invalid_s a; -+} rpi_cache_flush_env_t; ++struct rpi_cache_flush_env_s; ++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; + +rpi_cache_flush_env_t * rpi_cache_flush_init(void); +// Free env without flushing @@ -12753,8 +16073,9 @@ index 0000000..bcde316 +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + const unsigned int offset, const unsigned int size); +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void 
rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); + +// init, add, finish for one gm ptr +void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); @@ -12763,14 +16084,13 @@ index 0000000..bcde316 +// QPU specific functions +uint32_t qpu_fn(const int * const mc_fn); + -+#define QPU_N_UV 12 -+#define QPU_N_Y 12 -+#define QPU_N_MAX 16 ++#define QPU_N_GRP_UV 4 ++#define QPU_N_UV 12 ++#define QPU_N_GRP_Y 4 // 4 QPUs per TMU ++#define QPU_N_Y 12 ++#define QPU_N_MAX 12 + +#define QPU_MAIL_EL_VALS 2 -+#define QPU_MAIL_EL_SIZE (QPU_MAIL_EL_VALS * sizeof(uint32_t)) -+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS) -+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t)) + +struct vpu_qpu_wait_s; +typedef struct vq_wait_s * vpu_qpu_wait_h; @@ -12784,7 +16104,7 @@ index 0000000..bcde316 +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail); ++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); +void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); @@ -12795,7 +16115,6 @@ index 0000000..bcde316 + +// Waits for previous post_codee to complete and Will null out *wait_h after use +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+unsigned int vpu_qpu_current_load(void); +int vpu_qpu_init(void); +void vpu_qpu_term(void); + @@ -12806,14 +16125,16 @@ index 0000000..bcde316 +extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); + +extern int gpu_get_mailbox(void); ++void gpu_ref(void); ++void gpu_unref(void); + +#endif -diff --git b/libavcodec/rpi_shader.c a/libavcodec/rpi_shader.c +diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c new file mode 100644 -index 0000000..627cda9 +index 0000000..f2842b6 --- /dev/null -+++ a/libavcodec/rpi_shader.c -@@ -0,0 +1,624 @@ ++++ b/libavcodec/rpi_shader.c +@@ -0,0 +1,734 @@ +#include "rpi_shader.h" + +#ifdef _MSC_VER @@ -12837,744 +16158,947 @@ index 0000000..627cda9 +__attribute__((aligned(8))) +#endif +unsigned int rpi_shader[] = { -+// ::mc_setup_uv -+/* [0x00000000] */ 0x95801ff6, 0xd002591e, // mov tmurs, 1 ; mov ra_link, unif -+/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000010] */ 0x159a7d80, 0x10020827, // mov r0, elem_num -+/* [0x00000018] */ 0x0c027c00, 0x14020427, // add ra_x, ra0.16b, r0 -+/* [0x00000020] */ 0x15027d80, 0x12020767, // mov ra_y, ra0.16a -+/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif -+/* [0x00000030] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* 
[0x00000038] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base -+/* [0x00000040] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1 -+/* [0x00000048] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1 -+/* [0x00000050] */ 0x15827d80, 0x10021427, // mov rb16, unif -+/* [0x00000058] */ 0x0c827380, 0x10021627, // add rb24, r1, unif -+/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000070] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000078] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00000080] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000088] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000090] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+/* [0x00000098] */ 0x00000000, 0xe0020327, // mov ra12, 0 -+/* [0x000000a0] */ 0x00000000, 0xe0020367, // mov ra13, 0 -+/* [0x000000a8] */ 0x00000000, 0xe00203a7, // mov ra14, 0 -+/* [0x000000b0] */ 0x00000000, 0xe00203e7, // mov ra15, 0 -+/* [0x000000b8] */ 0x00000000, 0xe0020267, // mov ra9, 0 -+/* [0x000000c0] */ 0x15427d80, 0x10020827, // mov r0, ra_x -+/* [0x000000c8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0 ; mov r1, ra_y -+/* [0x000000d0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base -+/* [0x000000d8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset -+/* [0x000000e0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1 -+/* [0x000000e8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3 -+/* [0x000000f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x000000f8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0 -+/* [0x00000100] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000108] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+/* [0x00000110] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2 -+/* [0x00000118] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1 -+/* [0x00000120] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif -+/* [0x00000128] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000130] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000140] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000150] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000158] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000160] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000168] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000170] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000178] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000180] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000188] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x00000190] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000198] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x000001a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000001a8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x000001b0] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x -+/* [0x000001b8] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base ++// ::mc_setup_c_q0 ++// ::mc_start ++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c_qn ++/* 
[0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 ++/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000028] */ 0x0c9e7000, 0x10021667, // add rb_max_x, r0, r0 ++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000038] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000040] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000048] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000050] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 ++/* [0x00000058] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 ++/* [0x00000060] */ 0x00000000, 0xe00059ce, // nop ; mov ra14, 0 ++/* [0x00000068] */ 0x8c5103f6, 0x1802560f, // add rb_dma1_base, r1, rb_pitch ; mov ra15, ra_k0 ++/* [0x00000070] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x00000078] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000080] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num ++/* [0x00000088] */ 0x0c027d80, 0x14020827, // add r0, ra0.16b, ra0.16b ++/* [0x00000090] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000a8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000000b0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000000b8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000000c0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000000c8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000000d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000000d8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000000e0] */ 0x0c809f80, 0xd0021367, // add rb_wt_den_p15, 9, unif ++/* [0x000000e8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000000f0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000100] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000108] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000110] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000118] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000138] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000140] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x00000148] */ 0x0c027d80, 0x14020827, // add r0, ra0.16b, ra0.16b ++/* [0x00000150] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00000158] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000160] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000168] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000170] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000178] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00000180] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00000188] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000190] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* 
[0x00000198] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 ++/* [0x000001a0] */ 0x95442ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++// :c_preload ++/* [0x000001a8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000001b0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000001b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001c0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001c8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000001d0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000001d8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:c_preload ++/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001e8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001f0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x000001f8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000200] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00000208] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000210] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000218] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000220] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +// ::mc_filter_uv -+/* [0x000001c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000001c8] */ 0x15827d80, 0x100200a7, // mov ra2, unif -+/* [0x000001d0] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num ; mov r3, unif -+/* [0x000001d8] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0 -+/* [0x000001e0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000001e8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 -+/* [0x000001f0] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x000001f8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000200] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000208] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000210] */ 0x9509cdbf, 0x12024731, // mov ra_y_next, ra2.16a ; mov vw_setup, rb28 -+/* [0x00000218] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000220] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000228] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000230] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x00000238] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000240] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait -+/* [0x00000248] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:filter_uv_1 -+/* [0x00000250] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000258] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000260] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif -+/* [0x00000268] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 -+/* [0x00000270] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 -+/* [0x00000278] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 -+// :filter_uv_1 -+/* [0x00000280] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000288] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x00000290] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x00000298] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c -+/* [0x000002a0] */ 0x950c0ff6, 0xde0248cb, 
// mov r3, 0 ; mov rb11, ra3.8d -+/* [0x000002a8] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x000002b0] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+/* [0x000002b8] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 ++/* [0x00000228] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000230] */ 0x14981dc0, 0xd00229e7, // and.setf -, elem_num, 1 ++/* [0x00000238] */ 0xec0a7d89, 0x14024821, // add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 ++/* [0x00000240] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00000248] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x00000250] */ 0x935401f6, 0xd4024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next ++/* [0x00000258] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x00000260] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000268] */ 0x9481c1f6, 0xd0025800, // and r0, r0, -4 ; mov ra0, unif ++/* [0x00000270] */ 0x800a7036, 0x122059d3, // nop ; mov ra_y_next, ra2.16a ++/* [0x00000278] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 ++/* [0x00000280] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000288] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a ++/* [0x00000290] */ 0x0c9e7600, 0x100206a7, // add ra_base_next, r3, r0 ++/* [0x00000298] */ 0x119c73c0, 0xd0020827, // shl r0, r1, 7 ++/* [0x000002a0] */ 0x8d818eb6, 0x10025743, // sub rb_dma1, rb_dma1_base, r2 ; mov ra3, unif ++/* [0x000002a8] */ 0x8c8013f6, 0xd0025456, // add rb_i_tmu, r1, 3 - PREREAD ; mov ra_wt_off_mul_l0, unif ++/* [0x000002b0] */ 0x8c8033f6, 0xd002d496, // add rb_lcount, r1, 3 ; mov.ifnz ra_wt_off_mul_l0, unif ++/* [0x000002b8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a ++/* [0x000002c0] */ 0x910d01f6, 0xda024809, // shl r0, r0, i_shift16 ; mov rb9, ra3.8b ++/* [0x000002c8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000002d0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x000002d8] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb_wt_den_p15 ; mov rb10, ra3.8c ++/* [0x000002e0] */ 0x950c0ff6, 0xde02494b, // mov r5quad, 0 ; mov rb11, ra3.8d ++/* [0x000002e8] */ 0x8f8013f6, 0xd002531e, // asr rb_wt_off, r1, 1 ; mov ra_link, unif ++/* [0x000002f0] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++/* [0x000002f8] */ 0x0000ff00, 0xe20210e7, // mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +// :uvloop -+/* [0x000002c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000002c8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x000002d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x000002d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x000002e0] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 -+/* [0x000002e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x000002f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x000002f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000300] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000308] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000310] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 
1] -+/* [0x00000318] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000320] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000328] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000330] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000338] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000340] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000348] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000350] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000358] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000360] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000368] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000370] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000378] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x00000380] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x00000388] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x00000390] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x00000398] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x000003a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000003a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003b0] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x000003b8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000003c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x000003c8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003d0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x000003d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000003e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x000003e8] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26 ; mov ra9, rb26 -+/* [0x000003f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000003f8] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29 ; mov ra10, rb29 -+/* [0x00000400] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x00000408] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000300] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000308] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++/* [0x00000310] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, 8 ; mov.ifnz r3, ra_y ++/* [0x00000318] */ 0x8c6817f6, 0xd0029818, // add r0, r3, 1 ; mov.ifz ra_base, ra_base_next ++/* [0x00000320] */ 0x94981f80, 0xd02279d1, // and.setf -, 1, elem_num ; mov ra_y, r0 ++/* [0x00000328] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000330] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++/* [0x00000338] */ 0x559d049f, 0x10044822, // mov.ifz r0, r2 ; mul24 r2, r3, rb_pitch ++/* [0x00000340] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000348] */ 0x95143ff6, 0x100279c4, // mov.setf -, rb3 ; mov ra4, ra5 ++/* [0x00000350] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00000358] */ 0x4003e030, 
0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000360] */ 0x40034031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000368] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000370] */ 0x40032031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000378] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d , r1 ++/* [0x00000380] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x00000388] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00000390] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00000398] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x000003a0] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x000003a8] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000003b0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000003b8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000003c8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off ++/* [0x000003e0] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x000003e8] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb_wt_den_p15 ++/* [0x000003f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000003f8] */ 0x15067d80, 0x18020c27, // mov vpm, ra1.8a ++/* [0x00000400] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000408] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb_dma0 ++/* [0x00000410] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb_dma1 ++/* [0x00000418] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest +// ::mc_filter_uv_b0 -+/* [0x00000410] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000418] */ 0x15827d80, 0x100200a7, // mov ra2, unif -+/* [0x00000420] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num ; mov r3, unif -+/* [0x00000428] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0 -+/* [0x00000430] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000438] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 -+/* [0x00000440] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x00000448] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000450] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000458] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000460] */ 0x150a7d80, 0x12020727, // mov ra_y_next, ra2.16a -+/* [0x00000468] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000470] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000478] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000480] */ 0x0c043dc0, 0xd20207e7, // add ra31, ra1.16a, 3 -+/* [0x00000488] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000490] */ 0x8c0601bf, 0x14025803, // add r0, r0, ra1.16b ; mov ra3, unif -+/* [0x00000498] */ 0x918101f6, 0xd002480e, // shl r0, r0, i_shift16 ; mov rb14, unif -+/* [0x000004a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x000004a8] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a -+/* [0x000004b0] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b -+/* [0x000004b8] */ 
0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c -+/* [0x000004c0] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d -+/* [0x000004c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000004d0] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0 -+// :uvloop_b0 -+/* [0x000004d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000004e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x000004e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x000004f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x000004f8] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 -+/* [0x00000500] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000508] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000510] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000518] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000520] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000528] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000530] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000538] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000540] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000548] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000550] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000558] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000560] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000568] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000570] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000578] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000580] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x00000588] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000590] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x00000598] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000005a0] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x000005a8] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x000005b0] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 -+/* [0x000005b8] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 -+/* [0x000005c0] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 -+/* [0x000005c8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005d0] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 -+/* [0x000005d8] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 -+/* [0x000005e0] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 -+/* [0x000005e8] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 -+/* [0x000005f0] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin -+/* [0x000005f8] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif -+/* 
[0x00000600] */ 0x95810ff6, 0xd0020827, // mov r0, i_shift16 ; mov -, unif -+/* [0x00000608] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 -+/* [0x00000610] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 -+/* [0x00000618] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+/* [0x00000620] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+/* [0x00000628] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+/* [0x00000630] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+/* [0x00000638] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 -+/* [0x00000640] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+/* [0x00000648] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+/* [0x00000650] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin -+/* [0x00000658] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+/* [0x00000660] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+/* [0x00000668] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+// :uv_b0_post12 -+/* [0x00000670] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 -+/* [0x00000678] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+/* [0x00000680] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 -+/* [0x00000688] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 -+/* [0x00000690] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+/* [0x00000698] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 -+// ::mc_filter_uv_b -+// :uv_b0_post_fin -+/* [0x000006a0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000006a8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait -+/* [0x000006b0] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:uv_filter_b_1 -+/* [0x000006b8] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num -+/* [0x000006c0] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 -+/* [0x000006c8] */ 0x0c027c00, 0x14020827, // add r0, ra0.16b, r0 -+/* [0x000006d0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 -+/* [0x000006d8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 -+/* [0x000006e0] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 -+// :uv_filter_b_1 -+/* [0x000006e8] */ 0x930001f6, 0xd202581c, // max r0, r0, 0 ; mov ra_y_next, ra0.16a -+/* [0x000006f0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x000006f8] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 -+/* [0x00000700] */ 0x8c8270f6, 0x10020827, // add r0, r0, r3 ; mov -, unif -+/* [0x00000708] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000710] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000720] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x00000728] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x00000730] */ 0x8c0d3eb6, 0x1c02468a, // add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c -+/* [0x00000738] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x00000740] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x00000748] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 ++/* [0x00000420] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov 
ra2, unif ++/* [0x00000428] */ 0x14981dc0, 0xd00229e7, // and.setf -, elem_num, 1 ++/* [0x00000430] */ 0xec0a7d89, 0x14024821, // add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 ++/* [0x00000438] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00000440] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x00000448] */ 0x935401f6, 0xd4125815, // max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++/* [0x00000450] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x00000458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000460] */ 0x9481c1f6, 0xd0025800, // and r0, r0, -4 ; mov ra0, unif ++/* [0x00000468] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 ++/* [0x00000470] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000478] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a ++/* [0x00000480] */ 0x0c9e7600, 0x100206a7, // add ra_base_next, r3, r0 ++/* [0x00000488] */ 0x918073f6, 0xd0025802, // shl r0, r1, 7 ; mov ra2, unif ++/* [0x00000490] */ 0x0d9d8e80, 0x10021767, // sub rb_dma1, rb_dma1_base, r2 ++/* [0x00000498] */ 0x0c9c13c0, 0xd0021467, // add rb_i_tmu, r1, 3 - PREREAD ++/* [0x000004a0] */ 0x0c9c33c0, 0xd00214a7, // add rb_lcount, r1, 3 ++/* [0x000004a8] */ 0x8c8270b6, 0x10125816, // add r0, r0, r2 ; mov ra_wt_mul_l0, unif ++/* [0x000004b0] */ 0x915201bf, 0x1c12d816, // shl r0, r0, ra_k16 ; mov.ifnz ra_wt_mul_l0, unif ++/* [0x000004b8] */ 0x8c81b1f6, 0x10025683, // add rb_dma0, r0, rb_dma0_base ; mov ra3, unif ++/* [0x000004c0] */ 0x159defc0, 0x10020267, // mov ra9, rb_max_y ++/* [0x000004c8] */ 0xec0e7d89, 0x14024821, // add r0, ra3.16b, ra3.16b ; v8subs r1, r1, r1 ++/* [0x000004d0] */ 0x8c0c21f6, 0x12125813, // add r0, r0, rb_elem_x ; mov ra_y2_next, ra3.16a ++/* [0x000004d8] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x000004e0] */ 0x935011bf, 0x18024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000004e8] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x000004f0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000004f8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000500] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif ++/* [0x00000508] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000510] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a ++/* [0x00000518] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00000520] */ 0x950e0ff6, 0x1a024489, // mov ra_wt_off_mul_l1, unif ; mov rb9, ra3.8b ++/* [0x00000528] */ 0x950e0ff6, 0x1c06448a, // mov.ifnz ra_wt_off_mul_l1, unif ; mov rb10, ra3.8c ++/* [0x00000530] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif ++/* [0x00000538] */ 0x950c0ff6, 0xde02494b, // mov r5quad,0 ; mov rb11, ra3.8d ++/* [0x00000540] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 ++/* [0x00000548] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++/* [0x00000550] */ 0x0000ff00, 0xe20210e7, // mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +// :uvloop_b -+/* [0x00000750] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000758] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x00000760] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000768] */ 0x95710dbf, 0x10044763, 
// mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000770] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 -+/* [0x00000778] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000780] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000788] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000790] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000798] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x000007a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000007a8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x000007b0] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000007b8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000007c0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000007c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000007d0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000007d8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000007e0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000007e8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000007f0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000007f8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x00000800] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000808] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x00000810] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x00000818] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x00000820] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x00000828] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 -+/* [0x00000830] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+/* [0x00000838] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 -+/* [0x00000840] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+/* [0x00000848] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 -+/* [0x00000850] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 -+/* [0x00000858] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 -+/* [0x00000860] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 -+/* [0x00000868] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x00000870] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 -+/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 -+/* [0x00000880] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a -+/* [0x00000888] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26 ; mov ra9, rb26 -+/* [0x00000890] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000898] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29 ; mov ra10, rb29 -+/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x000008a8] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+// ::mc_exit_c -+/* [0x000008b0] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait -+/* [0x000008b8] */ 
0x00000020, 0xf02809e7, // brr.anyz -, r:exit_c_1 ++/* [0x00000558] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000560] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000568] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, 8 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00000570] */ 0x95685ff6, 0x10029118, // mov rb4, rb5 ; mov.ifz ra_base, ra_base_next ++/* [0x00000578] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00000580] */ 0x14981f80, 0xd00229e7, // and.setf -, 1, elem_num ++/* [0x00000588] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000590] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++/* [0x00000598] */ 0x559d049f, 0x10044823, // mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x000005a0] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_k255 ++/* [0x000005a8] */ 0x95143ff6, 0x100279c4, // mov.setf -, rb3 ; mov ra4, ra5 ++/* [0x000005b0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x000005b8] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000005c0] */ 0x40034031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000005c8] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000005d0] */ 0x40032031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000005d8] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x000005e0] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++/* [0x000005e8] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++/* [0x000005f0] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, 8 ; mov r3, ra_y2 ++/* [0x000005f8] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++/* [0x00000600] */ 0x14981f80, 0xd00229e7, // and.setf -, 1, elem_num ++/* [0x00000608] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000610] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++/* [0x00000618] */ 0x559d049f, 0x10044823, // mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000620] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_k255 ++/* [0x00000628] */ 0x950c3ff6, 0x100269c7, // mov.setf -, rb3 ; mov rb7, ra3 ++/* [0x00000630] */ 0x540563f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra1.8a, r0 ++/* [0x00000638] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000640] */ 0x40074031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000648] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000650] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000658] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++/* [0x00000660] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x00000668] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00000670] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 ++/* [0x00000678] */ 0x4d08443e, 0x180241e0, // sub ra7, r2, r0 ; mul24 r0, rb4, ra2.8a ++/* [0x00000680] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00000688] */ 0x4c08723e, 0x1e024860, // 
add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00000690] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000698] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++/* [0x000006a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000006a8] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++/* [0x000006b0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++/* [0x000006b8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++/* [0x000006c0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x000006c8] */ 0x0c9e7280, 0x10020867, // add r1, r1, r2 ++/* [0x000006d0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000006d8] */ 0xfffffe60, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x000006e0] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x000006e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000006f0] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a ++/* [0x000006f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000700] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb_dma0 ++/* [0x00000708] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb_dma1 ++/* [0x00000710] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest ++// ::mc_sync_q0 ++/* [0x00000718] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000720] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000728] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000730] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000738] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000740] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000748] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000750] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000758] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q1 ++/* [0x00000760] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000770] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000778] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000780] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000788] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q2 ++/* [0x00000790] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000798] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000007a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000007a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000007b0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000007b8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q3 ++/* [0x000007c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000007c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000007d0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000007d8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000007e0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000007e8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q4 ++/* [0x000007f0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000007f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000800] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000808] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000810] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000818] */ 0x00000000, 
0xf0f7c9e7, // bra -, ra_link ++/* [0x00000820] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000828] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000830] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q5 ++/* [0x00000838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000850] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000858] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000860] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q6 ++/* [0x00000868] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000878] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000880] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000888] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000890] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q7 ++/* [0x00000898] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000008a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000008a8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000008b0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000008b8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) +/* [0x000008c0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000008d8] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 -+/* [0x000008e0] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 -+/* [0x000008e8] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 -+/* [0x000008f0] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q8 ++/* [0x000008c8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000008d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000008d8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000008e0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000008e8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000008f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000008f8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000900] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000908] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q9 ++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000920] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000928] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000930] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000938] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q10 ++/* [0x00000940] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000948] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000950] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000958] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000960] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000968] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q11 ++/* [0x00000970] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000980] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000988] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000990] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* 
[0x00000998] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit -+// :exit_c_1 -+/* [0x000008f8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000900] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000908] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000910] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00000918] */ 0x00000000, 0xe80009e7, // mov -,srel(0) -+/* [0x00000920] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop -+/* [0x00000930] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_setup -+/* [0x00000938] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif -+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif -+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000958] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000960] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000968] */ 0x0d0c1dc0, 0xd4021667, // sub rb_frame_width_minus_1, ra3.16b, 1 -+/* [0x00000970] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_frame_height_minus_1, ra3.16a, 1 -+/* [0x00000978] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000980] */ 0x15827380, 0x10021627, // or rb24, r1, unif -+/* [0x00000988] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num -+/* [0x00000990] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 -+/* [0x00000998] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000009a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 -+/* [0x000009a8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x000009b0] */ 0x0c201dc0, 0xd4020767, // add ra_y, ra8.16b, 1 -+/* [0x000009b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x000009c0] */ 0x0c267c00, 0x100208a7, // add r2, ra9, r0 -+/* [0x000009c8] */ 0x13200dc0, 0xd4020867, // max r1, ra8.16b, 0 -+/* [0x000009d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x000009d8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x000009e0] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2 -+/* [0x000009e8] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 -+/* [0x000009f0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000009f8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 -+/* [0x00000a00] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000a08] */ 0x0c281dc0, 0xd4120567, // add ra_y2, ra10.16b, 1 -+/* [0x00000a10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x00000a18] */ 0x0c2e7c00, 0x100208a7, // add r2, ra11, r0 -+/* [0x00000a20] */ 0x13280dc0, 0xd4020867, // max r1, ra10.16b, 0 -+/* [0x00000a28] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000a30] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000a38] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2 -+/* [0x00000a40] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000a48] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000a50] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000a58] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00000a60] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00000a68] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00000a70] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+/* [0x00000a78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000a80] */ 0x0f9c25c0, 0xd0020867, // asr 
r1, r2, 2 -+/* [0x00000a88] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000a90] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000a98] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000aa0] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000aa8] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000ab0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000ab8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000ac0] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000ac8] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 -+/* [0x00000ad0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x00000ad8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000ae0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x00000ae8] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch -+/* [0x00000af0] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base -+/* [0x00000af8] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 -+/* [0x00000b00] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000b08] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 -+/* [0x00000b10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b18] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2 -+// :per_block_setup -+/* [0x00000b20] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000b28] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b30] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num -+/* [0x00000b38] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next -+/* [0x00000b40] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next -+/* [0x00000b48] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000b50] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000b58] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif -+/* [0x00000b60] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000b68] */ 0x15067d80, 0x14020727, // mov ra_y_next, ra1.16b -+/* [0x00000b70] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000b78] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0 -+/* [0x00000b80] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000b88] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif -+/* [0x00000b98] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000ba0] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b -+/* [0x00000ba8] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000bb0] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0 -+/* [0x00000bb8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x00000bc0] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000bc8] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5 -+/* [0x00000bd0] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7 -+/* [0x00000bd8] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000be0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000be8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000bf0] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif -+/* [0x00000bf8] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif -+/* [0x00000c00] */ 
0x01040400, 0xe00208a7, // mov r2, 0x01040400 -+/* [0x00000c08] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a -+/* [0x00000c10] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000c18] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000c20] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000c28] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d -+/* [0x00000c30] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c -+/* [0x00000c38] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000c40] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000c48] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000c50] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000c58] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000c60] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000c68] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000c70] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d -+/* [0x00000c78] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c -+/* [0x00000c80] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000c88] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d -+/* [0x00000c90] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c -+/* [0x00000c98] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000ca0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000ca8] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c -+/* [0x00000cb0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000cb8] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d -+/* [0x00000cc0] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c -+/* [0x00000cc8] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif -+/* [0x00000cd0] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b -+/* [0x00000cd8] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c -+/* [0x00000ce0] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 -+/* [0x00000ce8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000cf0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 -+/* [0x00000cf8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 -+/* [0x00000d00] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d -+// ::mc_filter -+/* [0x00000d08] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 -+// :yloop -+/* [0x00000d10] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000d18] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000d20] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000d28] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000d30] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000d38] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000d40] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000d48] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000d50] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000d58] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000d60] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000d68] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000d70] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8min r1, r1, 
rb_k255 -+/* [0x00000d78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000d80] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000d88] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000d90] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000d98] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000da0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000da8] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000db0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000db8] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000dc0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000dc8] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000dd0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00000dd8] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00000de0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00000de8] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000df0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00000df8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00000e00] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000e08] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000e10] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000e18] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000e20] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00000e28] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00000e30] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00000e38] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00000e40] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00000e48] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00000e50] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00000e58] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00000e60] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00000e68] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00000e70] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00000e78] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x00000e80] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00000e88] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00000e90] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00000e98] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000ea0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000ea8] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000eb0] */ 
0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00000eb8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00000ec0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000ec8] */ 0xfffffc38, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x00000ed0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000ed8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000ee0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_filter_b -+// :yloopb -+/* [0x00000ee8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000ef0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000ef8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000f00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000f08] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000f10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000f18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000f20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000f28] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000f30] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000f38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000f40] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000f48] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000f50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000f58] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000f60] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000f68] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000f70] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000f78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000f80] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000f88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000f90] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000f98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000fa0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000fa8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00000fb0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00000fb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00000fc0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000fc8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00000fd0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00000fd8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov 
r3, rb31 -+/* [0x00000fe0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000fe8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000ff0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00000ff8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00001000] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00001008] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00001010] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00001018] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00001020] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001028] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001030] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001038] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001040] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001048] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001050] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 -+/* [0x00001058] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001060] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001068] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 -+/* [0x00001070] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 -+/* [0x00001078] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x00001080] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00001088] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001090] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00001098] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000010a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x000010a8] */ 0xfffffa58, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x000010b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x000010b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x000010c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_interrupt_exit12c -+/* [0x000010c8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait -+/* [0x000010d0] */ 0x00000020, 0xf02809e7, // brr.anyz -, r:exit12_c_1 -+/* [0x000010d8] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000010e8] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000010f0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 -+/* [0x000010f8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 -+/* [0x00001100] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 -+/* [0x00001108] */ 0x00000000, 0xe0020267, // mov ra9, 0 ++// ::mc_exit_c ++/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000009a8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000009b0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000009b8] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x000009c0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000009c8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000009d0] */ 0x009e7000, 0x100009e7, // nop +// ::mc_interrupt_exit12 -+// :exit12_c_1 -+/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001118] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001120] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001128] */ 0x159f2fc0, 
0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001178] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001180] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001188] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00001190] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00001198] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_exit1 -+/* [0x000011a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000011b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000011b8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000011c0] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000011c8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000011d0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x000011d8] */ 0x009e7000, 0x100009e7, // nop ; nop ++// ::mc_interrupt_exit12c ++/* [0x000009d8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000009e0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000009f0] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x000009f8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a00] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000a08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y_q0 ++/* [0x00000a18] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y_qn ++/* [0x00000a20] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000a28] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000a30] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000a38] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000a40] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000a48] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000a50] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000a58] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000a60] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000a68] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000a70] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000a78] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000a80] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch ++/* [0x00000a88] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x00000a90] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00000a98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000aa0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000aa8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000ab0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000ab8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000ac0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000ac8] */ 
0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ad0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000ad8] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000ae0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000ae8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000af0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000af8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000b00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000b08] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000b10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000b18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000b20] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00000b28] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00000b30] */ 0x95042ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :y_preload ++/* [0x00000b38] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000b40] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00000b48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000b50] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000b58] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00000b60] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00000b68] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:y_preload ++/* [0x00000b70] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000b78] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000b80] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x00000b88] */ 0x0c809dc0, 0xd0021367, // add rb_wt_den_p15, unif, 9 ++/* [0x00000b90] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000b98] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000ba0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000ba8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000bb0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000bb8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000bc0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000bc8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000bd0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000bd8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000be0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000be8] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000bf0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000bf8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000c00] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000c08] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup ++/* [0x00000c10] */ 0x935401f6, 0xd4125815, // max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++/* [0x00000c18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000c20] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000c28] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000c30] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00000c38] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00000c40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000c48] */ 
0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00000c50] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00000c58] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000c60] */ 0x930401f6, 0xd2125813, // max r0, r0, 0 ; mov ra_y2_next, ra1.16a ++/* [0x00000c68] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x00000c70] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000c78] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00000c80] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000c88] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000c90] */ 0x8c9dc07f, 0x10024831, // add r0, r0, r1 ; mov vw_setup, rb_vpm_init ++/* [0x00000c98] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00000ca0] */ 0x0d418f80, 0x14021767, // sub rb_dma1, rb_dma1_base, ra_width ++/* [0x00000ca8] */ 0x8c405df6, 0xd2025460, // add rb_i_tmu, ra_height, 7 - PREREAD ; mov r0, ra_height ++/* [0x00000cb0] */ 0x12527180, 0x1c020827, // min r0, r0, ra_k16 ++/* [0x00000cb8] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x00000cc0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 ++/* [0x00000cc8] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width ++/* [0x00000cd0] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 ++/* [0x00000cd8] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00000ce0] */ 0x918101f6, 0xd0045816, // shl.ifz r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000ce8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3 ++/* [0x00000cf0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000cf8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00000d00] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00000d08] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00000d10] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00000d18] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00000d20] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00000d28] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00000d30] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00000d38] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00000d40] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00000d48] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00000d50] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000d58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00000d60] */ 0x90216387, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, rb_k255 ++/* [0x00000d68] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00000d70] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000d78] */ 0x90216387, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, rb_k255 ++/* [0x00000d80] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000d88] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000d90] */ 0x90216387, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, rb_k255 ++/* [0x00000d98] */ 0x954a0dbf, 0x10064597, // mov.ifnz ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x00000da0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x00000da8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000db0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000db8] */ 
0x90216387, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, rb_k255 ++/* [0x00000dc0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x00000dc8] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter ++/* [0x00000dd0] */ 0xfffffe20, 0xf0f807a7, // brr ra_link, r:per_block_setup ++/* [0x00000dd8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00000de0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000de8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00000df0] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :yloop ++/* [0x00000df8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00000e00] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00000e08] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00000e10] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00000e18] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00000e20] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00000e28] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000e30] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00000e38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000e40] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00000e48] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000e50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000e58] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00000e60] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000e68] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000e70] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000e78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000e80] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000e88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000e90] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000e98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000ea0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000ea8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00000eb0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00000eb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00000ec0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000ec8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00000ed0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ 
"mul_used", 0 ++/* [0x00000ed8] */ 0x8d208bf6, 0xd00269e1, // sub.setf -, r5, 8 ; mov r1, ra8 ++/* [0x00000ee0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00000ee8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00000ef0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00000ef8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00000f00] */ 0x8d9e74c9, 0x100242cb, // sub ra11, r2, r3 ; mov rb11, r1 ++/* [0x00000f08] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00000f10] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00000f18] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00000f20] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00000f28] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00000f30] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00000f38] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00000f40] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00000f48] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00000f50] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00000f58] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00000f60] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00000f68] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off ++/* [0x00000f70] */ 0x914083f6, 0xd2024860, // shl r1, r1, 8 ; mov r0, ra_height ++/* [0x00000f78] */ 0xfffffe60, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00000f80] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x00000f88] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait ++/* [0x00000f90] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a ++/* [0x00000f98] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0 ++/* [0x00000fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00000fa8] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0 ++/* [0x00000fb0] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1 ++/* [0x00000fb8] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest ++/* [0x00000fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000fc8] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23 ++/* [0x00000fd0] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0 ++/* [0x00000fd8] */ 0xfffffe00, 0xf0f809e7, // brr -, r:yloop ++/* [0x00000fe0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x00000fe8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x00000ff0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_b ++/* [0x00000ff8] */ 0xfffffbf8, 0xf0f807a7, // brr ra_link, r:per_block_setup ++/* [0x00001000] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001008] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00001010] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :yloopb ++/* [0x00001018] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001020] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001028] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001030] */ 
0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001038] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001040] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001048] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001050] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001058] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001060] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001068] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 ++/* [0x00001070] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00001078] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00001080] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00001088] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00001090] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00001098] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000010a0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000010a8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000010b0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000010b8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000010c0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000010c8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000010d0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000010d8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000010e0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000010e8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000010f0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x000010f8] */ 0x8d208bf6, 0xd00269e1, // sub.setf -, r5, 8 ; mov r1, ra8 ++/* [0x00001100] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00001108] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x00001110] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00001118] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001120] */ 0x8d9e74c9, 0x100242cb, // sub ra11, r2, r3 ; mov rb11, r1 ++/* [0x00001128] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00001130] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00001138] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001140] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001148] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001150] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001158] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001160] */ 
0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001168] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00001170] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001178] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001180] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001188] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x00001190] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0 ++/* [0x00001198] */ 0x914083f6, 0xd2024860, // shl r1, r1, 8 ; mov r0, ra_height ++/* [0x000011a0] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x000011a8] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x000011b0] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait ++/* [0x000011b8] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a ++/* [0x000011c0] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0 ++/* [0x000011c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000011d0] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0 ++/* [0x000011d8] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1 ++/* [0x000011e0] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest ++/* [0x000011e8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000011f0] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23 ++/* [0x000011f8] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0 ++/* [0x00001200] */ 0xfffffdf8, 0xf0f809e7, // brr -, r:yloopb ++/* [0x00001208] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x00001210] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x00001218] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_p00 ++/* [0x00001220] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001228] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00001230] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00001238] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00001240] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001248] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00001250] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00001258] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00001260] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00001268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001270] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif ++/* [0x00001278] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00001280] */ 0x0d418f80, 0x14021767, // sub rb_dma1, rb_dma1_base, ra_width ++/* [0x00001288] */ 0x8d402df6, 0xd2025460, // sub rb_i_tmu, ra_height, PREREAD ; mov r0, ra_height ++/* [0x00001290] */ 0x12527180, 0x1c020827, // min r0, r0, ra_k16 ++/* [0x00001298] */ 0x8c8001f6, 0xd0025496, // add rb_lcount, r0, 0 ; mov ra_wt_off_mul_l0, unif ++/* [0x000012a0] */ 0x918071f6, 0xd0024817, // shl r0, r0, 7 ; mov rb_dest, unif ++/* [0x000012a8] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width ++/* [0x000012b0] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 ++/* [0x000012b8] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000012c0] */ 
0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000012c8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :yloop_p00 ++/* [0x000012d0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000012d8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000012e0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000012e8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000012f0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000012f8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001300] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00001308] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001310] */ 0x9140f3f6, 0xd2024860, // shl r1, r1, 15 ; mov r0, ra_height ++/* [0x00001318] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off ++/* [0x00001320] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:yloop_p00 ++/* [0x00001328] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x00001330] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait ++/* [0x00001338] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a ++/* [0x00001340] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0 ++/* [0x00001348] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001350] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0 ++/* [0x00001358] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1 ++/* [0x00001360] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest ++/* [0x00001368] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001370] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23 ++/* [0x00001378] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0 ++/* [0x00001380] */ 0xffffff30, 0xf0f809e7, // brr -, r:yloop_p00 ++/* [0x00001388] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x00001390] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x00001398] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_b00 ++/* [0x000013a0] */ 0xfffff850, 0xf0f807a7, // brr ra_link, r:per_block_setup ++/* [0x000013a8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x000013b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x000013b8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000013c0] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x000013c8] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x000013d0] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x000013d8] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x000013e0] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x000013e8] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :yloop_b00 ++/* [0x000013f0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x000013f8] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001400] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001408] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001410] 
*/ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001418] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001420] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001428] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001430] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001438] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001440] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 ++/* [0x00001448] */ 0x545963c6, 0x12024860, // and r1, r1, rb_k255 ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00001450] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00001458] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00001460] */ 0x119ce3c0, 0xd0020867, // shl r1, r1, 14 ++/* [0x00001468] */ 0x8c40c3f6, 0x12024860, // add r1, r1, rb_wt_off ; mov r0, ra_height ++/* [0x00001470] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:yloop_b00 ++/* [0x00001478] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x00001480] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait ++/* [0x00001488] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a ++/* [0x00001490] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0 ++/* [0x00001498] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000014a0] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0 ++/* [0x000014a8] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1 ++/* [0x000014b0] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest ++/* [0x000014b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000014c0] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23 ++/* [0x000014c8] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0 ++/* [0x000014d0] */ 0xffffff00, 0xf0f809e7, // brr -, r:yloop_b00 ++/* [0x000014d8] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x000014e0] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x000014e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ +#pragma Align_to(8, rpi_shader) +#endif -diff --git b/libavcodec/rpi_shader.h a/libavcodec/rpi_shader.h +diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..3b1229e +index 0000000..a44bce9 --- /dev/null -+++ a/libavcodec/rpi_shader.h -@@ -0,0 +1,20 @@ ++++ b/libavcodec/rpi_shader.h +@@ -0,0 +1,35 @@ +#ifndef rpi_shader_H +#define rpi_shader_H + +extern unsigned int rpi_shader[]; + -+#define mc_setup_uv (rpi_shader + 0) -+#define mc_filter_uv (rpi_shader + 112) -+#define mc_filter_uv_b0 (rpi_shader + 260) -+#define mc_filter_uv_b (rpi_shader + 424) -+#define mc_exit_c (rpi_shader + 556) -+#define mc_exit (rpi_shader + 574) -+#define mc_setup (rpi_shader + 590) -+#define mc_filter (rpi_shader + 834) -+#define mc_filter_b (rpi_shader + 954) -+#define mc_interrupt_exit12c (rpi_shader + 1074) -+#define mc_interrupt_exit12 (rpi_shader + 1092) -+#define mc_exit1 (rpi_shader + 1128) -+#define mc_end (rpi_shader + 1144) ++#define mc_setup_c_q0 (rpi_shader + 0) ++#define mc_start (rpi_shader + 0) ++#define mc_setup_c_qn (rpi_shader + 2) ++#define mc_filter_uv (rpi_shader + 138) ++#define mc_filter_uv_b0 (rpi_shader + 264) ++#define mc_sync_q0 (rpi_shader + 454) ++#define mc_sync_q1 (rpi_shader + 472) ++#define mc_sync_q2 (rpi_shader 
+ 484)
++#define mc_sync_q3 (rpi_shader + 496)
++#define mc_sync_q4 (rpi_shader + 508)
++#define mc_sync_q5 (rpi_shader + 526)
++#define mc_sync_q6 (rpi_shader + 538)
++#define mc_sync_q7 (rpi_shader + 550)
++#define mc_sync_q8 (rpi_shader + 562)
++#define mc_sync_q9 (rpi_shader + 580)
++#define mc_sync_q10 (rpi_shader + 592)
++#define mc_sync_q11 (rpi_shader + 604)
++#define mc_exit (rpi_shader + 616)
++#define mc_exit_c (rpi_shader + 616)
++#define mc_interrupt_exit12 (rpi_shader + 630)
++#define mc_interrupt_exit12c (rpi_shader + 630)
++#define mc_setup_y_q0 (rpi_shader + 646)
++#define mc_setup_y_qn (rpi_shader + 648)
++#define mc_filter (rpi_shader + 884)
++#define mc_filter_b (rpi_shader + 1022)
++#define mc_filter_y_p00 (rpi_shader + 1160)
++#define mc_filter_y_b00 (rpi_shader + 1256)
++#define mc_end (rpi_shader + 1340)
+
+#endif
-diff --git b/libavcodec/rpi_shader.qasm a/libavcodec/rpi_shader.qasm
+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
new file mode 100644
-index 0000000..6fd6af5
+index 0000000..58fd911
--- /dev/null
-+++ a/libavcodec/rpi_shader.qasm
++++ b/libavcodec/rpi_shader.qasm
-@@ -0,0 +1,1150 @@
+@@ -0,0 +1,1349 @@
+
+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
+# the warning that we are using rotation & ra/rb registers. r0..3 can be
-+# rotated through all 16 elems ra regs can only be routated through their
++# rotated through all 16 elems; ra regs can only be rotated through their
+# local 4. As it happens this is what is wanted here as we do not want the
+# constants from the other half of the calc.
+
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# However in the current world there seems to be no benefit (and a small
++# overhead) in setting this bigger than 2.
++
++.set PREREAD, 2
++
++
+# register allocation
+#
-+# ra0...ra7 eight horizontal filter coefficients
-+#
-+# rb0 rx_shift2
-+# rb1 rb_y2_next
-+#
-+# rb4...rb7
-+#
-+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent)
-+#
-+# (ra15 isn't clamped to zero - this happens during the
-+# copy to ra14, and during its use in the vertical filter)
-+#
-+# rb8...rb11 eight vertical filter coefficients
+
-+# ra4 y: Fiter, UV: part -of b0 -> b stash
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++ ++# ra4-7 ++# C: L0 H filter out FIFO ++# otherwise -- free -- ++ ++# ra8-11 ++# temp in some places - check usage ++# Y: (with rb8-11) horiz out FIFO ++ ++# ra12-15 ++# -- free -- ++ ++# uniform: width:height ++.set ra_width_height, ra16 ++.set ra_width, ra16.16b ++.set ra_height, ra16.16a ++ ++# y:y2 same layout as y_y2_next so we can update both together ++.set ra_y_y2, ra17 ++.set ra_y2, ra17.16a ++.set ra_y, ra17.16b ++ ++# uniform: L1 weight (U on left, V on right) ++# Only used in Y B ++.set ra_wt_off_mul_l1, ra18 ++.set ra_wt_off_l1, ra18.16b ++.set ra_wt_mul_l1, ra18.16a ++ ++# y_next:y2_next same layout as y_y2 so we can update both together ++.set ra_y_y2_next, ra19 ++.set ra_y_next, ra19.16b ++.set ra_y2_next, ra19.16a ++ ++# Setup: consts - subdivide a single register ++.set ra_kff100100, ra20 ++.set ra_k256, ra20.16a ++.set ra_k0, ra20.8a ++.set ra_k1, ra20.8b ++.set ra_k16, ra20.8c ++.set ra_k255, ra20.8d ++ ++# Loop: xshifts ++.set ra_xshift, ra21.16a ++.set ra_xshift_next, ra21.16b ++ ++# Loop var: L0 weight (U on left, V on right) ++# _off_ is not used in loop as we want to modify it before use ++.set ra_wt_off_mul_l0, ra22 ++.set ra_wt_mul_l0, ra22.16a ++.set ra_wt_off_l0, ra22.16b ++ ++# -- free -- ra23 ++ ++# Loop: src frame base (L0) ++.set ra_base, ra24 ++ ++# Loop: src frame base (L1) ++.set ra_base2, ra25 ++ ++# Loop: next src frame base (L0) ++.set ra_base_next, ra26 ++ ++# -- free -- ra27 ++# -- free -- ra28 ++# -- free -- ra29 + -+# rb12 offset to add before shift (round + weighting offsets) -+# rb13 shift: denom + 6 + 9 -+# rb14 L0 weight (U on left, V on right) -+# rb15 -- free -- -+# -+# ra16 clipped(row start address+elem_num)&~3 -+# ra17 per-channel shifts -+# ra18 L1 weight (Y) -+# ra19 next ra17 -+# -+# rb16 pitch -+# rb17 height + 1 -+# rb18 height + 3 -+# rb19 next ra16 -+# -+# ra20 1 -+# ra21 ra_21 -+# ra22 ra_k256 256 -+# ra23 ra_y2_next ra_y2_next -+# -+# rb20 -- free -- -+# rb21 -- free -- -+# rb22 rb_k255 255 -+# rb23 -- free -- -+# -+# rb24 vdw_setup_1(dst_pitch) -+# rb25 frame width-1 -+# rb26 height<<23 + width<<16 + vdw_setup_0 -+# rb27 vdw_setup_0 (depends on QPU number) -+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM -+# rb29 vdw_setup_1(dst_pitch-width) -+# rb30 frame height-1 -+# rb31 used as temp to count loop iterations -+# -+# ra24 clipped(row start address+8+elem_num)&~3 -+# ra25 per-channel shifts 2 -+# ra26 next ra24 -+# ra27 next ra25 -+# ra28 next y -+# ra29 y for next texture access -+# +# Use an even numbered register as a link register to avoid corrupting flags -+# ra30 next kernel address -+# ra31 chroma-B height+3; free otherwise -+ -+.set rb_frame_width_minus_1, rb25 -+.set rb_frame_height_minus_1, rb30 -+.set rb_pitch, rb16 -+.set ra_x, ra16 -+.set ra_y2, ra21.16a -+.set ra_y2_next, ra21.16b -+ -+.set rb_x_next, rb19 -+.set rx_frame_base2_next, rb19 -+ -+.set ra_frame_base, ra24 -+.set ra_frame_base_next, ra26 -+.set ra_xshift, ra17 -+ -+.set ra_u2v_ref_offset, ra25 -+.set ra_frame_base2, ra25 -+ -+.set ra_xshift_next, ra19 -+.set rx_xshift2, rb0 -+.set rx_xshift2_next, rb1 -+ -+.set ra_u2v_dst_offset, ra27 -+ -+.set ra_y_next, ra28 -+.set ra_y, ra29 -+ -+.set ra_k1, ra20 -+.set rb_k255, rb22 -+.set ra_k256, ra22 -+ +.set ra_link, ra30 + ++# -- free -- ra31 ++ ++.set rb_xshift2, rb0 ++.set rb_xshift2_next, rb1 ++ ++# C: (elem & 1) == 0 ? 
elem * 2 : (elem + 4) * 2 ++.set rb_elem_x, rb2 ++ ++# rb3 ++# C: Temp (U/V flag) ++# Y: free ++ ++# rb4-7 ++# C-B: L1 H filter out FIFO ++# Y: (with ra2.8x) Y vertical filter coeffs ++ ++# rb8-11 ++# C: Vertical filter coeffs ++# Y: (with ra8-11) horiz out FIFO ++ ++# Loop var: offset to add before shift (round + weighting offsets) ++# Exact value varies by loop ++.set rb_wt_off, rb12 ++ ++# Setup: denom + 6 + 9 ++.set rb_wt_den_p15, rb13 ++ ++# -- free -- rb14 ++# -- free -- rb15 ++ ++# Line pitch (128 for sand128) ++.set rb_pitch, rb16 ++ ++# Loop count - 2 (set up TMU for next xfer) ++.set rb_i_tmu, rb17 ++ ++# Loop count for min(height, 16) ++# Y will reset & loop again if height > 16 ++.set rb_lcount, rb18 ++ ++# frame_base2_next ++.set rb_base2_next, rb19 ++ ++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give ++# offset to the slice ++.set rb_xpitch, rb20 ++ ++# -- free -- rb21 ++ ++# Setup: 255 ++.set rb_k255, rb22 ++ ++# Loop: destination address ++.set rb_dest, rb23 ++ ++# vdw_setup_1(dst_pitch) ++.set rb_dma1_base, rb24 ++ ++# Setup: pic width - 1 ++# In the case of chroma it is in bytes so 2 * (pic_width_c - 1) ++.set rb_max_x, rb25 ++ ++# Loop: height<<23 + width<<16 + vdw_setup_0 ++.set rb_dma0, rb26 ++ ++# vdw_setup_0 (depends on QPU number) ++.set rb_dma0_base, rb27 ++ ++# Setup: vw_setup value to reset VPM write pointer ++.set rb_vpm_init, rb28 ++ ++# Loop: vdw_setup_1(dst_pitch-width) = stride ++.set rb_dma1, rb29 ++ ++# Setup: pic_height - 1 ++.set rb_max_y, rb30 ++ ++# -- free -- rb31 ++ ++ ++ ++ +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. +.set i_shift16, -16 +.set i_shift21, -11 ++.set i_shift23, -9 +.set i_shift30, -2 + +# Much of the setup code is common between Y & C @@ -13596,714 +17120,719 @@ index 0000000..6fd6af5 + add r_dma, r0, r1 # DMA out +.endm + ++.macro m_setup_q0 ++ srel -, 12 ++.endm ++ ++# Code start label ++::mc_start + +################################################################################ -+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) -+::mc_setup_uv -+ mov tmurs, 1 ; mov ra_link, unif # No swap TMUs ; Next fn ++# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ mov tmurs, 1 # No swap TMUs + +# Load first request location -+mov ra0, unif -+mov r0, elem_num ++ mov ra0, unif # next_x_y + -+add ra_x, ra0.16b, r0 # Store x -+mov ra_y, ra0.16a # Store y -+mov ra_frame_base, unif # Store frame u base -+mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_frame_base -+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame ++ mov ra_base, unif # Store frame c base + +# Read image dimensions -+sub rb25,unif,1 -+sub rb30,unif,1 -+ -+# get source pitch -+mov rb16, unif -+ -+# get destination vdw setup -+add rb24, r1, unif # dst_stride ++ sub r0, unif, 1 # pic c width ++ add rb_max_x, r0, r0 ++ sub rb_max_y, unif, 1 # pic c height + +# load constants -+ mov ra_k1, 1 -+ mov ra_k256, 256 ++ mov ra_kff100100, 0xff100100 + mov rb_k255, 255 + -+# touch registers to keep simulator happy + ++ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base ++ ++# touch registers to keep simulator happy ++# ; ra12..15: vertical scroll registers ++# get source pitch ++ mov rb_xpitch, unif ; mov ra12, 0 # stride2 ++ mov rb_pitch, unif ; mov 
ra13, 0 # stride1 ++ nop ; mov ra14, 0 ++# get destination vdw setup ++ add rb_dma1_base, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1 ++ ++ and r0, 1, elem_num ++ nop ; mul24 r0, r0, 5 ++ add rb_elem_x, r0, elem_num ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base ++ ++ add r0, ra0.16b, ra0.16b # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice ++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y ++ min r0, r0, rb_max_x ++ ++# Get shift ++ shl ra_xshift_next, r0, 3 ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++ and r0, r0, -4 ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ++ add ra_base, ra_base, r0 ++ ++ add rb_wt_den_p15, 9, unif # denominator ++ ++# Compute part of VPM to use for DMA output ++ m_calc_dma_regs rb_vpm_init, rb_dma0_base ++ ++# And again for L1, but only worrying about frame2 stuff ++ ++# Load first request location ++ mov ra0, unif # next_x_y ++ ++ mov ra_base2, unif # [ra0 delay] Store frame c base ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base ++ ++ add r0, ra0.16b, ra0.16b # Load x ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ ++# Get shift ++ shl rb_xshift2_next, r0, 3 ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++ and r0, r0, -4 ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r2, ra_y2 ++ add ra_base2, ra_base2, r0 ++ ++# Do preloads ++# r0 = ra_y, r2 = ra_y2 ++ mov r3, PREREAD ; mov r0, ra_y ++ ++:c_preload ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:c_preload ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz c_preload ++ ++ mov ra_link, unif # link ++# touch registers to keep simulator happy + # ra/b4..7: B0 -> B stash registers + mov ra4, 0 ; mov rb4, 0 ++ bra -, ra_link + mov ra5, 0 ; mov rb5, 0 + mov ra6, 0 ; mov rb6, 0 + mov ra7, 0 ; mov rb7, 0 -+ -+ # ra12..15: vertical scroll registers -+ mov ra12, 0 -+ mov ra13, 0 -+ mov ra14, 0 -+ mov ra15, 0 -+ -+ # ra9 - delayed setup - must be 0 initially -+ mov ra9, 0 -+ -+# Compute base address for first and second access -+mov r0, ra_x # Load x -+max r0, r0, 0 ; mov r1, ra_y # Load y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base -+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset -+add ra_y, r1, 1 -+add r0, r0, r3 -+and r0, r0, ~3 -+max r1, r1, 0 ; mov ra_x, r0 # y -+min r1, r1, rb_frame_height_minus_1 -+# submit texture requests for first line -+add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+add t0s, r0, r1 ; mov ra_frame_base, r2 -+add t1s, r2, r1 -+ -+add rb13, 9, unif # denominator -+mov -, unif # Unused -+ -+mov -, unif # ??? 
same as (register) qpu_num -+ -+# Compute part of VPM to use for DMA output -+m_calc_dma_regs rb28, rb27 -+ -+# submit texture requests for second line -+max r1, ra_y, 0 -+min r1, r1, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 -+bra -, ra_link -+nop ; mul24 r1, r1, rb_pitch -+add t0s, r1, ra_x -+add t1s, r1, ra_frame_base -+ -+ ++# >>> ra_link + +################################################################################ + -+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) ++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv -+mov ra_link, unif -+ +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+mov ra2, unif # x_y -+mov r0, elem_num ; mov r3, unif # frame_base ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+add r0, ra2.16b, r0 # x -+max r0, r0, 0 -+min r0, r0, rb_frame_width_minus_1 -+# compute offset from frame base u to frame base v -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs -+mov ra_y_next, ra2.16a ; mov vw_setup, rb28 ++ and.setf -, elem_num, 1 # [ra2 delay] + -+add ra_frame_base_next, rb_x_next, r2 ++ add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 # x ; r1=0 ++ add r0, r0, rb_elem_x ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ ++ shl ra_xshift_next, r0, 3 ++ ++ and r0, r0, -4 ; mov ra0, unif # H filter coeffs ++ nop ; mov ra_y_next, ra2.16a ++ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
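++# (A rough note on the sand address split done here and in the xor/add just
++# below, following the rb_pitch/rb_xpitch register notes above: r1 keeps the
++# x & ~(rb_pitch-1) stripe bits, the xor leaves x % rb_pitch within the
++# stripe, and r1 * rb_xpitch turns the stripe bits into the byte offset of
++# that slice - "(x&mask)*xpitch" as described for rb_xpitch.)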
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ++ shl r0, r1, 7 + +# set up VPM write -+# get width,height of block + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs ++ add rb_i_tmu, r1, 3 - PREREAD ; mov ra_wt_off_mul_l0, unif # ; U offset/weight ++ add rb_lcount, r1, 3 ; mov.ifnz ra_wt_off_mul_l0, unif # ; V offset/weight + -+ mov.setf -, ra9 ; mov -, vw_wait -+ brr.anyz -, r:filter_uv_1 ++# ; unpack filter coefficients + -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs -+# >>> (skip V DMA if never requested) ++ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, i_shift16 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + -+ sub vw_setup, ra9, -16 -+ mov vw_setup, ra10 -+ mov vw_addr, ra11 -+:filter_uv_1 ++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ shl r1, r1, rb_wt_den_p15 ; mov rb10, ra3.8c ++ mov r5quad, 0 ; mov rb11, ra3.8d # Loop count (r5rep is B, r5quad is A) + -+# unpack filter coefficients ++ asr rb_wt_off, r1, 1 ; mov ra_link, unif # Link ++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 # weight*2 + -+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight -+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight -+nop ; mov rb10, ra3.8c -+mov r3, 0 ; mov rb11, ra3.8d # Loop count ++# ra9 alias for rb_max_y ++# ra_wt_mul_l0 - weight L0 * 2 ++# rb_wt_den_p15 = weight denom + 6 + 9 ++# rb_wt_off = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) + -+shl r1, ra1.16b, rb13 -+asr rb12, r1, 1 -+shl rb14, ra1.16a, 1 # b14 = weight*2 -+ -+# rb14 - weight L0 * 2 -+# rb13 = weight denom + 6 + 9 -+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) -+ -+# r2 is elem_num +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# r3 = 0 ++# We want (r0r1) ++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... ++# We fetch (after shift) ++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
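++#
++# A rough sketch of the per-pel arithmetic that falls out of the weight
++# constants above (assuming 8-bit samples; pel, v_filt, wt and offset are
++# just shorthand for the values held in the loop registers). The mul24 by
++# ra_k256 followed by asr 14 in the loop below is a net >>6, dropping the
++# 6-bit filter gain:
++#   pel = v_filt >> 6
++#   vpm = sat8((((pel * wt*2) << 8) + rb_wt_off) >> rb_wt_den_p15)
++#       ~= ((pel * wt) >> (denom + 6)) + offset
++# i.e. roughly the standard HEVC weighted prediction with log2WD = denom + 6.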
++ ++ mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ++ ++# r5 = 0 (loop counter) +:uvloop +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++ shr r1, r2, 8 ; mov.ifnz r3, ra_y ++ add r0, r3, 1 ; mov.ifz ra_base, ra_base_next + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+add t1s, ra_frame_base, r2 ++ and.setf -, 1, elem_num ; mov ra_y, r0 ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 + -+# generate seven shifted versions -+# interleave with scroll of vertical context ++ mov.ifz r0, r2 ; mul24 r2, r3, rb_pitch ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++# ra4 not really needed; this could be a mul24 rather than a mov but current ++# register usage means this wouldn't help ++ mov.setf -, rb3 ; mov ra4, ra5 + +# apply horizontal filter -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop -+mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++# The filter coeffs for the two halves of this are the same (unlike in the ++# Y case) so it doesn't matter which ra0 we get them from ++# Also as the two halves are locked together we don't need to separate the 1st ++# r0 mul or the last r1 mul as they are vaild for all QPUs ++ ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d , r1 ++ brr.anyn -, r:uvloop ++ add r2, r2, r3 ; mov ra5, ra6 ++# V filter =- ra4 * rb8-+ ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) ++ mov ra6, ra7 ; mul24 r1, ra7, rb10 ++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8 +# >>> .anyn uvloop + -+# apply vertical filter and write to VPM ++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay] ++ add r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ shl r1, r1, 8 + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+sub r1, r1, r0 ; mov -, vw_wait -+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+asr r1, r1, 14 -+nop ; mul24 r1, r1, rb14 -+shl r1, r1, 8 -+ -+add r1, r1, rb12 -+brr.anyn -, r:uvloop -+asr 
r1, r1, rb13 -+min r1, r1, rb_k255 # Delay 2 -+max vpm, r1, 0 # Delay 3 -+# >>> ++ add r1, r1, rb_wt_off ++ brr.anyn -, r:uvloop ++ asr ra1.8as, r1, rb_wt_den_p15 ++ mov -, vw_wait ++ mov vpm, ra1.8a ++# >>> .anyn uvloop + +# DMA out for U & stash for V -+ mov vw_setup, rb26 ; mov ra9, rb26 # VDW setup 0 + bra -, ra_link -+ mov vw_setup, rb29 ; mov ra10, rb29 # Stride -+ mov vw_addr, unif # u_dst_addr -+ mov ra11, unif # v_dst_addr -+# >>> ++ mov vw_setup, rb_dma0 ++ mov vw_setup, rb_dma1 ++ mov vw_addr, rb_dest # u_dst_addr ++# >>> ra_link + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv_b0 -+mov -, unif # Ignore chain address - always "b" -+ +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+mov ra2, unif # x_y -+mov r0, elem_num ; mov r3, unif # frame_base ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+add r0, ra2.16b, r0 # x -+max r0, r0, 0 -+min r0, r0, rb_frame_width_minus_1 -+# compute offset from frame base u to frame base v -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs -+mov ra_y_next, ra2.16a ++ and.setf -, elem_num, 1 # Also acts as delay slot for ra2 + -+add ra_frame_base_next, rb_x_next, r2 ++ add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 # x ; r1=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height + -+# Need to have unsigned coeffs to so we can just unpack in the filter -+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the -+# filter code. 
Unpack into b regs for V ++ shl ra_xshift_next, r0, 3 + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add ra31, ra1.16a, 3 -+shl r0, ra1.16a, 7 -+add r0, r0, ra1.16b ; mov ra3, unif # Combine width and height of destination area ; V filter coeffs -+shl r0, r0, i_shift16 ; mov rb14, unif # U weight L0 -+add rb26, r0, rb27 -+ -+mov rb8, ra3.8a -+mov rb9, ra3.8b -+mov rb10, ra3.8c -+mov rb11, ra3.8d -+ -+# r2 is elem_num -+# r3 is loop counter -+ -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter -+ -+# rb14 unused in b0 but will hang around till the second pass -+ -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# r3 = 0 -+:uvloop_b0 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+ shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+ add t1s, ra_frame_base, r2 -+ -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop_b0 -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 -+ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop_b0 -+ -+# apply vertical filter and write to B-FIFO -+ -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. 
ra15 write gap -+ sub r1, r1, r0 ; mov ra7, rb6 -+ -+# FIFO goes: -+# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b -+# This arrangement optimizes the inner loop FIFOs at the expense of making the -+# bulk shift between loops quite a bit nastier -+# a8 used as temp -+ -+ sub.setf -, r3, ra31 -+ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad -+ brr.anyn -, r:uvloop_b0 -+ mov ra5, rb4 ; mov rb4, ra4 -+ mov ra4, rb5 ; mov rb5, ra6 -+ mov ra6, rb7 ; mov rb7, ra8 -+# >>> -+ -+# 1st half done all results now in the a/b4..7 fifo -+ -+# Need to bulk rotate FIFO for heights other than 16 -+# plausible heights are 16, 12, 8, 6, 4, 3, 2 and that is all we deal with -+# we are allowed 3/4 cb_size w/h :-( -+ -+# Destination uniforms discarded -+# At the end drop through to _b - we will always do b after b0 -+ -+ sub.setf -, 15, r3 # 12 + 3 of preroll -+ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) -+ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr -+ mov r0, i_shift16 ; mov -, unif # ; Discard v_dst_addr -+ mov r1, 0x10000 -+# >>> -+ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially -+# If h != 16 && h != 12 then h <= 8 so -+# shift 8 with discard (.16b = .16a on all regs) -+ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+# >>> -+ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+ -+ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N -+# Shift 4 -+ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+ # If we shifted by 4 here then the max length remaining is 4 -+ # so that is it -+ -+ brr -, r:uv_b0_post_fin -+# Shift 2 -+ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+ # 6 / 2 so need 6 outputs -+# >>> -+ -+:uv_b0_post12 -+# this one is annoying as we need to swap halves of things that don't -+# really want to be swapped -+ -+# b7a, a6a, b5a, a4a -+# b4a, a5a, b6a, a7a -+# b7b, a6b, b5b, a4b -+# b4b, a5b, b6b, a7b -+ -+ mov r2, ra4 ; mov r3, rb5 -+ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+ mov ra7, r2 ; mov rb6, r3 -+ -+ mov r2, ra6 ; mov r3, rb7 -+ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+ mov ra5, r2 ; mov rb4, r3 -+ -+:uv_b0_post_fin -+ # drop through -+ -+################################################################################ -+ -+::mc_filter_uv_b -+ -+ mov ra_link, unif -+ mov.setf -, ra9 ; mov -, vw_wait # Delayed V DMA -+ brr.anyz -, r:uv_filter_b_1 -+ -+ mov ra0, unif ; mov r0, elem_num -+ -+# per-channel shifts were calculated on the *previous* invocation ++ and r0, r0, -4 ; mov ra0, unif # L0 H filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ++ shl r0, r1, 7 ; mov ra2, unif # ; L0 V filter coeffs + +# set up VPM write -+mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 + -+# get base addresses and per-channel shifts for *next* invocation -+add r0, ra0.16b, r0 # x -+# >>> -+ sub vw_setup, ra9, -16 -+ mov vw_setup, ra10 -+ mov vw_addr, ra11 -+:uv_filter_b_1 ++ sub rb_dma1, rb_dma1_base, r2 # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r1, 3 - PREREAD ++ add rb_lcount, r1, 3 + -+max r0, r0, 0 ; mov ra_y_next, ra0.16a # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base -+# compute 
offset from frame base u to frame base v -+sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base -+add r0, r0, r3 ; mov -, unif # discard width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs ++ add r0, r0, r2 ; mov ra_wt_mul_l0, unif # ; U weight ++ shl r0, r0, ra_k16 ; mov.ifnz ra_wt_mul_l0, unif # Shift into bits 16 upwards of the vdw_setup0 register ; V weight ++ add rb_dma0, r0, rb_dma0_base ; mov ra3, unif # ; x2_y2 + -+# rb17, rb26, rb29, ra31 inherited from B0 as w/h must be the same ++# L1 - uniform layout could possibly be optimized + -+mov ra3, unif # V filter coeffs ++ mov ra9, rb_max_y # [ra3 delay] + -+# get filter coefficients ++ add r0, ra3.16b, ra3.16b ; v8subs r1, r1, r1 # r0=x*2 ; r1=0 ++ add r0, r0, rb_elem_x ; mov ra_y2_next, ra3.16a ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++ min r0, r0, rb_max_x ; mov ra1, unif # H filter coeffs + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ shl rb_xshift2_next, r0, 3 + -+# Get offset & weight stuff ++ and r0, r0, -4 ++ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add rb_base2_next, r3, r0 + -+# The unif read occurs unconditionally, only the write is conditional -+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ; -+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ; -+add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c -+mov r3, 0 ; mov rb11, ra3.8d # Loop counter ; ++ mov ra_wt_off_mul_l1, unif ; mov rb9, ra3.8b # U offset/weight ++ mov.ifnz ra_wt_off_mul_l1, unif ; mov rb10, ra3.8c # V offset/weight + -+shl r1, ra1.16b, rb13 -+asr rb12, r1, 1 ++ mov rb_dest, unif # dst_addr ++ mov r5quad,0 ; mov rb11, ra3.8d ++ shl r1, ra_wt_off_l1, rb_wt_den_p15 ++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link + -+# ra1.16a used directly in the loop ++# r5 loop counter ++# ra0 H coeffs L0 ++# ra1 H coeffs L1 ++# ra2 V coeffs L0 ++# ra3 temp ++# ra4-7 L0 H FIFO ++# rb4-7 L1 H FIFO ++# rb8-rb11 V coeffs L1 ++# ra9 rb_max_y alias + -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++ mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] + -+# r3 = 0 +:uvloop_b +# retrieve texture results and pick out bytes +# then submit two more texture requests ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++ shr r1, r2, 8 ; mov.ifz ra_y_y2, ra_y_y2_next ++ mov rb4, rb5 ; mov.ifz ra_base, ra_base_next ++ add ra_y, 1, ra_y ; mov r3, ra_y + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ and.setf -, 1, elem_num ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+add t1s, ra_frame_base, r2 ++ mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch ++ add t0s, ra_base, r3 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of 
vertical context ++# L0 H-filter ++# H FIFO scrolls are spread all over this loop ++ mov.setf -, rb3 ; mov ra4, ra5 + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 + -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop_b -+mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 ; mul24 r2, ra15, rb10 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++ shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++ shr r1, r2, 8 ; mov r3, ra_y2 ++ add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++ ++ and.setf -, 1, elem_num ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++ ++ mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch ++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ ++# L1 H-filter ++ mov.setf -, rb3 ; mov rb7, ra3 ++ ++ and r1, r1, rb_k255 ; mul24 r3, ra1.8a, r0 ++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++ brr.anyn -, r:uvloop_b ++# V filters - start in branch delay slots of H ++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra6, ra7 ; mul24 r3, ra7, rb10 ++ sub ra7, r2, r0 ; mul24 r0, rb4, ra2.8a ++# >>> .anyn uvloop_b0 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++ sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++ add r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++ ++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++ ++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) ++ add r1, r1, r2 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ ++ brr.anyn -, r:uvloop_b ++ asr ra3.8as, r1, rb_wt_den_p15 ++ mov -, vw_wait ++ mov vpm, ra3.8a +# >>> .anyn uvloop_b + -+# apply vertical filter and write to VPM -+ -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 -+ sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 -+ mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 -+ -+ mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+ add r1, r1, r0 ; mov rb4, ra4 -+ -+ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend -+ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) -+ -+ sub.setf -, r3, ra31 ; mov ra6, rb7 -+ brr.anyn -, r:uvloop_b -+ asr ra3.8as, r1, rb13 -+ mov -, vw_wait ; mov rb7, ra8 # 
vw_wait is B-reg (annoyingly) ; Final FIFO mov -+ mov vpm, ra3.8a -+# >>> -+ -+# DMA out for U & stash for V -+ -+ mov vw_setup, rb26 ; mov ra9, rb26 # VDW setup 0 ++# DMA out + bra -, ra_link -+ mov vw_setup, rb29 ; mov ra10, rb29 # Stride -+ mov vw_addr, unif # u_dst_addr -+ mov ra11, unif # v_dst_addr -+ -+ ++ mov vw_setup, rb_dma0 ++ mov vw_setup, rb_dma1 ++ mov vw_addr, rb_dest ++# >>> ra_link + +################################################################################ ++# Exit code used by both Luma & Chroma so place between them to avoid I-cache ++# conflicts ++ ++.macro m_exit_drain ++.if PREREAD == 2 ++# Special case 2 as loop is wasteful ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ nop ; nop ; ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 ++.else ++ mov.setf r3, PREREAD - 1 ++:1 ++ brr.anynz -, r:1b ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ sub.setf r3, r3, 1 ++ # >>> ++ mov -, vw_wait ++.endif ++.endm ++ ++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) ++# All qpus start at the beginning and after that (group - 1) must have finished ++# before (group) can start ++# ++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain ++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - ++# lockup otherwise) ++# ++# There is some, currently ill defined, potential lockup if we have the VDM active ++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? ++# ++# The code stalled when I had many waiters on a single sem so we have a ++# "ripple" of srels to restart. Unsure why, may have been bug, but this works ++# and we currently have both the memory & sems to support it. ++.macro m_sync_q, n_qpu ++ mov ra_link, unif ++ mov -, vw_wait ++ ++.set n_sem_sync, n_qpu - (n_qpu % 4) ++.set n_sem_in, n_qpu ++.set n_sem_out, n_qpu + 1 ++ ++.if n_qpu % 4 == 0 ++ ++.set n_sem_quad_in, 12 + n_qpu / 4 ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % 3) ++ ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ bra -, ra_link ++ sacq -, n_sem_quad_in ++ srel -, n_sem_out ++ srel -, n_sem_quad_out ++ ++.else ++ bra -, ra_link ++ srel -, n_sem_sync ++ sacq -, n_sem_in ++.if n_sem_out % 4 != 0 ++ srel -, n_sem_out ++.else ++ nop ++.endif ++.endif ++.endm ++ ++::mc_sync_q0 ++ m_sync_q 0 ++::mc_sync_q1 ++ m_sync_q 1 ++::mc_sync_q2 ++ m_sync_q 2 ++::mc_sync_q3 ++ m_sync_q 3 ++::mc_sync_q4 ++ m_sync_q 4 ++::mc_sync_q5 ++ m_sync_q 5 ++::mc_sync_q6 ++ m_sync_q 6 ++::mc_sync_q7 ++ m_sync_q 7 ++::mc_sync_q8 ++ m_sync_q 8 ++::mc_sync_q9 ++ m_sync_q 9 ++::mc_sync_q10 ++ m_sync_q 10 ++::mc_sync_q11 ++ m_sync_q 11 + +# mc_exit() -+ ++# Chroma & Luma the same now +::mc_exit_c -+ mov.setf -, ra9 ; mov -, vw_wait -+# Annoyingly it looks iike condition codes don't work on writes to special -+# registers so we have to branch around the writes -+ brr.anyz -, r:exit_c_1 -+ nop -+ nop -+ nop -+# >>> -+ -+ sub vw_setup, ra9, -16 -+ mov vw_setup, ra10 -+ mov vw_addr, ra11 -+ nop -+:exit_c_1 -+ +::mc_exit -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW -+ -+ mov -,srel(0) -+ -+ nop ; nop ; thrend -+ nop ; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+# mc_interrupt_exit8() -+#::mc_interrupt_exit8 -+#mov -, vw_wait # wait on the VDW -+# -+#ldtmu0 -+#ldtmu1 -+#ldtmu0 -+#ldtmu1 -+# -+#mov -,sacq(0) # 1 -+#mov -,sacq(0) # 2 -+#mov -,sacq(0) # 3 -+#mov -,sacq(0) # 4 -+#mov -,sacq(0) # 5 -+#mov -,sacq(0) # 6 -+#mov -,sacq(0) # 7 -+# -+#nop ; nop ; thrend -+#mov interrupt, 
1; nop # delay slot 1 -+#nop ; nop # delay slot 2 -+# -+ -+ ++ m_exit_drain ++ nop ; nop ; thrend ++ nop ++ nop + ++# mc_interrupt_exit12() ++::mc_interrupt_exit12c ++::mc_interrupt_exit12 ++ m_exit_drain ++ sacq -, 12 ++ nop ; nop ; thrend ++ mov interrupt, 1 ++ nop ++# >>> thrend <<< + +# LUMA CODE + +# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. +# For P frames we make the second x,y coordinates offset by +8 + ++ +################################################################################ -+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) -+::mc_setup ++# mc_setup ++# ++# typedef struct qpu_mc_pred_y_s_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t pic_h; ++# uint16_t pic_w; ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_s_t; ++ ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x -+ mov ra9, unif # ref_y_base -+ mov ra10, unif # y2_x2 -+ mov ra11, unif # ref_y2_base -+ -+# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_pitch, unif # src_pitch [ra3 delay] -+ sub rb_frame_width_minus_1, ra3.16b, 1 -+ sub rb_frame_height_minus_1, ra3.16a, 1 -+ -+# get destination pitch -+ mov r1, vdw_setup_1(0) -+ or rb24, r1, unif # dst_pitch -+ -+# Compute base address for first and second access -+ mov r3, elem_num -+ add r0, ra8.16a, r3 # Load x + elem_num -+ max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ add ra_y, ra8.16b, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, ra9, r0 # ra9 is address for frame0 (not including y offset) -+ max r1, ra8.16b, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t0s, r2, r1 ; mov ra_frame_base, r2 -+ -+ # r3 still contains elem_num -+ add r0, ra10.16a, r3 # Load x -+ max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 -+ shl rx_xshift2_next, r0, 3 # Compute shifts -+ add ra_y2, ra10.16b, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, ra11, r0 # r2 is address for frame1 (not including y offset) -+ max r1, ra10.16b, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t1s, r2, r1 ; mov ra_frame_base2, r2 ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ mov ra11, unif # ref_y2_base + +# load constants + -+ mov ra_k1, 1 -+ mov ra_k256, 256 ++ mov ra_kff100100, 0xff100100 + mov rb_k255, 255 + -+# touch vertical context to keep simulator happy ++# Compute part of VPM to use + ++# Read image dimensions ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++ sub rb_max_x, ra3.16b, 1 ++ sub rb_max_y, ra3.16a, 1 ++ mov rb_pitch, unif # stride1 ++ ++# get destination pitch ++ mov r1, vdw_setup_1(0) ++ or rb_dma1_base, r1, rb_pitch ++ ++# Compute base address for first and second access ++ mov r3, elem_num ++ add r0, ra0.16b, r3 # Load x + elem_num ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ ++# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++ ++ and r0, r0, -4 ; 
v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base, ra9, r0 ++ ++ # r3 still contains elem_num ++ add r0, ra1.16b, r3 # Load x ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ ++ # r2 still contains mask ++ and r0, r0, -4 ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base2, ra11, r0 ++ ++# Do preloads ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ ++:y_preload ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:y_preload ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz y_preload ++ ++ add rb_wt_den_p15, unif, 9 # weight denom + 6 ++ ++ m_calc_dma_regs rb_vpm_init, rb_dma0_base ++ ++ mov ra_link, unif # Next fn ++ ++# touch vertical context to keep simulator happy + mov ra8, 0 ; mov rb8, 0 ++ bra -, ra_link + mov ra9, 0 ; mov rb9, 0 + mov ra10, 0 ; mov rb10, 0 + mov ra11, 0 ; mov rb11, 0 ++# >>> ra_link + -+# Compute part of VPM to use -+ m_calc_dma_regs rb28, rb27 -+ -+# Weighted prediction denom -+ add rb13, unif, 9 # unif = weight denom + 6 -+ -+# submit texture requests for second line -+ max r1, ra_y, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ add ra_y, ra_y, 1 -+ mov -, unif ; mul24 r1, r1, rb_pitch # unused ; -+ add t0s, r1, ra_frame_base -+ -+ max r1, ra_y2, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ add ra_y2, ra_y2, 1 -+ nop ; mul24 r1, r1, rb_pitch -+ add t1s, r1, ra_frame_base2 -+ -+# FALL THROUGHT TO PER-BLOCK SETUP -+ ++################################################################################ ++# +# Start of per-block setup code +# P and B blocks share the same setup code to save on Icache space -+:per_block_setup -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ mov ra_link, unif + -+ mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack?? 
-+ -+# per-channel shifts were calculated on the *previous* invocation -+ mov ra_xshift, ra_xshift_next -+ mov rx_xshift2, rx_xshift2_next ++# luma_setup_delay3 done in delay slots of branch that got us here + +# get base addresses and per-channel shifts for *next* invocation ++# per-channel shifts were calculated on the *previous* invocation + -+ add r0, ra1.16a, r1 # Load x -+ max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ mov ra_y_next, ra1.16b -+ and r0, r0, ~3 ; mov ra1, unif # y2_x2 -+ add ra_frame_base_next, r2, r0 ++# 1st 3 instructions of per_block-setup in branch delay ++# ++# typedef struct qpu_mc_pred_y_p_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t h; ++# uint16_t w; ++# uint32_t mymx21; ++# uint32_t wo1; ++# uint32_t wo2; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p_t; ++# + -+ add r0, ra1.16a, r1 # Load x -+ max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base -+ shl rx_xshift2_next, r0, 3 # Compute shifts -+ mov ra_y2_next, ra1.16b -+ and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate -+ add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset) ++.macro luma_setup ++ brr ra_link, r:per_block_setup ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? ++ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] # [ra0 delay] ++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++.endm + -+# set up VPM write -+ mov vw_setup, rb28 ++:per_block_setup ++ max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y ++ add ra_base_next, ra_base_next, r0 # [ra1 delay] ++ ++ add r0, ra1.16b, r3 # Load x2 ++ max r0, r0, 0 ; mov ra_y2_next, ra1.16a ++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov vw_setup, rb_vpm_init # Add stripe offsets ; set up VPM write ++ add rb_base2_next, rb_base2_next, r0 + +# get width,height of block (unif load above) -+ sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+ add rb17, ra1.16a, 5 -+ add rb18, ra1.16a, 7 -+ shl r0, ra1.16a, 7 -+ add r0, r0, ra1.16b # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets ++ sub rb_dma1, rb_dma1_base, ra_width # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, ra_height, 7 - PREREAD ; mov r0, ra_height ++ min r0, r0, ra_k16 ++ add rb_lcount, r0, 7 ++ shl r0, r0, 7 ++ add r0, r0, ra_width # Combine width and height of destination area ++ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight -+ mov r2, 0x01040400 # [ra5 delay] -+ shl ra8, r0, 3 ; mov 
rb14, ra5.16a ++ shl.ifz r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 + +# Pack the 1st 4 filter coefs for H & V tightly ++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) + -+ mov r1,0x00010100 # -ve ++ mov r1,0x00010100 # -ve [ra8 delay] + ror ra2.8a, r1, ra8.8d + ror ra0.8a, r1, ra8.8c + -+ ror ra2.8b, r2, ra8.8d -+ ror ra0.8b, r2, ra8.8c ++ mov r1, 0x01040400 ++ ror ra2.8b, r1, ra8.8d ++ ror ra0.8b, r1, ra8.8c + + mov r1,0x050b0a00 # -ve + ror ra2.8c, r1, ra8.8d @@ -14313,87 +17842,73 @@ index 0000000..6fd6af5 + ror ra2.8d, r1, ra8.8d + ror ra0.8d, r1, ra8.8c + -+# In the 2nd vertical half we use b registers due to -+# using a-side fifo regs. The easiest way to achieve this to pack it -+# and then unpack! ++# In the 2nd vertical half we use b registers due to using a-side fifo regs + + mov r1,0x3a281100 -+ ror ra3.8a, r1, ra8.8d -+ ror ra1.8a, r1, ra8.8c ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, rb_k255 + + mov r1,0x0a0b0500 # -ve -+ ror ra3.8b, r1, ra8.8d -+ ror ra1.8b, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, rb_k255 + + mov r1,0x04040100 -+ ror ra3.8c, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, rb_k255 ++ ++ mov.ifnz ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + mov r1,0x01010000 # -ve -+ ror ra3.8d, r1, ra8.8d -+ ror ra1.8d, r1, ra8.8c -+ -+# Extract weighted prediction information in parallel -+# We are annoyingly A src limited here -+ -+ mov rb4, ra3.8a ; mov ra18, unif -+ mov rb5, ra3.8b -+ mov rb6, ra3.8c -+ mov.ifnz ra5, ra18 -+ ++ ror r0, r1, ra8.8d + bra -, ra_link ++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, rb_k255 + -+ shl r0, ra5.16b, rb13 # Offset calc -+ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use -+ mov r3, 0 ; mov rb7, ra3.8d ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +# >>> branch ra_link -+# ++ +# r3 = 0 -+# ra18.16a = weight L1 -+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) -+# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) -+# rb13 = weight denom + 6 + 9 -+# rb14 = weight L0 ++# ra_wt_mul_l1 = weight L1 ++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) ++# rb_wt_off = (((is P) ? 
offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) ++# rb_wt_den_p15 = weight denom + 6 + 9 ++# rb_wt_mul_l0 = weight L0 + + +################################################################################ -+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) ++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + +::mc_filter -+# ra5.16a = weight << 16; We want weight * 2 in rb14 ++ luma_setup + -+ shl rb14, ra5.16a, 1 ++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + -+# r3 = 0 ++# r5 = 0 (loop count) + +:yloop +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? -+ +# N.B. Whilst y == y2 as far as this loop is concerned we will start +# the grab for the next block before we finish with this block and that +# might be B where y != y2 so we must do full processing on both y and y2 + -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14401,72 +17916,86 @@ index 0000000..6fd6af5 + mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz 
r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 ++ sub.setf -, r5, 8 ; mov r1, ra8 ++ mov ra8, ra9 ; mov rb8, rb9 + brr.anyn -, r:yloop -+ mov ra9, ra10 ; mov rb9, rb10 -+ mov ra10, ra11 ; mov rb10, rb11 -+ mov ra11, r0 ; mov rb11, r1 ++ mov ra9, ra10 ; mov rb9, rb10 ++ mov ra10, ra11 ; mov rb10, rb11 ++ sub ra11, r2, r3 ; mov rb11, r1 + # >>> .anyn yloop + + # apply vertical filter and write to VPM + -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov -, vw_wait ++ nop ; mul24 r0, rb8, ra2.8a ++ nop ; mul24 r1, rb9, ra2.8b ++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb4 ++ add r1, r1, r0 ; mul24 r0, ra9, rb5 ++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++ add r1, r1, r0 ; mul24 r0, ra11, rb7 ++ sub r1, r1, r0 +# At this point r1 is a 22-bit signed quantity: 8 (original sample), +# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) +# The top 8 bits have rubbish in them as mul24 is unsigned +# The low 6 bits need discard before weighting -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + asr r1, r1, 14 -+ nop ; mul24 r1, r1, rb14 -+ add r1, r1, rb12 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ add r1, r1, rb_wt_off + -+ shl r1, r1, 8 ++ shl r1, r1, 8 ; mov r0, ra_height + brr.anyn -, r:yloop -+ asr r1, r1, rb13 -+# We have a saturating pack unit - I can't help feeling it should be useful here -+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255 -+ max vpm, r1, 0 # Delay 3 ++ asr ra3.8as, r1, rb_wt_den_p15 ++ mov r1, ra_k16 ; mov -, vw_wait ++ sub r0, r0, r1 ; mov vpm, ra3.8a +# >>> branch.anyn yloop + ++# If looping again the we consumed 16 height last loop ++ # rb_dma1 (stride) remains constant ++ # rb_i_tmu remains const (based on total height) ++ # recalc rb_dma0, rb_lcount based on new segment height ++ # N.B. 
r5 is loop counter still
 +
++ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now
 +
 +# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0
++ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride
++ nop ; mov vw_addr, rb_dest # start the VDW
++# >>> .anyz ra_link
 +
-+ brr -, r:per_block_setup
-+ mov vw_setup, rb26 # VDW setup 0 Delay 1
-+ mov vw_setup, rb29 # Stride Delay 2
-+ mov vw_addr, unif # start the VDW Delay 3
-+
++ add rb_lcount, rb_lcount, r0
++ shl r0, r2, i_shift23
++ add rb_dma0, rb_dma0, r0
++ brr -, r:yloop
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> yloop
 +
 +
 +################################################################################
 +
-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
 +# In a P block, only the first half of coefficients contain used information.
 +# At this point we have already issued two pairs of texture requests for the current block
 +# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
 +# With that change we can do the frame base address calculation once rather than once per MC.
 +# Also update the transfer base addresses / offsets in the mc_filter code.
 +# With just y/x and base address we can reduce the per-command data to 6 32bit words
 +# From 19->7 32bits per command.
 +
 +::mc_filter_b
-+ # r0 = weightL0 << 16, we want it in rb14
-+# asr rb14, r0, i_shift16
++ luma_setup
 +
 +:yloopb
 +# retrieve texture results and pick out bytes
 +# then submit two more texture requests
 +
 +# If we knew there was no clipping then this code would get simpler.
 +# Perhaps we could add on the pitch and clip using larger values?
+ -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14509,129 +18035,372 @@ index 0000000..6fd6af5 + mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 ++ sub.setf -, r5, 8 ; mov r1, ra8 ++ mov ra8, ra9 ; mov rb8, rb9 + brr.anyn 
-, r:yloopb
++ mov ra9, ra10 ; mov rb9, rb10
++ mov ra10, ra11 ; mov rb10, rb11
++ sub ra11, r2, r3 ; mov rb11, r1
 + # >>> .anyn yloopb
 +
 + # apply vertical filter and write to VPM
++ nop ; mul24 r0, rb8, ra2.8a
++ nop ; mul24 r1, rb9, ra2.8b
++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb4
++ add r1, r1, r0 ; mul24 r0, ra9, rb5
++ sub r1, r1, r0 ; mul24 r0, ra10, rb6
++ add r1, r1, r0 ; mul24 r0, ra11, rb7
++ sub r1, r1, r0 ; mov r2, rb_wt_off
 +# As with P-pred r1 is a 22-bit signed quantity in 32-bits
 +# Top 8 bits are bad - low 6 bits should be discarded
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
 +
 + asr r1, r1, 14
++ nop ; mul24 r0, r1, ra_wt_mul_l0
++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
 +
++ add r1, r1, r0
++ shl r1, r1, 8 ; mov r0, ra_height
 + brr.anyn -, r:yloopb
++ asr ra3.8as, r1, rb_wt_den_p15
++ mov r1, ra_k16 ; mov -, vw_wait
++ sub r0, r0, r1 ; mov vpm, ra3.8a
++# >>> branch.anyn yloopb
++
++# If looping again then we consumed 16 height last loop
++ # rb_dma1 (stride) remains constant
++ # rb_i_tmu remains const (based on total height)
++ # recalc rb_dma0, rb_lcount based on new segment height
++ # N.B. r5 is loop counter still
++
++ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now
 +
 +# DMA out
-+ brr -, r:per_block_setup
-+ mov vw_setup, rb26 # VDW setup 0 Delay 1
-+ mov vw_setup, rb29 # Stride Delay 2
-+ mov vw_addr, unif # start the VDW Delay 3
++ bra.anyz -, ra_link
++ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0
++ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride
++ nop ; mov vw_addr, rb_dest # start the VDW
++# >>> .anyz ra_link
++
++ add rb_lcount, rb_lcount, r0
++ shl r0, r2, i_shift23
++ add rb_dma0, rb_dma0, r0
++ brr -, r:yloopb
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> yloopb
 +
 +################################################################################
-+::mc_interrupt_exit12c
-+ mov.setf -, ra9 ; mov -, vw_wait
-+ brr.anyz -, r:exit12_c_1
-+ nop
-+ nop
-+ nop
-+# >>>
-+
-+ sub vw_setup, ra9, -16
-+ mov vw_setup, ra10
-+ mov vw_addr, ra11
-+ mov ra9, 0
-+:exit12_c_1
-+
-+# mc_interrupt_exit12()
-+::mc_interrupt_exit12
-+ ldtmu0
-+ ldtmu1
-+ ldtmu0
-+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW
-+
-+ mov -,sacq(0) # 1
-+ mov -,sacq(0) # 2
-+ mov -,sacq(0) # 3
-+ mov -,sacq(0) # 4
-+ mov -,sacq(0) # 5
-+ mov -,sacq(0) # 6
-+ mov -,sacq(0) # 7
-+ mov -,sacq(0) # 8
-+ mov -,sacq(0) # 9
-+ mov -,sacq(0) # 10
-+ mov -,sacq(0) # 11
-+
-+ nop ; nop ; thrend
-+ mov interrupt, 1; nop # delay slot 1
-+ nop ; nop # delay slot 2
-+
-+
-+::mc_exit1
-+ mov -, vw_wait # wait on the VDW
-+
-+ ldtmu0
-+ ldtmu1
-+ ldtmu0
-+ ldtmu1
-+ nop ; nop ; thrend
-+ mov interrupt, 1; nop # delay slot 1
-+ nop ; nop # delay slot 2
-+
++#
++# typedef struct qpu_mc_pred_y_p00_s {
++# qpu_mc_src_t next_src1;
++# uint16_t h;
++# uint16_t w;
++# uint32_t wo1;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p00_t;
++
++::mc_filter_y_p00
++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
++ mov ra_xshift, ra_xshift_next # [ra0 delay]
++ add r0, ra0.16b, r3
++
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height
++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # ; set up VPM write
++
++# get width,height of block (unif load above)
++ sub rb_dma1, rb_dma1_base, ra_width # Compute vdw_setup1(dst_pitch-width)
++ sub rb_i_tmu, ra_height, PREREAD ; mov r0, ra_height
++ min r0, r0, ra_k16
++ add rb_lcount, r0, 0 ; mov ra_wt_off_mul_l0, unif
++ shl r0, r0, 7 ; mov rb_dest, unif # Destination address
++ add r0, r0, ra_width # Combine width and height of destination area
++ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base
++
++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0
++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use
++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link
++
++:yloop_p00
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++ shl r1, r1, 15 ; mov r0, ra_height
++ add r1, r1, rb_wt_off
++
++ brr.anyn -, r:yloop_p00
++ asr ra3.8as, r1, rb_wt_den_p15
++ mov r1, ra_k16 ; mov -, vw_wait
++ sub r0, r0, r1 ; mov vpm, ra3.8a
++# >>> branch.anyn yloop_p00
++
++# If looping again then we consumed 16 height last loop
++ # rb_dma1 (stride) remains constant
++ # rb_i_tmu remains const (based on total height)
++ # recalc rb_dma0, rb_lcount based on new segment height
++ # N.B. r5 is loop counter still
++
++ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0
++ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride
++ nop ; mov vw_addr, rb_dest # start the VDW
++# >>> .anyz ra_link
++
++ add rb_lcount, rb_lcount, r0
++ shl r0, r2, i_shift23
++ add rb_dma0, rb_dma0, r0
++ brr -, r:yloop_p00
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> yloop_p00
++
++################################################################################
++
++::mc_filter_y_b00
++# luma setup does a fair bit more than we need calculating filter coeffs
++# that we will never use but it saves I-cache to use it (also simple!)
++ luma_setup
++
++# Fix up vals that were expecting a filter (somewhat icky)
++ mov r0, 7
++ sub rb_i_tmu, rb_i_tmu, r0
++ sub rb_lcount, rb_lcount, r0
++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0
++ shl rb_wt_off, rb_wt_off, r0
++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++
++:yloop_b00
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte
++ and r1, r1, rb_k255 ; mul24 r0, r0, ra_wt_mul_l0
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++ add r1, r0, r1
++ shl r1, r1, 14
++ add r1, r1, rb_wt_off ; mov r0, ra_height
++
++ brr.anyn -, r:yloop_b00
++ asr ra3.8as, r1, rb_wt_den_p15
++ mov r1, ra_k16 ; mov -, vw_wait
++ sub r0, r0, r1 ; mov vpm, ra3.8a
++# >>> branch.anyn yloop_b00
++
++# If looping again then we consumed 16 height last loop
++ # rb_dma1 (stride) remains constant
++ # rb_i_tmu remains const (based on total height)
++ # recalc rb_dma0, rb_lcount based on new segment height
++ # N.B. r5 is loop counter still
++
++ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0
++ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride
++ nop ; mov vw_addr, rb_dest # start the VDW
++# >>> .anyz ra_link
++
++ add rb_lcount, rb_lcount, r0
++ shl r0, r2, i_shift23
++ add rb_dma0, rb_dma0, r0
++ brr -, r:yloop_b00
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> yloop_b00
++
++################################################################################
 +
 +::mc_end
 +# Do not add code here because mc_end must appear after all other code.
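The luma kernels above pull their per-block parameters off the uniform stream in the fixed order described by the qpu_mc_pred_y_* structs declared in rpi_shader_cmd.h below. As a rough host-side sketch only - the fill helper and its callers are hypothetical, though the field names are the real ones from the header - building one luma P-pred command looks something like this:

    #include <stdint.h>
    #include "rpi_shader_cmd.h"

    // Hypothetical helper: fill one luma P-pred command for the QPU
    // command stream.  mc_filter reads these fields as uniforms in
    // declaration order, so the struct layout must not be reordered.
    static void fill_y_p_cmd(qpu_mc_pred_y_p_t *const cmd,
                             const int16_t x, const int16_t y,
                             const uint32_t src_base,
                             const uint16_t w, const uint16_t h,
                             const uint32_t mymx21, const uint32_t wo,
                             const uint32_t dst_addr, const uint32_t next_fn)
    {
        cmd->next_src1.x = x;
        cmd->next_src1.y = y;
        cmd->next_src1.base = src_base;
        // For P blocks the second fetch is just offset by +8 in x
        // ("For P frames we make the second x,y coordinates offset by +8")
        cmd->next_src2.x = x + 8;
        cmd->next_src2.y = y;
        cmd->next_src2.base = src_base;
        cmd->w = w;
        cmd->h = h;
        cmd->mymx21 = mymx21;    // packed fractional mvs - selects filter coeffs
        cmd->wo1 = wo;           // weight/offset L0
        cmd->wo2 = wo;
        cmd->dst_addr = dst_addr;
        cmd->next_fn = next_fn;  // QPU address of the next kernel to chain to
    }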
-diff --git b/libavcodec/rpi_zc.c a/libavcodec/rpi_zc.c +diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..9ac22aa +index 0000000..838b6bd --- /dev/null -+++ a/libavcodec/rpi_zc.c -@@ -0,0 +1,453 @@ ++++ b/libavcodec/rpi_shader_cmd.h +@@ -0,0 +1,112 @@ ++#ifndef RPI_SHADER_CMD_H ++#define RPI_SHADER_CMD_H ++ ++#pragma pack(push, 4) ++ ++typedef struct qpu_mc_src_s ++{ ++ int16_t y; ++ int16_t x; ++ uint32_t base; ++} qpu_mc_src_t; ++ ++ ++typedef struct qpu_mc_pred_c_p_s { ++ qpu_mc_src_t next_src; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ uint32_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_p_t; ++ ++typedef struct qpu_mc_pred_c_b_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x1; ++ uint32_t coeffs_y1; ++ uint32_t weight_u1; ++ uint32_t weight_v1; ++ qpu_mc_src_t next_src2; ++ uint32_t coeffs_x2; ++ uint32_t coeffs_y2; ++ uint32_t wo_u2; ++ uint32_t wo_v2; ++ uint32_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_b_t; ++ ++typedef struct qpu_mc_pred_c_s_s { ++ qpu_mc_src_t next_src1; ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ qpu_mc_src_t next_src2; ++ uint32_t next_fn; ++} qpu_mc_pred_c_s_t; ++ ++typedef struct qpu_mc_pred_c_s { ++ union { ++ qpu_mc_pred_c_p_t p; ++ qpu_mc_pred_c_b_t b; ++ qpu_mc_pred_c_s_t s; ++ }; ++} qpu_mc_pred_c_t; ++ ++ ++typedef struct qpu_mc_pred_y_p_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ uint32_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p_t; ++ ++typedef struct qpu_mc_pred_y_p00_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t wo1; ++ uint32_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p00_t; ++ ++typedef struct qpu_mc_pred_y_s_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t next_fn; ++} qpu_mc_pred_y_s_t; ++ ++// Only a useful structure in that it allows us to return something other than a void * ++typedef struct qpu_mc_pred_y_s { ++ union { ++ qpu_mc_pred_y_p_t p; ++ qpu_mc_pred_y_p00_t p00; ++ qpu_mc_pred_y_s_t s; ++ }; ++} qpu_mc_pred_y_t; ++ ++typedef union qpu_mc_pred_cmd_u { ++ qpu_mc_pred_y_t y; ++ qpu_mc_pred_c_t c; ++} qpu_mc_pred_cmd_t; ++ ++#pragma pack(pop) ++ ++#endif ++ +diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c +new file mode 100644 +index 0000000..b061fe0 +--- /dev/null ++++ b/libavcodec/rpi_zc.c +@@ -0,0 +1,581 @@ +#include "config.h" +#ifdef RPI +#include "rpi_qpu.h" ++#include "rpi_mailbox.h" +#include "rpi_zc.h" ++#include "libavutil/avassert.h" ++#include + +#include "libavutil/buffer_internal.h" ++#include ++ ++#define TRACE_ALLOC 0 + +struct ZcPoolEnt; + @@ -14668,6 +18437,9 @@ index 0000000..9ac22aa +#define STRIDE_OR 0 +#endif + ++#define DEBUG_ZAP0_BUFFERS 0 ++ ++ +static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size) +{ + ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt)); @@ -14686,6 +18458,11 @@ index 0000000..9ac22aa + goto fail1; + } + ++#if TRACE_ALLOC ++ printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); ++#endif ++ ++ pool->numbytes = zp->gmem.numbytes; + zp->next = NULL; + zp->pool = pool; + zp->n = 
pool->n++; @@ -14699,6 +18476,10 @@ index 0000000..9ac22aa + +static void zc_pool_ent_free(ZcPoolEnt * const zp) +{ ++#if TRACE_ALLOC ++ printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); ++#endif ++ + gpu_free(&zp->gmem); + av_free(zp); +} @@ -14707,6 +18488,8 @@ index 0000000..9ac22aa +{ + ZcPoolEnt * p = pool->head; + pool->head = NULL; ++ pool->numbytes = -1; ++ + while (p != NULL) + { + ZcPoolEnt * const zp = p; @@ -14715,15 +18498,21 @@ index 0000000..9ac22aa + } +} + -+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes) ++static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes) +{ + ZcPoolEnt * zp; ++ int numbytes; ++ + pthread_mutex_lock(&pool->lock); + -+ if (numbytes != pool->numbytes) ++ numbytes = pool->numbytes; ++ ++ // If size isn't close then dump the pool ++ // Close in this context means within 128k ++ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) + { + zc_pool_flush(pool); -+ pool->numbytes = numbytes; ++ numbytes = req_bytes; + } + + if (pool->head != NULL) @@ -14750,6 +18539,10 @@ index 0000000..9ac22aa + if (zp != NULL) + { + pthread_mutex_lock(&pool->lock); ++#if TRACE_ALLOC ++ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes); ++#endif ++ + if (pool->numbytes == zp->gmem.numbytes) + { + zp->next = pool->head; @@ -14780,10 +18573,18 @@ index 0000000..9ac22aa + pthread_mutex_destroy(&pool->lock); +} + ++typedef struct ZcOldCtxVals ++{ ++ int thread_safe_callbacks; ++ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); ++ void * get_buffer_context; ++} ZcOldCtxVals; + +typedef struct AVZcEnv +{ ++ unsigned int refcount; + ZcPool pool; ++ ZcOldCtxVals old; +} ZcEnv; + +// Callback when buffer unrefed to zero @@ -14803,18 +18604,71 @@ index 0000000..9ac22aa +} + +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const unsigned int video_width, const unsigned int video_height) ++ const int format, const unsigned int video_width, const unsigned int video_height) +{ + AVRpiZcFrameGeometry geo; -+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+// geo.stride_y = ((video_width + 32 + 31) & ~31); -+ geo.stride_c = geo.stride_y / 2; -+// geo.height_y = (video_height + 15) & ~15; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; ++ ++ switch (format) ++ { ++ case AV_PIX_FMT_YUV420P: ++ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ // geo.stride_y = ((video_width + 32 + 31) & ~31); ++ geo.stride_c = geo.stride_y / 2; ++ // geo.height_y = (video_height + 15) & ~15; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ break; ++ ++ case AV_PIX_FMT_SAND128: ++ { ++ const unsigned int stripe_w = 128; ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ 
geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ ++ pthread_mutex_unlock(&sand_lock); ++ ++ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); ++ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); ++ break; ++ } ++ ++ default: ++ memset(&geo, 0, sizeof(geo)); ++ break; ++ } + return geo; +} + ++ +static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size) +{ + ZcPoolEnt *const zp = zc_pool_alloc(pool, size); @@ -14833,6 +18687,10 @@ index 0000000..9ac22aa + idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0); +#endif + ++#if DEBUG_ZAP0_BUFFERS ++ memset((void*)idata, 0, size); ++#endif ++ + if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n"); @@ -14847,13 +18705,12 @@ index 0000000..9ac22aa + return NULL; +} + -+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame) ++static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame) +{ -+ ZcEnv *const zc = s->get_buffer_context; -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height); ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); + const unsigned int size_y = geo.stride_y * geo.height_y; + const unsigned int size_c = geo.stride_c * geo.height_c; -+ const unsigned int size_pic = size_y + size_c * 2; ++ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; + AVBufferRef * buf; + unsigned int i; + @@ -14861,7 +18718,7 @@ index 0000000..9ac22aa + + if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL) + { -+ av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); ++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); + return AVERROR(ENOMEM); + } + @@ -14872,19 +18729,24 @@ index 0000000..9ac22aa + } + + frame->buf[0] = buf; ++ + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + frame->linesize[2] = geo.stride_c; ++ if (geo.stripes > 1) ++ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; -+ frame->data[2] = frame->data[1] + size_c; ++ if (geo.planes_c > 1) ++ frame->data[2] = frame->data[1] + size_c; ++ + frame->extended_data = frame->data; + // Leave extended buf alone + + return 0; +} + -+ +#define RPI_GET_BUFFER2 1 + +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) @@ -14894,21 +18756,25 @@ index 0000000..9ac22aa +#else + int rv; + -+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 || -+ frame->format != AV_PIX_FMT_YUV420P) ++ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) + { +// printf("Do default alloc: format=%#x\n", frame->format); + rv = avcodec_default_get_buffer2(s, frame, flags); + } ++ else if (frame->format == AV_PIX_FMT_YUV420P || ++ frame->format == AV_PIX_FMT_SAND128) ++ { ++ rv = rpi_get_display_buffer(s->get_buffer_context, frame); ++ } + else + { -+ rv = rpi_get_display_buffer(s, frame); ++ rv = avcodec_default_get_buffer2(s, frame, flags); + } + +#if 0 -+ printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", 
__func__, -+ frame->width, frame->height, -+ frame->linesize[0], frame->linesize[1], frame->linesize[2], ++ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, ++ frame->format, frame->width, frame->height, ++ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], + frame->data[0], frame->data[1], frame->data[2], + frame->buf[0], frame->buf[1], frame->buf[2], + av_buffer_get_opaque(frame->buf[0])); @@ -14929,7 +18795,7 @@ index 0000000..9ac22aa + dest->width = src->width; + dest->height = src->height; + -+ if (rpi_get_display_buffer(s, dest) != 0) ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) + { + return NULL; + } @@ -14962,14 +18828,16 @@ index 0000000..9ac22aa +{ + assert(s != NULL); + -+ if (frame->format != AV_PIX_FMT_YUV420P) ++ if (frame->format != AV_PIX_FMT_YUV420P && ++ frame->format != AV_PIX_FMT_SAND128) + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format); ++ av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + + if (frame->buf[1] != NULL) + { ++ av_assert0(frame->format == AV_PIX_FMT_YUV420P); + if (maycopy) + { + av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); @@ -15053,47 +18921,70 @@ index 0000000..9ac22aa + } +} + ++int av_rpi_zc_in_use(const struct AVCodecContext * const s) ++{ ++ return s->get_buffer2 == av_rpi_zc_get_buffer2; ++} ++ +int av_rpi_zc_init(struct AVCodecContext * const s) +{ -+ ZcEnv * const zc = av_rpi_zc_env_alloc(); -+ if (zc == NULL) ++ if (av_rpi_zc_in_use(s)) + { -+ return AVERROR(ENOMEM); ++ ZcEnv * const zc = s->get_buffer_context; ++ ++zc->refcount; + } ++ else ++ { ++ ZcEnv *const zc = av_rpi_zc_env_alloc(); ++ if (zc == NULL) ++ { ++ return AVERROR(ENOMEM); ++ } + -+ s->get_buffer_context = zc; -+ s->get_buffer2 = av_rpi_zc_get_buffer2; ++ zc->refcount = 1; ++ zc->old.get_buffer_context = s->get_buffer_context; ++ zc->old.get_buffer2 = s->get_buffer2; ++ zc->old.thread_safe_callbacks = s->thread_safe_callbacks; ++ ++ s->get_buffer_context = zc; ++ s->get_buffer2 = av_rpi_zc_get_buffer2; ++ s->thread_safe_callbacks = 1; ++ } + return 0; +} + +void av_rpi_zc_uninit(struct AVCodecContext * const s) +{ -+ if (s->get_buffer2 == av_rpi_zc_get_buffer2) ++ if (av_rpi_zc_in_use(s)) + { + ZcEnv * const zc = s->get_buffer_context; -+ s->get_buffer2 = avcodec_default_get_buffer2; -+ s->get_buffer_context = NULL; -+ av_rpi_zc_env_free(zc); ++ if (--zc->refcount == 0) ++ { ++ s->get_buffer2 = zc->old.get_buffer2; ++ s->get_buffer_context = zc->old.get_buffer_context; ++ s->thread_safe_callbacks = zc->old.thread_safe_callbacks; ++ av_rpi_zc_env_free(zc); ++ } + } +} + +#endif // RPI + -diff --git b/libavcodec/rpi_zc.h a/libavcodec/rpi_zc.h +diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..4dd7a8b +index 0000000..f4aeb78 --- /dev/null -+++ a/libavcodec/rpi_zc.h -@@ -0,0 +1,88 @@ ++++ b/libavcodec/rpi_zc.h +@@ -0,0 +1,137 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + +// Zero-Copy frame code for RPi +// RPi needs Y/U/V planes to be contiguous for display. By default +// ffmpeg will allocate separated planes so a memcpy is needed before -+// display. This code prodes a method a making ffmpeg allocate a single -+// bit of memory for the frame when can then be refrence counted until -+// display ahs finsihed with it. ++// display. 
This code provides a method of making ffmpeg allocate a single
++// bit of memory for the frame which can then be reference counted until
++// display has finished with it.
+
+#include "libavutil/frame.h"
+#include "libavcodec/avcodec.h"
+
+
+// "Opaque" pointer to whatever we are using as a buffer reference
+typedef AVBufferRef * AVRpiZcRefPtr;
+
+struct AVZcEnv;
+typedef struct AVZcEnv * AVZcEnvPtr;
+
+typedef struct AVRpiZcFrameGeometry
+{
+    unsigned int stride_y;
+    unsigned int height_y;
+    unsigned int stride_c;
+    unsigned int height_c;
++    unsigned int planes_c;
++    unsigned int stripes;
+} AVRpiZcFrameGeometry;
+
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format,
+    const unsigned int video_width, const unsigned int video_height);
+
+// Replacement fn for avctx->get_buffer2
+// Should be set before calling avcodec_decode_open2
+//
+// N.B. in addition to to setting avctx->get_buffer2, avctx->refcounted_frames
+// must be set to 1 as otherwise the buffer info is killed before being returned
+// by avcodec_decode_video2. Note also that this means that the AVFrame that is
++// returned must be manually derefed with av_frame_unref. This should be done
+// after av_rpi_zc_ref has been called.
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
+
+// Generate a ZC reference to the buffer(s) in this frame
+// If the buffer doesn't appear to be one allocated by ZC
+// then the behaviour depends on maycopy:
+// If maycopy=0 then return NULL
+// If maycopy=1 && the src frame is in a form where we can easily copy
+// the data, then allocate a new buffer and copy the data into it
+// Otherwise return NULL
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+    const AVFrame * const frame, const int maycopy);
+
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+
+// Get offset from the start of the memory referenced
+// by the vc_handle to valid data
+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
+
+// Length of buffer data
+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
+
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+
+// Unreference the buffer refed/allocated by _zc_ref
+// If fr_ref is NULL then this will NOP
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
+
+// Allocate an environment for the buffer pool used by the ZC code
+// This should be put in avctx->get_buffer_context so it can be found by
+// av_rpi_zc_get_buffer2 when it is called from ffmpeg
+AVZcEnvPtr av_rpi_zc_env_alloc(void);
+
+// Allocate the environment used by the ZC code
+void av_rpi_zc_env_free(AVZcEnvPtr);
+
++// Test to see if the context is using zc (checks get_buffer2)
++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
+
+// Init ZC into a context
+// There is nothing magic in this fn - it just packages setting
+// get_buffer2 & get_buffer_context
+int av_rpi_zc_init(struct AVCodecContext * const s);
+
+// Undo whatever init did
+// get_buffer2 & get_buffer_context
+void av_rpi_zc_uninit(struct AVCodecContext * const s);
+
++
++
++static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame)
++{
++    return frame->linesize[3];
++}
++
++static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    const unsigned int stride1 = frame->linesize[0];
++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
++    const unsigned int x1 = x & (stride1 - 1);
++    const unsigned int x2 = x ^ x1;
++
++    return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++    const unsigned int stride1 = frame->linesize[0];
++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
++    const unsigned int x = x_c * 2;
++    const unsigned int x1 = x & (stride1 - 1);
++    const unsigned int x2 = x ^ x1;
++
++    return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y);
++}
++
++static inline int rpi_sliced_frame(const AVFrame * const frame)
++{
++    return frame->format == AV_PIX_FMT_SAND128;
++}
++
++
+#endif
+
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 0c68836..b8139f5 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -26,6 +26,12 @@
  */
 
 pool->pools[i] = av_buffer_pool_init(size[i]
+ 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? NULL : -diff --git b/libavformat/matroskaenc.c a/libavformat/matroskaenc.c -index 9c7a213..af941ce 100644 ---- b/libavformat/matroskaenc.c -+++ a/libavformat/matroskaenc.c -@@ -2223,7 +2223,7 @@ static int mkv_check_new_extra_data(AVFormatContext *s, AVPacket *pkt) +@@ -729,6 +788,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags + { + int ret; - switch (par->codec_id) { - case AV_CODEC_ID_FLAC: -- if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL) && !mkv->is_live) { -+ if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL)) { - AVCodecParameters *codecpriv_par; - int64_t curpos; - if (side_data_size != par->extradata_size) { -diff --git b/libavformat/mov.c a/libavformat/mov.c -index f2296f8..4550cf0 100644 ---- b/libavformat/mov.c -+++ a/libavformat/mov.c -@@ -1186,12 +1186,6 @@ static void mov_metadata_creation_time(AVDictionary **metadata, int64_t time) - if (time) { - if(time >= 2082844800) - time -= 2082844800; /* seconds between 1904-01-01 and Epoch */ -- -- if ((int64_t)(time * 1000000ULL) / 1000000 != time) { -- av_log(NULL, AV_LOG_DEBUG, "creation_time is not representable\n"); -- return; -- } -- - avpriv_dict_set_timestamp(metadata, "creation_time", time * 1000000); - } - } -@@ -5794,7 +5788,6 @@ static int mov_read_close(AVFormatContext *s) - av_freep(&mov->fragment_index_data); ++#ifdef RPI ++ // This is going to end badly if we let it continue ++ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++#endif ++ + if (avctx->hw_frames_ctx) + return av_hwframe_get_buffer(avctx->hw_frames_ctx, frame, 0); - av_freep(&mov->aes_decrypt); -- av_freep(&mov->chapter_tracks); +diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c +index ecfb872..5fa099f 100644 +--- a/libavfilter/avfilter.c ++++ b/libavfilter/avfilter.c +@@ -969,6 +969,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) + "options, but options were provided: %s.\n", args); + return AVERROR(EINVAL); + } ++ printf("=== args='%s'\n", args); - return 0; - } -diff --git b/libavformat/mpegts.c a/libavformat/mpegts.c + #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR + if ( !strcmp(filter->filter->name, "format") || +diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c index 3eff152..30dfb14 100644 ---- b/libavformat/mpegts.c -+++ a/libavformat/mpegts.c +--- a/libavformat/mpegts.c ++++ b/libavformat/mpegts.c @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { #endif { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, @@ -15315,10 +19241,10 @@ index 3eff152..30dfb14 100644 { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 }, { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, -diff --git b/libavformat/utils.c a/libavformat/utils.c -index a059046..ef70074 100644 ---- b/libavformat/utils.c -+++ a/libavformat/utils.c +diff --git a/libavformat/utils.c b/libavformat/utils.c +index a82bbc7..4bf5574 100644 +--- a/libavformat/utils.c ++++ b/libavformat/utils.c @@ -748,7 +748,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in int default_stream_index = av_find_default_stream_index(s); if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { @@ -15328,10 +19254,10 @@ index a059046..ef70074 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; -diff --git b/libavutil/buffer.c a/libavutil/buffer.c +diff --git 
a/libavutil/buffer.c b/libavutil/buffer.c index 8d1aa5f..649876d 100644 ---- b/libavutil/buffer.c -+++ a/libavutil/buffer.c +--- a/libavutil/buffer.c ++++ b/libavutil/buffer.c @@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) return ret; @@ -15342,10 +19268,10 @@ index 8d1aa5f..649876d 100644 + BufferPoolEntry *buf = av_buffer_get_opaque(ref); + return buf->opaque; +} -diff --git b/libavutil/buffer.h a/libavutil/buffer.h +diff --git a/libavutil/buffer.h b/libavutil/buffer.h index 73b6bd0..d907de3 100644 ---- b/libavutil/buffer.h -+++ a/libavutil/buffer.h +--- a/libavutil/buffer.h ++++ b/libavutil/buffer.h @@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); */ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool); @@ -15356,60 +19282,315 @@ index 73b6bd0..d907de3 100644 /** * @} */ -diff --git b/pi-util/conf.sh a/pi-util/conf.sh -new file mode 100755 -index 0000000..8b596a2 ---- /dev/null -+++ a/pi-util/conf.sh -@@ -0,0 +1,33 @@ -+echo "Configure for Pi2/3" +diff --git a/libavutil/frame.h b/libavutil/frame.h +index 7cb78a1..b94a635 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -127,6 +127,13 @@ enum AVFrameSideDataType { + * libavutil/spherical.h. + */ + AV_FRAME_DATA_SPHERICAL, + -+RPI_BUILDROOT=`pwd`/build -+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot -+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$RPI_ROOTFS/opt/vc -+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" -+#RPI_DEFS="-D__VCCOREVER__=0x04000000" -+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" ++ /** ++ * Extra data required to deal with a cropped Sand frame ++ * AVFrame holds the cropped size, but we cannot simply offset the start ++ * address to get the picture as we can for planar formats ++ */ ++ AV_FRAME_DATA_SAND_INFO, + }; + + enum AVActiveFormatDescription { +@@ -139,6 +146,13 @@ enum AVActiveFormatDescription { + AV_AFD_SP_4_3 = 15, + }; + ++typedef struct AVFrameDataSandInfo ++{ ++ unsigned int left_offset; ++ unsigned int top_offset; ++ unsigned int pic_width; ++ unsigned int pic_height; ++} AVFrameDataSandInfo; + + /** + * Structure to hold side data for an AVFrame. 
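(Illustrative sketch, not part of the patch: the AV_FRAME_DATA_SAND_INFO value added above travels as ordinary AVFrame side data, so a producer and a consumer would handle it roughly as below using the stock AVFrameSideData helpers. The two helper function names are hypothetical.)

    #include <string.h>
    #include "libavutil/error.h"
    #include "libavutil/frame.h"

    // Hypothetical producer: record the uncropped SAND picture geometry
    // alongside the (cropped) width/height held in the AVFrame itself.
    static int sand_info_attach(AVFrame * const frame,
                                const AVFrameDataSandInfo * const info)
    {
        AVFrameSideData * const sd = av_frame_new_side_data(
            frame, AV_FRAME_DATA_SAND_INFO, sizeof(*info));
        if (sd == NULL)
            return AVERROR(ENOMEM);
        memcpy(sd->data, info, sizeof(*info));  // plain struct, safe to copy
        return 0;
    }

    // Hypothetical consumer: returns NULL when no SAND info was attached.
    static const AVFrameDataSandInfo * sand_info_get(const AVFrame * const frame)
    {
        const AVFrameSideData * const sd =
            av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);
        return sd == NULL ? NULL : (const AVFrameDataSandInfo *)sd->data;
    }

Carrying the true geometry out of band like this lets the AVFrame keep the cropped size while SAND consumers still recover the full stripe layout.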
+diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +index d4a7a8b..92a01a4 100644 +--- a/libavutil/pixdesc.c ++++ b/libavutil/pixdesc.c +@@ -2158,6 +2158,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | + AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, + }, ++ [AV_PIX_FMT_SAND128] = { ++ .name = "sand128", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ ++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ ++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ ++ }, ++ .flags = 0, ++ } + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index 5dafc34..0895b69 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -314,6 +314,9 @@ enum AVPixelFormat { + AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian + AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian + ++// RPI - not on ifdef so can be got at by calling progs ++ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding + -+./configure --enable-cross-compile\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions + }; + +diff --git a/libswscale/input.c b/libswscale/input.c +index 04a5190..837f633 100644 +--- a/libswscale/input.c ++++ b/libswscale/input.c +@@ -741,6 +741,13 @@ static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV, + } + } + ++static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV, ++ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, ++ int width, uint32_t *unused) ++{ ++ // NIF ++} + -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git b/pi-util/conf_h265.csv a/pi-util/conf_h265.csv + #define input_pixel(pos) (isBE(origin) ? 
AV_RB16(pos) : AV_RL16(pos))
+
+ static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
+@@ -1124,6 +1131,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
+     case AV_PIX_FMT_P016BE:
+         c->chrToYV12 = p016BEToUV_c;
+         break;
++    case AV_PIX_FMT_SAND128:
++        c->chrToYV12 = sand128ToUV_c;
++        break;
+     }
+     if (c->chrSrcHSubSample) {
+         switch (srcFormat) {
+diff --git a/libswscale/utils.c b/libswscale/utils.c
+index 4c9b53b..835f3aa 100644
+--- a/libswscale/utils.c
++++ b/libswscale/utils.c
+@@ -254,6 +254,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
+     [AV_PIX_FMT_P010BE]      = { 1, 1 },
+     [AV_PIX_FMT_P016LE]      = { 1, 0 },
+     [AV_PIX_FMT_P016BE]      = { 1, 0 },
++#ifdef RPI
++    [AV_PIX_FMT_SAND128]     = { 1, 0 },
++#endif
+ };
+
+ int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
+diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
+new file mode 100644
+index 0000000..2d45679
+--- /dev/null
++++ b/pi-util/BUILD.txt
+@@ -0,0 +1,24 @@
++Building Pi FFmpeg
++==================
++
++Configuration:
++==============
++
++pi-util/conf_pi2.sh
++
++contains suitable options to build the code for Pi2/3. It expects to find
++git clones of
++
++https://github.com/raspberrypi/tools
++https://github.com/raspberrypi/firmware
++
++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a
++lot of history you don't want.
++
++If you have a copy of qasm.py in ../local then the .qasm sources will be
++rebuilt. Otherwise the prebuilt .c & .h files will be used.
++
++pi-util/conf_pi1.sh should configure for Pi1. Beware that as of this time
++H265 QPU acceleration is broken on Pi1 and so it is disabled.
++
++
+diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
+new file mode 100644
+index 0000000..6082641
+--- /dev/null
++++ b/pi-util/conf_h265.2016_HEVC_v1.csv
+@@ -0,0 +1,147 @@
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv +new file mode 100644 +index 0000000..fc14f2a +--- /dev/null ++++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 -+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 ++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 +1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 ++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 @@ -15431,7 +19612,7 @@ index 0000000..d3db338 +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 @@ -15471,7 +19652,7 @@ index 0000000..d3db338 +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 @@ -15485,10 +19666,10 @@ index 0000000..d3db338 +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 @@ -15517,7 +19698,7 @@ index 0000000..d3db338 +1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 +1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 +1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 +1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 @@ -15528,7 +19709,7 @@ index 0000000..d3db338 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 +0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 +1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 @@ -15545,12 +19726,85 @@ index 0000000..d3db338 +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -diff --git b/pi-util/ffconf.py a/pi-util/ffconf.py -new file mode 100644 -index 0000000..c896bc6 +diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh +new file mode 100755 +index 0000000..ec25b81 --- /dev/null -+++ a/pi-util/ffconf.py -@@ -0,0 +1,154 @@ ++++ b/pi-util/conf_pi1.sh +@@ -0,0 +1,31 @@ ++echo "Configure for Pi1" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --cpu=arm1176jzf-s\ ++ --arch=arm\ ++ --disable-neon\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm 
-lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh +new file mode 100755 +index 0000000..f8e5e75 +--- /dev/null ++++ b/pi-util/conf_pi2.sh +@@ -0,0 +1,30 @@ ++echo "Configure for Pi2/3" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --arch=armv6t2\ ++ --cpu=cortex-a7\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --disable-thumb\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py +new file mode 100755 +index 0000000..e96bad2 +--- /dev/null ++++ b/pi-util/ffconf.py +@@ -0,0 +1,164 @@ +#!/usr/bin/env python + +import os @@ -15561,7 +19815,6 @@ index 0000000..c896bc6 +import csv +from stat import * + -+conf_root = "/opt/conform/h265" +ffmpeg_exec = "./ffmpeg" + +def testone(fileroot, name, es_file, md5_file): @@ -15611,10 +19864,10 @@ index 0000000..c896bc6 + +def scandir(root): + aconf = [] -+ ents = os.listdir(conf_root) ++ ents = os.listdir(root) + ents.sort(key=str.lower) + for name in ents: -+ test_path = os.path.join(conf_root, name) ++ test_path = os.path.join(root, name) + if S_ISDIR(os.stat(test_path).st_mode): + files = os.listdir(test_path) + es_file = "?" 
@@ -15625,7 +19878,7 @@ index 0000000..c896bc6 + pass + elif ext == ".bit" or ext == ".bin": + es_file = f -+ elif ext == ".md5": ++ elif ext == ".md5" or (ext == ".txt" and base[-4:] == "_md5"): + if md5_file == "?": + md5_file = f + elif base[-3:] == "yuv": @@ -15641,9 +19894,11 @@ index 0000000..c896bc6 + return True + return False + -+def doconf(csva, tests): -+ failures = [] ++def doconf(csva, tests, test_root): ++ unx_failures = [] + unx_success = [] ++ failures = 0 ++ successes = 0 + for a in csva: + exp_test = int(a[0]) + if (exp_test and runtest(a[1], tests)): @@ -15651,17 +19906,25 @@ index 0000000..c896bc6 + print "==== ", name, + sys.stdout.flush() + -+ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) ++ rv = testone(os.path.join(test_root, name), name, a[2], a[3]) ++ if (rv == 0): ++ successes += 1 ++ else: ++ failures += 1 ++ + if (rv == 0): + if exp_test == 2: + print ": * OK *" + unx_success.append(name) + else: + print ": ok" -+ elif exp_test > 1 and rv == 1: ++ elif exp_test == 2 and rv == 1: + print ": fail" ++ elif exp_test == 3 and rv == 2: ++ # Call an expected "crash" an abort ++ print ": abort" + else: -+ failures.append(name) ++ unx_failures.append(name) + if rv == 1: + print ": * FAIL *" + elif (rv == 2) : @@ -15671,11 +19934,11 @@ index 0000000..c896bc6 + else : + print ": * BANG *" + -+ if failures or unx_success: -+ print "Unexpected Failures:", failures ++ if unx_failures or unx_success: ++ print "Unexpected Failures:", unx_failures + print "Unexpected Success: ", unx_success + else: -+ print "All tests normal" ++ print "All tests normal:", successes, "ok,", failures, "failed" + + +class ConfCSVDialect(csv.Dialect): @@ -15691,2662 +19954,67 @@ index 0000000..c896bc6 + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + argp.add_argument("tests", nargs='*') ++ argp.add_argument("--test_root", default="/opt/conform/h265", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") + argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") + args = argp.parse_args() + + if args.csvgen: -+ csv.writer(sys.stdout).writerows(scandir(conf_root)) ++ csv.writer(sys.stdout).writerows(scandir(args.test_root)) + exit(0) + + with open(args.csv, 'rt') as csvfile: + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + -+ doconf(csva, args.tests) ++ doconf(csva, args.tests, args.test_root) + -diff --git b/pi-util/qasm.py a/pi-util/qasm.py -new file mode 100644 -index 0000000..1eacc04 +diff --git a/pi-util/qem.sh b/pi-util/qem.sh +new file mode 100755 +index 0000000..47dd071 --- /dev/null -+++ a/pi-util/qasm.py -@@ -0,0 +1,2502 @@ -+#!/usr/bin/env python -+ -+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment -+# add r0, r0, 1 # implicit mul nop -+# nop # explicit add nop, implicit mul nop -+# bkpt # implicit add/mul nop -+# mov r0, 0x1234 # hex immediate -+# mov r0, 20 * 40 # expressions... 
-+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits -+# mov r0, a:label # put address of label in r0 -+# :label -+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address -+# :1 -+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address -+# :1 # multiple definitions of numeric labels (differentiated using f/b) -+# .set my_val, 3 # introduce alias for 3 -+# .set my_reg, r0 # and for r0 -+# mov my_reg, my_val # then use them -+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3 -+# .macro my_add, a, b, c # a, b, c act as if .set on entry -+# .set my_val, 10 -+# add a, b, c -+# mov r0, my_val # 10 -+# .endm # forget all .sets since .macro (including arg .sets) -+# mov r0, my_val # 3 -+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right) -+ -+import math -+import optparse -+import os -+import random -+import re -+import struct -+import sys -+import time -+ -+############################################################################### -+# constants -+############################################################################### -+ -+# ops -+###### -+ -+# negatives are internal qasm ops -+ -+AOP_MOV = -3 # two operands -+AOP_BRA = -2 # two operands -+AOP_BRR = -1 # two operands -+AOP_NOP = 0x00 # no operands -+AOP_FADD = 0x01 -+AOP_FSUB = 0x02 -+AOP_FMIN = 0x03 -+AOP_FMAX = 0x04 -+AOP_FMINABS = 0x05 -+AOP_FMAXABS = 0x06 -+AOP_FTOI = 0x07 # two operands -+AOP_ITOF = 0x08 # two operands -+AOP_ADD = 0x0c -+AOP_SUB = 0x0d -+AOP_SHR = 0x0e -+AOP_ASR = 0x0f -+AOP_ROR = 0x10 -+AOP_SHL = 0x11 -+AOP_MIN = 0x12 -+AOP_MAX = 0x13 -+AOP_AND = 0x14 -+AOP_OR = 0x15 -+AOP_XOR = 0x16 -+AOP_NOT = 0x17 # two operands -+AOP_CLZ = 0x18 # two operands -+AOP_V8ADDS = 0x1e -+AOP_V8SUBS = 0x1f -+ -+MOP_MOV = -1 # two operands -+MOP_NOP = 0x0 # no operands -+MOP_FMUL = 0x1 -+MOP_MUL24 = 0x2 -+MOP_V8MULD = 0x3 -+MOP_V8MIN = 0x4 -+MOP_V8MAX = 0x5 -+MOP_V8ADDS = 0x6 -+MOP_V8SUBS = 0x7 -+ -+# ldi modes -+############ -+ -+LDI_32 = 0 -+LDI_EL_SIGNED = 1 -+LDI_EL_UNSIGNED = 3 -+LDI_SEMA = 4 -+ -+# conds -+######## -+ -+COND_NEVER = 0 -+COND_ALWAYS = 1 -+COND_IFZ = 2 -+COND_IFNZ = 3 -+COND_IFN = 4 -+COND_IFNN = 5 -+COND_IFC = 6 -+COND_IFNC = 7 -+ -+BCOND_ALLZ = 0 -+BCOND_ALLNZ = 1 -+BCOND_ANYZ = 2 -+BCOND_ANYNZ = 3 -+BCOND_ALLN = 4 -+BCOND_ALLNN = 5 -+BCOND_ANYN = 6 -+BCOND_ANYNN = 7 -+BCOND_ALLC = 8 -+BCOND_ALLNC = 9 -+BCOND_ANYC = 10 -+BCOND_ANYNC = 11 -+BCOND_ALWAYS = 15 -+ -+# packing/unpacking -+#################### -+ -+# regfile a pack modes -+PACK_A_NOP = 0 -+PACK_A_16A = 1 -+PACK_A_16B = 2 -+PACK_A_8888 = 3 -+PACK_A_8A = 4 -+PACK_A_8B = 5 -+PACK_A_8C = 6 -+PACK_A_8D = 7 -+PACK_A_32S = 8 -+PACK_A_16AS = 9 -+PACK_A_16BS = 10 -+PACK_A_8888S = 11 -+PACK_A_8AS = 12 -+PACK_A_8BS = 13 -+PACK_A_8CS = 14 -+PACK_A_8DS = 15 -+ -+# mul unit pack modes -+PACK_MUL_NOP = 0 -+PACK_MUL_8888 = 3 -+PACK_MUL_8A = 4 -+PACK_MUL_8B = 5 -+PACK_MUL_8C = 6 -+PACK_MUL_8D = 7 -+ -+# regfile a unpack modes -+UNPACK_A_NOP = 0 -+UNPACK_A_16A = 1 -+UNPACK_A_16B = 2 -+UNPACK_A_8R = 3 -+UNPACK_A_8A = 4 -+UNPACK_A_8B = 5 -+UNPACK_A_8C = 6 -+UNPACK_A_8D = 7 -+ -+# r4 unpack modes -+UNPACK_R4_NOP = 0 -+UNPACK_R4_16A = 1 -+UNPACK_R4_16B = 2 -+UNPACK_R4_8R = 3 -+UNPACK_R4_8A = 4 -+UNPACK_R4_8B = 5 -+UNPACK_R4_8C = 6 -+UNPACK_R4_8D = 7 -+ -+PACK_TYPE_INT = 0 -+PACK_TYPE_FLOAT = 1 -+PACK_TYPE_EITHER = -1 -+ -+PACK_MODE_A = 0 # regfile a -+PACK_MODE_M = 1 # mul unit -+PACK_MODE_EITHER = -1 -+ -+UNPACK_LOC_A = 0 # regfile a -+UNPACK_LOC_R4 = 1 # r4 -+UNPACK_LOC_AB = 2 # either 
regfile a or regfile b -+UNPACK_LOC_OTHER = 3 # somewhere else -+ -+# args -+####### -+ -+# loc_t, ie internal -+MUX_AC = 0 -+MUX_ANY = 1 -+MUX_A = 2 -+MUX_B = 3 -+RW_EITHER = 0 -+RW_READ = 1 -+RW_WRITE = 2 -+ -+RADDR_NOP = 39 -+ -+# negatives are for internal use -+RMUX_SEMA = -6 -+RMUX_LABEL = -5 -+RMUX_IMMV = -4 -+RMUX_IMM = -3 -+RMUX_AC = -2 -+RMUX_ANY = -1 -+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5 -+RMUX_A = 6 -+RMUX_B = 7 -+ -+WADDR_R0 = 32 # followed by R1, R2, R3 -+WADDR_NOP = 39 -+ -+WMUX_ANY = 0 -+WMUX_A = 1 -+WMUX_B = 2 -+ -+# signals -+########## -+ -+SIG_BKPT = 0 -+SIG_NORMAL = 1 -+SIG_THRSW = 2 -+SIG_THREND = 3 -+SIG_SBWAIT = 4 -+SIG_SBDONE = 5 -+SIG_INT = 6 # on a0 -+SIG_LTHRSW = 6 # on b0 -+SIG_LOADCV = 7 -+SIG_LOADC = 8 -+SIG_LDCEND = 9 -+SIG_LDTMU0 = 10 -+SIG_LDTMU1 = 11 -+SIG_ROTATE = 12 # on a0 -+SIG_LOADAM = 12 # on b0 -+SIG_SMALLIMMED = 13 -+SIG_IMMED = 14 -+SIG_BRANCH = 15 -+ -+# multi-line assembler constructs -+################################## -+ -+CONSTRUCT_MACRO = 0x1 -+CONSTRUCT_IF = 0x2 -+CONSTRUCT_ELSE = 0x4 -+CONSTRUCT_REP = 0x8 -+ -+############################################################################### -+# helpers -+############################################################################### -+ -+def asm_error(message, location = None): -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm ERROR: %s\n' % message) -+ else: -+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message)) -+ sys.exit(-1) -+ -+def asm_warning(message, location = None): -+ if disable_warnings or (nwarn_level != 0): -+ return -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm WARNING: %s\n' % message) -+ else: -+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message)) -+ if warnings_are_errors: -+ asm_error('warnings are errors!', location) -+ -+# smart_split('') = [] -+# smart_split('a') = ['a'] -+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6'] -+def smart_split(s, delim = ',', count = 0): -+ if len(s) == 0: -+ return [] -+ parts = [] -+ depth = 0 -+ i = 0 -+ for j in xrange(len(s)): -+ if s[j] in '([{': -+ depth += 1 -+ elif s[j] in ')]}': -+ depth -= 1 -+ elif (s[j] == delim) and (depth == 0): -+ parts.append(s[i:j]) -+ i = j + 1 -+ if len(parts) == count: -+ break -+ if depth != 0: -+ asm_error('bracket nesting fail') -+ parts.append(s[i:]) -+ return parts -+ -+def is_int(x): -+ return isinstance(x, int) or isinstance(x, long) -+ -+############################################################################### -+# "parsing" stuff -+############################################################################### -+ -+re_macro = re.compile('\\.macro\\s+(?P\\w+)(?P(\\s*,\\s*\\w+)*)$') -+re_if = re.compile('\\.if((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_elif = re.compile('\\.elif((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_rep = re.compile('\\.rep\\s+(?P\\w+)\\s*,(?P.+)$') -+re_include = re.compile('\\.include\\s(?P.+)$') -+re_set = re.compile('\\.set\\s+(?P\\w+)\\s*,(?P.+)$') -+re_unset = re.compile('\\.unset\\s+(?P\\w+)$') -+re_eval = re.compile('\\.eval\\s(?P.+)$') -+re_print_info_warn_error = re.compile('\\.(?Pprint|info|warn|error)\\s(?P.+)$') -+re_assert = re.compile('\\.assert\\s(?P.+)$') -+re_data = re.compile('\\.d(?P[124])\\s(?P.+)$') -+re_macro_inst = re.compile('(?P\\w+)(?P\\s.+|)$') -+re_label = re.compile(':(?P:?[a-zA-Z_]\\w*|\\d+)$') -+re_op = re.compile('(?P\\w+)(\\.(?P\\w+))??(\\.(?Psetf))?(?P\\s.+|)$') 
-+re_label_ref_left = re.compile('\\b([ar]):') -+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$') -+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals... -+ -+# ops -+###### -+ -+aops = { -+ 'mov': (AOP_MOV, 2), -+ 'bra': (AOP_BRA, 2), -+ 'brr': (AOP_BRR, 2), -+ 'nop': (AOP_NOP, 0), -+ 'fadd': (AOP_FADD, 3), -+ 'fsub': (AOP_FSUB, 3), -+ 'fmin': (AOP_FMIN, 3), -+ 'fmax': (AOP_FMAX, 3), -+ 'fminabs': (AOP_FMINABS, 3), -+ 'fmaxabs': (AOP_FMAXABS, 3), -+ 'ftoi': (AOP_FTOI, 2), -+ 'itof': (AOP_ITOF, 2), -+ 'add': (AOP_ADD, 3), -+ 'sub': (AOP_SUB, 3), -+ 'shr': (AOP_SHR, 3), -+ 'asr': (AOP_ASR, 3), -+ 'ror': (AOP_ROR, 3), -+ 'shl': (AOP_SHL, 3), -+ 'min': (AOP_MIN, 3), -+ 'max': (AOP_MAX, 3), -+ 'and': (AOP_AND, 3), -+ 'or': (AOP_OR, 3), -+ 'xor': (AOP_XOR, 3), -+ 'not': (AOP_NOT, 2), -+ 'clz': (AOP_CLZ, 2), -+ 'v8adds': (AOP_V8ADDS, 3), -+ 'v8subs': (AOP_V8SUBS, 3)} -+ -+def get_aop(aop): -+ if aop not in aops: -+ asm_error('invalid aop') -+ return aops[aop] -+ -+mops = { -+ 'mov': (MOP_MOV, 2), -+ 'nop': (MOP_NOP, 0), -+ 'fmul': (MOP_FMUL, 3), -+ 'mul24': (MOP_MUL24, 3), -+ 'v8muld': (MOP_V8MULD, 3), -+ 'v8min': (MOP_V8MIN, 3), -+ 'v8max': (MOP_V8MAX, 3), -+ 'v8adds': (MOP_V8ADDS, 3), -+ 'v8subs': (MOP_V8SUBS, 3)} -+ -+def get_mop(mop): -+ if mop not in mops: -+ asm_error('invalid mop') -+ return mops[mop] -+ -+# conds -+######## -+ -+conds = { -+ 'ifz': COND_IFZ, -+ 'ifnz': COND_IFNZ, -+ 'ifn': COND_IFN, -+ 'ifnn': COND_IFNN, -+ 'ifc': COND_IFC, -+ 'ifnc': COND_IFNC} -+ -+def get_cond(cond): -+ if not cond: -+ return COND_ALWAYS -+ if cond not in conds: -+ asm_error('invalid cond') -+ return conds[cond] -+ -+bconds = { -+ 'allz': BCOND_ALLZ, -+ 'allnz': BCOND_ALLNZ, -+ 'anyz': BCOND_ANYZ, -+ 'anynz': BCOND_ANYNZ, -+ 'alln': BCOND_ALLN, -+ 'allnn': BCOND_ALLNN, -+ 'anyn': BCOND_ANYN, -+ 'anynn': BCOND_ANYNN, -+ 'allc': BCOND_ALLC, -+ 'allnc': BCOND_ALLNC, -+ 'anyc': BCOND_ANYC, -+ 'anync': BCOND_ANYNC} -+ -+def get_bcond(bcond): -+ if not bcond: -+ return BCOND_ALWAYS -+ if bcond not in bconds: -+ asm_error('invalid bcond') -+ return bconds[bcond] -+ -+def get_setf(setf): -+ if not setf: -+ return False -+ return True -+ -+# packing/unpacking -+#################### -+ -+packs = { -+ '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A), -+ '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A), -+ '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A), -+ 's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)} -+ -+def get_pack(pack): -+ if not pack: -+ 
return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER) -+ if pack not in packs: -+ asm_error('invalid pack') -+ return packs[pack] -+ -+a_unpacks = { -+ '16a': (UNPACK_A_16A, PACK_TYPE_INT), -+ '16b': (UNPACK_A_16B, PACK_TYPE_INT), -+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT), -+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT), -+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER), -+ '8a': (UNPACK_A_8A, PACK_TYPE_INT), -+ '8b': (UNPACK_A_8B, PACK_TYPE_INT), -+ '8c': (UNPACK_A_8C, PACK_TYPE_INT), -+ '8d': (UNPACK_A_8D, PACK_TYPE_INT), -+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT), -+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT), -+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT), -+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)} -+ -+def get_a_unpack(unpack): -+ if not unpack: -+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A) -+ if unpack not in a_unpacks: -+ asm_error('invalid ra unpack') -+ return a_unpacks[unpack] + (UNPACK_LOC_A,) -+ -+r4_unpacks = { -+ '16af': UNPACK_R4_16A, -+ '16bf': UNPACK_R4_16B, -+ '8dr': UNPACK_R4_8R, -+ '8ac': UNPACK_R4_8A, -+ '8bc': UNPACK_R4_8B, -+ '8cc': UNPACK_R4_8C, -+ '8dc': UNPACK_R4_8D} -+ -+def get_r4_unpack(unpack): -+ if not unpack: -+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ if unpack not in r4_unpacks: -+ asm_error('invalid r4 unpack') -+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ -+# args -+####### -+ -+class loc_t: -+ def __init__(self, mux, i, rot, r5_rot, pack, rw): -+ self.mux = mux -+ self.i = i -+ self.rot = rot % 16 -+ self.r5_rot = r5_rot % 16 -+ self.pack = pack -+ self.rw = rw -+ -+ def copy(self): -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __add__(self, i): -+ if not is_int(i): -+ raise Exception('can only add integer to loc') -+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __sub__(self, i): -+ if not is_int(i): -+ raise Exception('can only subtract integer from loc') -+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __cmp__(self, other): -+ if is_int(other): -+ return cmp(self.i, other) -+ if not isinstance(other, loc_t): -+ raise Exception('can only compare loc to integer or other loc') -+ if self.mux != other.mux: -+ return cmp(self.mux, other.mux) -+ if self.i != other.i: -+ return cmp(self.i, other.i) -+ if self.rot != other.rot: -+ return cmp(self.rot, other.rot) -+ if self.r5_rot != other.r5_rot: -+ return cmp(self.r5_rot, other.r5_rot) -+ return cmp(self.pack, other.pack) -+ -+ def is_r5(self): -+ return (self.mux == MUX_AC) and (self.i == 5) -+ -+ def shift(self, rot, left): -+ if isinstance(rot, loc_t) and rot.is_r5(): -+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack: -+ raise Exception('can\'t rotate by rotated/unpacked r5') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw) -+ if not is_int(rot): -+ raise Exception('can only rotate by integer or r5') -+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw) -+ -+ def __lshift__(self, rot): -+ return self.shift(rot, True) -+ -+ def __rshift__(self, rot): -+ return self.shift(rot, False) -+ -+ def __getattr__(self, name): -+ # discard the first character if it is an underscore. 
this is a total hack -+ # to allow packs starting with a digit to work -+ if name[0] == '_': -+ name = name[1:] -+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks): -+ if self.pack: -+ raise Exception('can\'t specify two packs') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw) -+ raise AttributeError() -+ -+ def __str__(self): -+ if self.mux == MUX_AC: -+ return 'r%d' % self.i -+ if self.mux == MUX_ANY: -+ return 'rany%d' % self.i -+ if self.mux == MUX_A: -+ return 'ra%d' % self.i -+ if self.mux == MUX_B: -+ return 'rb%d' % self.i -+ assert 0 -+ -+class sema_t: -+ def __init__(self, acq, i): -+ if not is_int(i): -+ raise Exception('semaphore index must be integer') -+ self.acq = acq -+ self.i = i -+ -+class label_t: -+ def __init__(self, rel, name, offset): -+ self.rel = rel -+ self.name = name -+ self.offset = offset -+ -+ def __add__(self, offset): -+ return label_t(self.rel, self.name, self.offset + offset) -+ -+ def __sub__(self, offset): -+ return label_t(self.rel, self.name, self.offset - offset) -+ -+class label_maker_t: -+ def __init__(self, rel): -+ self.rel = rel -+ -+ def __getattr__(self, name): -+ # we discard the first character. this is a total hack to allow numeric labels to work -+ if not re_label_ref_right.match(name[1:]): -+ raise Exception('invalid label reference') -+ return label_t(self.rel, name[1:], 0) -+ -+def bits(x, n): -+ if (x >> n) != 0: -+ raise Exception('%d doesn\'t fit in %d bits' % (x, n)) -+ return x -+ -+def bitsw(x, n): -+ if x == (1 << n): -+ x = 0 -+ return bits(x, n) -+ -+def bitsws(x, n): -+ if x == (1 << (n - 1)): -+ x = 0 -+ if -(1 << (n - 1)) <= x < 0: -+ x += 1 << n -+ return bits(x, n) -+ -+def vpm_setup(n, stride, addr, v2 = False): -+ horiz, laned, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if horiz: -+ if x != 0: -+ raise Exception('horizontal accesses must have x of 0') -+ else: -+ if (y & 0xf) != 0: -+ raise Exception('vertical accesses must be 16 row aligned') -+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) -+ if v2: -+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) | -+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size)) -+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) | -+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size)) -+ -+def vdw_setup_0(n, m, addr): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) | -+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size)) -+ -+def vdr_setup_0(n, m, addr, vpm_stride, stride): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if (stride < 8) or (stride & (stride - 1)): -+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride') -+ log2_stride = 3 -+ while (1 << log2_stride) != stride: -+ log2_stride += 1 -+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) | -+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) | -+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) -+ -+class allocator_t: -+ def __init__(self, *available): -+ self.available = list(available) -+ self.allocated = {} -+ self.reserved = [] -+ -+ def copy(self): -+ a = allocator_t() -+ a.available = self.available[:] -+ 
a.allocated = self.allocated.copy() -+ a.reserved = self.reserved[:] -+ return a -+ -+ def forget(self): -+ self.__init__(self.available + self.allocated.values() + self.reserved) -+ -+ def reserve(self, *rs): -+ for r in rs: -+ self.available.remove(r) -+ self.reserved.append(r) -+ -+ def retire(self, name): -+ r = self.allocated.pop(name) -+ del r.__invert__ -+ del r.retire -+ self.available.append(r) -+ return r -+ -+ def __getattr__(self, name): -+ if name not in self.allocated: -+ r = self.available.pop() -+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax -+ r.__invert__ = r.retire -+ self.allocated[name] = r -+ return self.allocated[name] -+ -+def pragma_allow_xor_0(x): -+ global allow_xor_0 -+ -+ if not isinstance(x, bool): -+ raise Exception('allow_xor_0 must be bool') -+ x, allow_xor_0 = allow_xor_0, x -+ return x -+ -+def pragma_dont_warn_when_mul_rot_inp_r5(x): -+ global dont_warn_when_mul_rot_inp_r5 -+ -+ if not isinstance(x, bool): -+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool') -+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x -+ return x -+ -+arg_defs = { -+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions) -+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER), -+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER), -+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ), -+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ), -+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE), -+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE), -+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE), -+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ), -+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ), -+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE), -+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE), -+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER), -+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER), -+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER), -+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER), -+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE), -+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE), -+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE), -+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE), -+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER), -+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ), -+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ), -+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE), -+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE), -+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ), -+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ), -+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE), -+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE), -+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER), -+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE), -+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE), -+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE), -+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE), -+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE), -+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE), -+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE), -+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE), -+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE), -+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE), -+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE), -+ -+ # semaphore acq/rel -+ 'sacq': 
lambda i: sema_t(True, i), -+ 'srel': lambda i: sema_t(False, i), -+ -+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label) -+ 'r_label_maker': label_maker_t(True), -+ 'a_label_maker': label_maker_t(False), -+ -+ # handy functions -+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0], -+ 'sqrt': math.sqrt, -+ 'sin': math.sin, -+ 'cos': math.cos, -+ 'atan2': math.atan2, -+ 'pi': math.pi, -+ 'rseed': random.seed, -+ 'rand': lambda: int(random.getrandbits(32)), -+ 'bits': bits, -+ 'bitsw': bitsw, -+ 'bitsws': bitsws, -+ -+ # handy vpm/vdw/vdr stuff -+ 'h32': lambda y: (1, 0, 0, y, 0, 0), -+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p), -+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p), -+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p), -+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p), -+ 'v32': lambda y, x: (0, 0, 0, y, x, 0), -+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p), -+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p), -+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p), -+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p), -+ 'dma_h32': lambda y, x: (1, 0, y, x, 0), -+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p), -+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p), -+ 'dma_v32': lambda y, x: (0, 0, y, x, 0), -+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p), -+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p), -+ 'vpm_setup': vpm_setup, -+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True), -+ 'vdw_setup_0': vdw_setup_0, -+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13), -+ 'vdr_setup_0': vdr_setup_0, -+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride -+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13), -+ -+ # annotations -+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)), -+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), -+ 'preserve_cond': ('preserve_cond', 1), -+ -+ # somewhat experimental register allocator -+ 'allocator_t': allocator_t, -+ -+ # pragmas -+ 'pragma_allow_xor_0': pragma_allow_xor_0, -+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5} -+ -+# accumulators and regs (regular names -- r0, ra0, etc) -+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6)) -+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+ -+def arg_eval(arg, sets): -+ s = (arg.strip().split('.', 1) + [None])[:2] -+ if s[0] == '-': -+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE) -+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings... -+ arg = re_pack.sub('._\\1', arg) -+ try: -+ # todo: i would like to be able to pass both arg_defs and sets in here -+ # (with sets hiding arg_defs in the case of conflicts), but the obvious -+ # dict(arg_defs, **sets) won't permit things such as: -+ # .set f, lambda x: y -+ # .set y, 4 -+ # (the y in the lambda will be looked up in the temporary dict we created -+ # when evaluating the f .set, which doesn't contain y) -+ # -+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the -+ # hiding behaviour, on an unset, we restore any hidden arg_defs value. 
-+ # also, before dumping sets at the end, we strip out the arg_defs stuff -+ # (this isn't entirely correct as we want to dump sets that are hiding -+ # arg_defs) -+ return eval(arg, sets) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while evaluating argument') -+ -+# doesn't check/fixup pack -+def check_and_fixup_loc(loc, read): -+ if (not read) and (loc.rw == RW_READ): -+ asm_error('writing to read-only hardware register') -+ if read and (loc.rw == RW_WRITE): -+ asm_error('reading from write-only hardware register') -+ if not read: -+ # conceptually, we are writing to a location rotated right by -+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by -+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location -+ loc.rot = -loc.rot % 16 -+ loc.r5_rot = -loc.r5_rot % 16 -+ if (loc.rot != 0) and (loc.r5_rot != 0): -+ asm_error('can\'t rotate by both r5 and immediate') -+ if (loc.r5_rot != 0) and (loc.r5_rot != 1): -+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) -+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later -+ if not read: -+ asm_error('target doesn\'t support write rotation') -+ if loc.mux == MUX_ANY: -+ loc.mux = MUX_A # can't do rotated read from regfile b -+ if loc.mux != MUX_A: -+ asm_error('rotation on read only allowed from regfile a') -+ if loc.i >= 32: -+ asm_warning('rotation only works from physical regfile') -+ if loc.mux == MUX_AC: -+ if (loc.i < 0) or (loc.i >= 6): -+ asm_error('reg out of range') -+ if not read: -+ if loc.i == 4: -+ asm_error('not allowed to write to r4') -+ if loc.i == 5: -+ -+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep') -+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): -+ if (loc.i < 0) or (loc.i >= 64): -+ asm_error('reg out of range') -+ else: -+ assert 0 -+ -+def get_dst(dst, sets): -+ if not dst: -+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 -+ dst = arg_eval(dst, sets) -+ if not isinstance(dst, loc_t): -+ asm_error('invalid dst') -+ dst = dst.copy() -+ check_and_fixup_loc(dst, False) -+ pack = get_pack(dst.pack) -+ if dst.mux == MUX_AC: -+ if pack[2] == PACK_MODE_A: -+ asm_warning('ra packing only works when writing to physical regfile') -+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation -+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32): -+ asm_warning('ra packing only works when writing to physical regfile') -+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_ANY: -+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_B: -+ if pack[2] == PACK_MODE_A: -+ asm_error('this packing operation can only be used for regfile a') -+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot -+ assert 0 -+ -+def get_src(src, sets): -+ if not src: -+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None -+ src = arg_eval(src, sets) -+ if isinstance(src, sema_t): -+ if not have_sema: -+ asm_error('target does not support semaphores') -+ if (src.i < 0) or (src.i >= 16): -+ asm_error('semaphore number must be in [0, 16)') -+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, label_t): -+ return (src.name, src.rel, src.offset), 
RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, list): -+ if len(src) != 16: -+ asm_error('vector immediate must have length 16') -+ src = src[:] -+ for i in xrange(16): -+ if not is_int(src[i]): -+ asm_error('all elements of vector immediate must be integers') -+ src[i] &= (1 << 32) - 1 -+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if is_int(src): -+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if not isinstance(src, loc_t): -+ asm_error('invalid src') -+ src = src.copy() -+ check_and_fixup_loc(src, True) -+ if mulw_rotate: -+ srot, sr5rot = 0, 0 -+ drot, dr5rot = src.rot, src.r5_rot -+ else: -+ srot, sr5rot = src.rot, src.r5_rot -+ drot, dr5rot = 0, 0 -+ if src.mux == MUX_AC: -+ if src.i == 4: -+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b -+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot -+ if src.mux == MUX_ANY: -+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot -+ if src.mux == MUX_B: -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ assert 0 -+ -+# signals -+########## -+ -+sigs = { -+ 'bkpt': SIG_BKPT, -+ 'thrsw': SIG_THRSW, -+ 'thrend': SIG_THREND, -+ 'sbwait': SIG_SBWAIT, -+ 'sbdone': SIG_SBDONE, -+ 'int': SIG_INT, -+ 'loadcv': SIG_LOADCV, -+ 'loadc': SIG_LOADC, -+ 'ldcend': SIG_LDCEND, -+ 'ldtmu0': SIG_LDTMU0, -+ 'ldtmu1': SIG_LDTMU1} -+ -+def get_sig(sig): -+ if sig not in sigs: -+ return SIG_NORMAL -+ return sigs[sig] -+ -+# annotations -+############## -+ -+def get_annots(annot, sets): -+ annots = arg_eval(annot, sets) -+ if isinstance(annots, list): -+ annots = annots[:] -+ else: -+ annots = [annots] -+ for i, annot in enumerate(annots): -+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or -+ (not is_int(annot[1]))): -+ asm_error('annotation must be (string, integer) pair, or a list of such pairs') -+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) -+ return annots -+ -+############################################################################### -+# core -+############################################################################### -+ -+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): -+ needfloat = PACK_TYPE_EITHER -+ havefloata = False -+ havefloatr4 = False -+ unpacka = None -+ unpackr4 = None -+ forcebs = [False, False, False, False] -+ forcerafloat = False -+ -+ pm = PACK_MODE_EITHER -+ for i in (0, 1, 2, 3): -+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB): -+ assert rpacks[i][0] == 0 -+ else: -+ if rpacks[i][2] == UNPACK_LOC_A: -+ if unpacka is None: -+ unpacka = rpacks[i][0] -+ elif unpacka != rpacks[i][0]: -+ asm_error('conflicting unpack operations on regfile a') -+ havefloata = havefloata or rfloats[i] -+ elif rpacks[i][2] == UNPACK_LOC_R4: -+ if unpackr4 is None: -+ unpackr4 = rpacks[i][0] -+ elif unpackr4 != rpacks[i][0]: -+ asm_error('conflicting unpack operations on r4') -+ havefloatr4 = havefloatr4 or rfloats[i] -+ else: -+ assert 0 -+ -+ if rpacks[i][1] != PACK_TYPE_EITHER: -+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]): 
-+ asm_error('conflicting unpack float requirements') -+ needfloat = rpacks[i][1] -+ for i in (0, 1, 2, 3): -+ if rpacks[i][2] == UNPACK_LOC_AB: -+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP): -+ forcebs[i] = True # non-nop unpack from regfile a. must use b -+ -+ if unpacka: -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: -+ havefloata = True -+ forcerafloat = True -+ havefloat = havefloata -+ else: -+ havefloat = havefloatr4 -+ -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat): -+ asm_error('float unpack operation used in integer alu operations') -+ if (needfloat == PACK_TYPE_INT) and havefloat: -+ asm_error('integer unpack operation used in float alu operation') -+ -+ unpack = 0 -+ if unpacka and unpackr4: -+ asm_error('cannot specify pack operation for both regfile a and r4') -+ if unpacka: -+ pm = PACK_MODE_A -+ unpack = unpacka -+ elif unpackr4: -+ pm = PACK_MODE_M -+ unpack = unpackr4 -+ -+ pack = 0 -+ if wpacks[0][2] == PACK_MODE_M: -+ asm_error('mul-unit pack operation used on add result') -+ for i in (0, 1): -+ if wpacks[i][2] == PACK_MODE_A: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_A -+ pack = wpacks[i][0] -+ elif wpacks[i][2] == PACK_MODE_M: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_M -+ pack = wpacks[i][0] -+ -+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]): -+ asm_error('float pack operation used with integer alu result') -+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]: -+ asm_error('integer pack operation used with float alu result') -+ -+ if pm == PACK_MODE_EITHER: -+ pm = PACK_MODE_A -+ return pm, pack, unpack, forcebs, forcerafloat -+ -+# immediates that can be encoded with SIG_SMALLIMMED -+bimms = {} -+bimms.update((i, i) for i in xrange(16)) -+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) -+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) -+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) -+ -+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): -+ if rmux == RMUX_SEMA: -+ asm_error('semaphore op can only be used with mov') -+ if rmux == RMUX_LABEL: -+ asm_error('label not allowed here') -+ if rmux == RMUX_IMMV: -+ asm_error('vector immediate can only be used with mov') -+ if rmux == RMUX_IMM: -+ if raddr not in bimms: -+ asm_error('can\'t encode immediate 0x%08x' % raddr) -+ raddr = bimms[raddr] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and immediates don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if rmux == RMUX_AC: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr -+ if rmux == RMUX_ANY: -+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if (not immb) and (raddr_b == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if raddr_a is None: -+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5)) -+ raddr_a = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if raddr_b is None: -+ assert not immb -+ raddr_b = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ asm_error('no free read slots') -+ if rmux == RMUX_A: -+ if (not mulw_rotate) and (raddr_a is not None) and ( -+ ((raddr[1] != 0) | ((raddr[2] != 
0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): -+ asm_error('conflicting rotations from regfile a') -+ if raddr_a is None: -+ raddr_a = raddr[0] -+ elif raddr_a != raddr[0]: -+ asm_error('can only read from one location in each regfile') -+ arot_r5 = raddr[2] -+ if raddr[1] == 0: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ raddr = 48 + raddr[1] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and rotation don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if rmux == RMUX_B: -+ if immb: -+ asm_error('regfile b and rotation/immediates don\'t mix') -+ if raddr_b is None: -+ raddr_b = raddr -+ elif raddr_b != raddr: -+ asm_error('can only read from one location in each regfile') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ assert 0 -+ -+# ok if: -+# - accumulator (r0-r3) -+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy, -+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it -+# was written by r5quad. so, by default, r5 isn't considered uniform. todo: -+# what about vr_wait/vw_wait/mutex? -+def read_rot_ok(rmux, raddr_a, raddr_b): -+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or -+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy -+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy -+ -+def asm_flush_prog_data(): -+ global prog_data -+ -+ while len(prog_data) & 7: -+ prog_data.append(0) -+ for i in xrange(0, len(prog_data), 8): -+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0), -+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {})) -+ prog_data = [] -+ -+def asm_line(sets, location, line): -+ global current_location, construct, nwarn_level -+ -+ prev_location = current_location -+ current_location = location -+ -+ try: -+ if construct != None: -+ if re_macro.match(line): -+ construct_stack.append(CONSTRUCT_MACRO) -+ elif re_if.match(line): -+ construct_stack.append(CONSTRUCT_IF) -+ elif re_rep.match(line): -+ construct_stack.append(CONSTRUCT_REP) -+ else: -+ else_m = line == '.else' -+ elif_m = re_elif.match(line) -+ if elif_m: -+ end_construct = CONSTRUCT_IF -+ else: -+ end_construct = { -+ '.endm': CONSTRUCT_MACRO, -+ '.else': CONSTRUCT_IF, -+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE, -+ '.endr': CONSTRUCT_REP}.get(line) -+ if end_construct is not None: -+ end_construct &= construct_stack.pop() -+ if end_construct == 0: -+ if elif_m: -+ asm_error('unexpected .elif') -+ asm_error('unexpected %s' % line) -+ if len(construct_stack) == 0: -+ lines = construct -+ construct = None -+ if end_construct == CONSTRUCT_MACRO: -+ return -+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE): -+ condition_if, condition_else = lines[0] -+ lines = lines[1:] -+ if condition_if: -+ for location, line in lines: -+ asm_line(sets, location, line) -+ if else_m: -+ construct = [(condition_else, False)] -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ if elif_m.group('set'): -+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets)) -+ else: -+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets) -+ condition_else = condition_else and (not condition_if) -+ construct = [(condition_if, condition_else)] 
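-+ # (illustrative) construct[0] now reads: (this .elif branch runs,
-+ # no branch has run yet so a later .elif/.else may still fire)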
-+ construct_stack.append(CONSTRUCT_IF) -+ return -+ if end_construct == CONSTRUCT_REP: -+ name, count = lines[0] -+ lines = lines[1:] -+ for i in xrange(count): -+ sets[name] = i -+ for location, line in lines: -+ asm_line(sets, location, line) -+ return -+ assert 0 -+ if else_m: -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ construct_stack.append(CONSTRUCT_IF) -+ construct.append((current_location, line)) -+ return -+ -+ if line in ('.endm', '.else', '.endif', '.endr'): -+ asm_error('unexpected %s' % line) -+ if re_elif.match(line): -+ asm_error('unexpected .elif') -+ -+ m = re_macro.match(line) -+ if m: -+ construct = [] -+ construct_stack.append(CONSTRUCT_MACRO) -+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct) -+ return -+ -+ m = re_if.match(line) -+ if m: -+ if m.group('set'): -+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets) -+ else: -+ # not not forces condition to a bool (this matters if condition is -+ # something mutable like a list) -+ condition = not not arg_eval(m.group('condition'), sets) -+ construct = [(condition, not condition)] -+ construct_stack.append(CONSTRUCT_IF) -+ return -+ -+ m = re_rep.match(line) -+ if m: -+ count = arg_eval(m.group('count'), sets) -+ if not is_int(count): -+ asm_error('.rep count must be integer') -+ construct = [(m.group('name'), count)] -+ construct_stack.append(CONSTRUCT_REP) -+ return -+ -+ m = re_include.match(line) -+ if m: -+ filename = arg_eval(m.group('filename'), sets) -+ if not isinstance(filename, str): -+ asm_error('expected string') -+ asm_file(sets, '%s: %s' % (current_location, filename), filename) -+ return -+ -+ m = re_set.match(line) -+ if m: -+ sets[m.group('name')] = arg_eval(m.group('val'), sets) -+ return -+ -+ m = re_unset.match(line) -+ if m: -+ name = m.group('name') -+ if name not in sets: -+ asm_error('%s not set' % name) -+ if name in arg_defs: # todo: see arg_eval -+ sets[name] = arg_defs[name] -+ else: -+ del sets[name] -+ return -+ -+ m = re_eval.match(line) -+ if m: -+ arg_eval(m.group('expr'), sets) -+ return -+ -+ m = re_print_info_warn_error.match(line) -+ if m: -+ def print_fn(message): -+ print message -+ def info_fn(message): -+ sys.stderr.write('%s\n' % message) -+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[ -+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets)) -+ return -+ -+ m = re_assert.match(line) -+ if m: -+ if not arg_eval(m.group('condition'), sets): -+ asm_error('assertion failure: \'%s\'' % m.group('condition')) -+ return -+ -+ m = re_data.match(line) -+ if m: -+ size = int(m.group('size')) -+ for datum in smart_split(m.group('data')): -+ datum = arg_eval(datum, sets) -+ if not is_int(datum): -+ asm_error('datum must be integer') -+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size)) -+ return -+ -+ m = re_macro_inst.match(line) -+ if m: -+ name = m.group('name') -+ if name in macros: -+ params, lines = macros[name] -+ args = smart_split(m.group('args')) -+ if len(args) > len(params): -+ asm_error('too many arguments to macro') -+ sets = sets.copy() -+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args))) -+ for param in params[len(args):]: -+ if param in sets: -+ if param in arg_defs: # todo: see arg_eval -+ sets[param] = arg_defs[param] -+ else: -+ del sets[param] -+ for location, line in lines: -+ asm_line(sets, '%s: %s' % (current_location, location), line) -+ return -+ -+ if line == '.pushnwarn': -+ nwarn_level += 1 -+ 
return -+ if line == '.popnwarn': -+ if nwarn_level == 0: -+ asm_error('.popnwarn without .pushnwarn') -+ nwarn_level -= 1 -+ return -+ -+ # everything below assumes prog is up to date -+ asm_flush_prog_data() -+ -+ m = re_label.match(line) -+ if m: -+ name = m.group('name') -+ if name[0].isdigit(): -+ labels.setdefault(name, []).append(len(prog)) -+ else: -+ if name[0] == ':': -+ undecorated_name = name[1:] -+ else: -+ undecorated_name = name -+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels): -+ asm_error('named label defined twice') -+ labels[name] = len(prog) -+ return -+ -+ annots = line.split('@') -+ ops = [op.strip() for op in annots[0].split(';')] -+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), []) -+ sig = get_sig(ops[-1]) -+ if sig != SIG_NORMAL: -+ ops = ops[:-1] -+ if len(ops) > 2: -+ asm_error('too many ops') -+ elif (len(ops) == 1) and (ops[0] == ''): -+ ops = [] -+ ops = (ops + ['nop', 'nop'])[:2] -+ m = re_op.match(ops[0]) -+ if not m: -+ asm_error('invalid syntax') -+ aop, aargs_n = get_aop(m.group('op')) -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ acond = get_bcond(m.group('cond')) -+ else: -+ acond = get_cond(m.group('cond')) -+ asf = get_setf(m.group('sf')) -+ aargs = smart_split(m.group('args')) -+ if len(aargs) != aargs_n: -+ asm_error('wrong operand count') -+ ard, ara, arb = (aargs + [None, None, None])[:3] -+ m = re_op.match(ops[1]) -+ if not m: -+ asm_error('invalid syntax') -+ mop, margs_n = get_mop(m.group('op')) -+ mcond = get_cond(m.group('cond')) -+ msf = get_setf(m.group('sf')) -+ margs = smart_split(m.group('args')) -+ if len(margs) != margs_n: -+ asm_error('wrong operand count') -+ mrd, mra, mrb = (margs + [None, None, None])[:3] -+ # eval srcs first so allocator can retire and reuse registers for dst -+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets) -+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets) -+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets) -+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets) -+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets) -+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets) -+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or -+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))): -+ asm_error('cannot have 2 arguments with different rotations') -+ if aarmux is not None: -+ awrot = (awrot + aadrot) % 16 -+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16 -+ if (awrot != 0) or awrot_r5: -+ asm_error('rotate not allowed on add write') -+ if marmux is not None: -+ mwrot = (mwrot + madrot) % 16 -+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16 -+ -+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI) -+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF) -+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes( -+ [aarpack, abrpack, marpack, mbrpack], -+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL], -+ aop == AOP_FTOI, -+ [awpack, mwpack], -+ [afloatw, mop == MOP_FMUL]) -+ if forcebs[0]: -+ aarmux = RMUX_B -+ if forcebs[1]: -+ abrmux = RMUX_B -+ if forcebs[2]: -+ marmux = RMUX_B -+ if forcebs[3]: -+ mbrmux = RMUX_B -+ -+ # extend nops to 3 operands -+ if aop == AOP_NOP: -+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC -+ if mop == MOP_NOP: -+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 
0, RMUX_AC, 0, RMUX_AC -+ -+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand) -+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ): -+ if forcerafloat: -+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand -+ # instead of duplicating the 2nd operand, take the ra operand from -+ # the mul op thus forcing the ra value to be considered a float for -+ # the purposes of unpacking -+ if marmux == RMUX_A: -+ abraddr, abrmux = maraddr, marmux -+ else: -+ assert mbrmux == RMUX_A -+ abraddr, abrmux = mbraddr, mbrmux -+ else: -+ abraddr, abrmux = aaraddr, aarmux -+ else: -+ assert not forcerafloat # can only forcerafloat if we have an unused operand -+ -+ # handle write addrs -+ if (awmux == mwmux) and (awmux != WMUX_ANY): -+ asm_error('add/mul ops not allowed to write to same regfile') -+ ws = (awmux == WMUX_B) or (mwmux == WMUX_A) -+ -+ # handle branch -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ # check setf -+ if asf: -+ asm_error('setf not allowed on bra/brr') -+ -+ # check pack/unpack -+ if (pack != 0) or (unpack != 0): -+ asm_error('pack/unpack not allowed with bra/brr') -+ -+ # handle read address -+ if aarmux == RMUX_LABEL: -+ if (aop == AOP_BRA) and aaraddr[1]: -+ asm_warning('bra with rel label') -+ if (aop == AOP_BRR) and (not aaraddr[1]): -+ asm_warning('brr with abs label') -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ if aarmux == RMUX_ANY: -+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A -+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A): -+ asm_error('branch destination must be either label, immediate, or from regfile a') -+ if aarmux == RMUX_IMM: -+ imm = aaraddr -+ raddr = 0 # can't use RADDR_NOP -+ elif aarmux == RMUX_A: -+ if (aaraddr[1] != 0) or (aaraddr[2] != 0): -+ asm_error('rotation of read from regfile a not allowed with branch') -+ if aop == AOP_BRR: -+ asm_warning('brr with ra') -+ imm = 0 -+ raddr = aaraddr[0] -+ else: -+ assert 0 -+ -+ # check mul op is nop -+ if mop != MOP_NOP: -+ asm_error('mul op not allowed with branch') -+ -+ # check sig -+ if sig != SIG_NORMAL: -+ asm_error('no signal allowed with branch') -+ -+ if raddr >= 32: -+ asm_error('can only branch to register locations in physical regfile') -+ if raddr & 1: -+ asm_warning('branch instruction will destroy flags (see hw-2780)') -+ -+ # construct branch instruction -+ prog.append((imm, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28), -+ line, annots)) -+ -+ return -+ -+ # use COND_NEVER when possible (might save power / allow mul setf) -+ if not dict(annots).get('preserve_cond', 0): -+ if (awaddr == WADDR_NOP) and (not asf): -+ acond = COND_NEVER -+ if (mwaddr == WADDR_NOP) and (not msf): -+ mcond = COND_NEVER -+ -+ # attempt to convert movs to ldi -+ if (# no mul setf -+ (not msf) and -+ # ops must either be nop or mov of sema/label/imm/immv -+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ # but we don't want 2 nops -+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and -+ # if both ops are movs, srcs must be identical -+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and -+ # no signal -+ (sig == SIG_NORMAL)): -+ # make sure aarmux/aaraddr contains the value -+ if aop != AOP_MOV: -+ aarmux = marmux 
-+ aaraddr = maraddr -+ -+ # convert immediate -+ if aarmux == RMUX_SEMA: -+ ldi_mode = LDI_SEMA -+ elif aarmux == RMUX_LABEL: -+ ldi_mode = LDI_32 -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ elif aarmux == RMUX_IMMV: -+ signed, unsigned = True, True -+ imm = 0 -+ for i, elem in enumerate(aaraddr): -+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1): -+ signed = False -+ if elem not in (0, 1, 2, 3): -+ unsigned = False -+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i)) -+ if not (signed or unsigned): -+ asm_error('can\'t encode vector immediate') -+ if signed: -+ ldi_mode = LDI_EL_SIGNED -+ else: -+ ldi_mode = LDI_EL_UNSIGNED -+ aaraddr, aarmux = imm, RMUX_IMM -+ elif aarmux == RMUX_IMM: -+ ldi_mode = LDI_32 -+ else: -+ assert 0 -+ -+ # construct ldi instruction -+ prog.append((aaraddr, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28), -+ line, annots)) -+ -+ return -+ -+ # convert movs to alu ops -+ if aop == AOP_MOV: -+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0): -+ aop = AOP_XOR -+ aaraddr, aarmux = 0, RMUX_AC -+ abraddr, abrmux = 0, RMUX_AC -+ else: -+ aop = AOP_OR -+ abraddr, abrmux = aaraddr, aarmux -+ if mop == MOP_MOV: -+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0): -+ mop = MOP_V8SUBS -+ maraddr, marmux = 0, RMUX_AC -+ mbraddr, mbrmux = 0, RMUX_AC -+ else: -+ mop = MOP_V8MIN -+ mbraddr, mbrmux = maraddr, marmux -+ -+ # normal alu instruction... -+ -+ # handle setf -+ if asf and (aop == AOP_NOP): -+ asm_error('nop.setf is not allowed in add pipe') -+ if msf and (mop == MOP_NOP): -+ asm_warning('nop.setf, really?') -+ if (aop == AOP_NOP) or (acond == COND_NEVER): -+ sf = msf -+ else: -+ if msf: -+ asm_error('setf only allowed on mul op if add op is nop or add condition is never') -+ sf = asf -+ -+ # handle read addrs -+ raddr_a = None -+ raddr_b = None -+ immb = False -+ arot_r5 = False -+ muxes = [0, 0, 0, 0] -+ if mwrot != 0: -+ raddr_b = 48 + mwrot -+ immb = True -+ if mwrot_r5 and have_am: -+ raddr_b = 48 -+ immb = True -+ for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last -+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux): -+ if f(rmux): -+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux) -+ add_a, add_b, mul_a, mul_b = muxes -+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)): -+ # some output elements might not be as expected -+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)): -+ bad_elems = 0xffff -+ else: -+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111 -+ if mwrot > 12: -+ bad_elems ^= 0xffff -+ bad_elems &= dict(annots).get('mul_used', 0xffff) -+ if not msf: -+ if mwaddr == WADDR_NOP: -+ # not writing anywhere and not setting flags. no elements used -+ bad_elems = 0 -+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or -+ ((not ws) and (mwaddr == 37))): -+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/ -+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags. -+ # only use element 0 -+ bad_elems &= 0x0001 -+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or -+ ((not ws) and (mwaddr == 42))): -+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting -+ # flags. 
only use elements 0, 4, 8, and 12 -+ bad_elems &= 0x1111 -+ if bad_elems: -+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected') -+ if raddr_a is None: -+ raddr_a = RADDR_NOP -+ if raddr_b is None: -+ raddr_b = RADDR_NOP -+ if immb: -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates and signal don\'t mix') -+ sig = SIG_SMALLIMMED -+ if arot_r5 or (mwrot_r5 and (not have_am)): -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates/signal don\'t mix') -+ sig = SIG_ROTATE -+ -+ # construct instruction -+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29), -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28), -+ line, annots)) -+ finally: -+ current_location = prev_location -+ -+def preprocess_passthrough(file): -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ yield line_number, line -+ -+def asm_file(sets, location, filename, preprocess = None): -+ global current_dir, current_location -+ -+ if filename is None: -+ location = '' -+ file = sys.stdin -+ -+ prev_dir = current_dir -+ else: -+ filename = os.path.normpath(os.path.join(current_dir, filename)) -+ -+ try: -+ file = open(filename) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while opening file %s' % filename) -+ -+ prev_dir = current_dir -+ current_dir = os.path.dirname(filename) -+ -+ prev_location = current_location -+ current_location = location -+ -+ if preprocess is None: -+ preprocess = preprocess_passthrough -+ -+ try: -+ for line_number, line in preprocess(file): -+ # strip off comments and whitespace -+ line = line.split('#')[0].strip() -+ if line == '': -+ continue -+ -+ asm_line(sets, '%s: %d' % (current_location, line_number), line) -+ finally: -+ current_dir = prev_dir -+ current_location = prev_location -+ -+def asm_end_prog(): -+ # check we aren't in a multi-line construct (eg .macro or .rep) -+ if construct != None: -+ asm_error({ -+ CONSTRUCT_MACRO: '.macro without .endm', -+ CONSTRUCT_IF: '.if/.elif without .endif', -+ CONSTRUCT_ELSE: '.else without .endif', -+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]]) -+ -+ # check no warnings level back to 0 -+ if nwarn_level != 0: -+ asm_error('.pushnwarn without .popnwarn') -+ -+ # flush queued up data -+ asm_flush_prog_data() -+ -+ # fixup all the label references we can -+ for pc in xrange(len(prog)): -+ if isinstance(prog[pc][0], tuple): -+ location, label, rel, offset = prog[pc][0] -+ if label[0].isdigit(): -+ label_pcs = labels.get(label[:-1], []) -+ if label[-1] == 'b': -+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] -+ else: -+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] -+ if label_pcs == []: -+ asm_error('search for label reached begin/end of file', location = location) -+ imm = label_pcs[0] -+ elif label in labels: -+ imm = labels[label] -+ elif (':' + label) in labels: -+ imm = labels[':' + label] -+ elif external_link: -+ continue # let the external linker deal with it -+ else: -+ asm_error('undefined label', location = location) -+ imm = (imm * 8) + offset -+ if rel: -+ imm -= (pc + 4) * 8 # relative to instruction after delay slots -+ imm &= (1 << 32) - 1 -+ else: -+ if not external_link: -+ asm_error('can\'t get absolute address without using an external linker. 
this mode doesn\'t have an external linker', location = location) -+ imm = (location, label, rel, offset, imm) -+ prog[pc] = (imm,) + prog[pc][1:] -+ -+def asm_init(): -+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level -+ -+ current_dir = os.getcwd() -+ current_location = '' -+ prog = [] -+ prog_data = [] -+ macros = { -+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]), -+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])} -+ labels = {} -+ construct = None -+ construct_stack = [] -+ nwarn_level = 0 -+ -+def asm_reset_prog(): -+ global prog, labels -+ -+ prog = [] -+ labels = {} -+ -+############################################################################### -+# dumping -+############################################################################### -+ -+def print_lines(lines): -+ for line in lines: -+ print line -+ -+class dumper_t: -+ def external_link(self): return False -+ def begin(self): pass -+ def label(self, pc, name): pass -+ def line(self, pc, ls, ms, line, annots, first): pass -+ def end(self): pass -+ def sets(self, sets): pass -+ def direct(self, line): pass -+ -+class clif_dumper_t(dumper_t): -+ def __init__(self): -+ self.annot_mode = 0 -+ -+ def external_link(self): -+ return True -+ -+ def parse_annot_mode(self, line): -+ l = line.split(',') -+ self.annot_mode = int(l[0]) -+ if self.annot_mode not in (0, 1, 2): -+ asm_error('bad annot mode') -+ if self.annot_mode == 2: -+ if len(l) != 2: -+ asm_error('expected buffer name') -+ self.annot_name = l[1].strip() -+ self.annot_offset = 0 -+ elif len(l) != 1: -+ asm_error('unexpected comma') -+ -+ def label(self, pc, name): -+ if (self.annot_mode != 1) and (name[0] == ':'): -+ if self.annot_mode == 2: -+ name = name + '_annotations' -+ print '@label %s' % name[1:] -+ else: -+ print '// :%s' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if self.annot_mode == 0: -+ if isinstance(ls, tuple): -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ ls = '[. 
- %d + %d]' % (pc * 8, offset_from_prog)
-+                else:
-+                    location, label, rel, offset = ls
-+                    if rel:
-+                        asm_error('relative external label references not allowed in this mode', location = location)
-+                    ls = '[%s + %d]' % (label, offset)
-+            else:
-+                ls = '0x%08x' % ls
-+            print '%s 0x%08x // %s' % (ls, ms, line)
-+        elif self.annot_mode == 1:
-+            print '// %s' % line
-+            for annot in annots:
-+                print '0x%08x 0x%08x // %s' % ({
-+                    # todo: would rather not have these hard coded
-+                    'mul_used': 1,
-+                    'preserve_cond': 2,
-+                    'geomd_open': 3,
-+                    'geomd_i': 4,
-+                    'geomd_tris_clear': 5,
-+                    'geomd_verts': 6,
-+                    'geomd_tris_add': 7,
-+                    'geomd_tris_set_center': 8,
-+                    'geomd_region_clear': 9,
-+                    'geomd_region_set': 10,
-+                    'geomd_images_clear': 11,
-+                    'geomd_images_l': 12,
-+                    'geomd_images_b': 13,
-+                    'geomd_images_r': 14,
-+                    'geomd_images_t': 15,
-+                    'geomd_images_add_vpm': 16,
-+                    'trace_4c': 17,
-+                    'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0])
-+            if len(annots) != 0:
-+                print '0x00000000 // end'
-+        else:
-+            assert self.annot_mode == 2
-+            if len(annots) == 0:
-+                print '0x00000000 // %s' % line
-+            else:
-+                print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
-+                self.annot_offset += (len(annots) * 8) + 4
-+
-+    def direct(self, line):
-+        print line
-+
-+class plain_dumper_t(dumper_t):
-+    def line(self, pc, ls, ms, line, annots, first):
-+        print '0x%08x, 0x%08x, // %s' % (ls, ms, line)
-+
-+class c_c_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, array_name):
-+        self.header_name = header_name
-+        self.array_name = array_name
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        self.external_labels = set()
-+        self.lines = []
-+
-+        print '#include "%s.h"' % self.header_name
-+        print ''
-+        print '#ifdef _MSC_VER'
-+        print ' #include <stdint.h>'
-+        print ' /* cast through uintptr_t to avoid warnings */'
-+        print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
-+        print '#else'
-+        print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))'
-+        print '#endif'
-+        print ''
-+        print '#ifdef __cplusplus'
-+        print 'extern "C" { /* the types are probably wrong... */'
-+        print '#endif'
-+
-+    def label(self, pc, name):
-+        self.lines.append('// :%s' % name)
-+
-+    def line(self, pc, ls, ms, line, annots, first):
-+        if isinstance(ls, tuple):
-+            if len(ls) == 5:
-+                location, label, rel, offset, offset_from_prog = ls
-+                assert not rel
-+                ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
-+            else:
-+                location, label, rel, offset = ls
-+                if rel:
-+                    asm_error('relative external label references not allowed in this mode', location = location)
-+                if label not in self.external_labels:
-+                    self.external_labels.add(label)
-+                    print 'extern uint8_t %s[];' % label
-+                ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
-+        else:
-+            ls = '0x%08x' % ls
-+        self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
-+
-+    def end(self):
-+        print '#ifdef __cplusplus'
-+        print '}'
-+        print '#endif'
-+        print ''
-+        print '#ifdef _MSC_VER'
-+        print '__declspec(align(8))'
-+        print '#elif defined(__GNUC__)'
-+        print '__attribute__((aligned(8)))'
-+        print '#endif'
-+        print 'unsigned int %s[] = {' % self.array_name
-+        print_lines(self.lines)
-+        print '};'
-+        print '#ifdef __HIGHC__'
-+        print '#pragma Align_to(8, %s)' % self.array_name
-+        print '#endif'
-+
-+class c_h_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, array_name):
-+        self.full_header_name = full_header_name
-+        self.array_name = array_name
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        print '#ifndef %s_H' % self.full_header_name
-+        print '#define %s_H' % self.full_header_name
-+        print ''
-+        print 'extern unsigned int %s[];' % self.array_name
-+        print ''
-+
-+    def label(self, pc, name):
-+        if name[0] == ':':
-+            print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2)
-+
-+    def end(self):
-+        print ''
-+        print '#endif'
-+
-+class ml_c_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, name, annots):
-+        self.header_name = header_name
-+        self.name = name
-+        self.annots = annots
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        if self.annots:
-+            self.annot_lines = []
-+        self.lines = []
-+        self.external_labels = set()
-+        self.link_lines = []
-+
-+        print '#include "%s.h"' % self.header_name
-+        print '#include <assert.h>'
-+        if self.annots:
-+            print '#ifdef SIMPENROSE'
-+            print '#include <stddef.h>'
-+            print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
-+        print ''
-+
-+    def label(self, pc, name):
-+        self.lines.append('// :%s' % name)
-+
-+    def line(self, pc, ls, ms, line, annots, first):
-+        if self.annots:
-+            if len(annots) == 0:
-+                self.annot_lines.append('NULL,')
-+            else:
-+                print 'static unsigned int const annotations_%d[] = {' % pc
-+                for annot in annots:
-+                    print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
-+                print ' SIMPENROSE_SHADER_ANNOTATION_END};'
-+                print ''
-+                self.annot_lines.append('annotations_%d,' % pc)
-+        if isinstance(ls, tuple):
-+            self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2))
-+            if len(ls) == 5:
-+                location, label, rel, offset, offset_from_prog = ls
-+                assert not rel
-+                self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog))
-+            else:
-+                location, label, rel, offset = ls
-+                self.external_labels.add(label)
-+                if rel:
-+                    self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8))
-+                else:
-+                    self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset))
-+            ls = '0xdeadbeef'
-+        else:
-+            ls = '0x%08x' % ls
-+        self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc
* 8, ls, ms, line)) -+ -+ def end(self): -+ if self.annots: -+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name -+ print_lines(self.annot_lines) -+ print '};' -+ print '#endif' -+ print '' -+ print 'static unsigned int const array[] = {' -+ print_lines(self.lines) -+ print '};' -+ print '' -+ print 'void %s_link(void *p_in, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' )' -+ print '{' -+ print ' unsigned int *p = (unsigned int *)p_in;' -+ print ' unsigned int i;' -+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper() -+ print ' p[i] = array[i];' -+ print ' }' -+ print_lines(self.link_lines) -+ print '}' -+ -+class ml_h_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, name, annots): -+ self.full_header_name = full_header_name -+ self.name = name -+ self.annots = annots -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ self.external_labels = set() -+ self.lines_n = 0 -+ -+ print '#ifndef %s_H' % self.full_header_name -+ print '#define %s_H' % self.full_header_name -+ print '' -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name -+ print '#endif' -+ print '' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8) -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc) -+ print '#endif' -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple) and (len(ls) != 5): -+ self.external_labels.add(ls[1]) -+ self.lines_n += 1 -+ -+ def end(self): -+ print '' -+ print 'extern void %s_link(void *p, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' );' -+ print '' -+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8)) -+ print '' -+ print '#endif' -+ -+def print_lines_lc(lines): -+ for line in lines: -+ print '%s \\' % line -+ -+def print_groups_lc(groups): -+ first = True -+ for group in groups: -+ if first: -+ print '{ \\' -+ else: -+ print ', { \\' -+ print_lines_lc(group) -+ print '} \\' -+ first = False -+ -+class inline_c_dumper_t(dumper_t): -+ def __init__(self, annots): -+ self.annots = annots -+ self.iteration = False -+ -+ def begin_iteration(self): -+ assert not self.iteration -+ self.iteration = True -+ self.iteration_lines = [] -+ if self.annots: -+ self.iteration_annot_lines = [] -+ self.annot_arrs = [] -+ -+ def end_iteration(self): -+ assert self.iteration -+ self.iteration = False -+ print '%d, \\' % self.iteration_n -+ if self.annots: -+ print '( \\' -+ print_groups_lc(self.iteration_lines) -+ if self.annots: -+ print '), ( \\' -+ print_groups_lc(self.iteration_annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def begin(self): -+ self.n = 0 -+ self.lines = [] -+ if self.annots: -+ self.annot_lines = [] -+ if not self.iteration: -+ self.annot_arrs = [] -+ -+ def label(self, pc, name): -+ self.lines.append('/* :%s */' % name) -+ if self.annots: -+ self.annot_lines.append('/* :%s */' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ self.n += 1 -+ if first: -+ prefix = '' -+ else: -+ prefix = ', ' -+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line)) -+ if self.annots: -+ if len(annots) == 
0: -+ a = 'NULL' -+ else: -+ a = 'annotations_%d' % len(self.annot_arrs) -+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)] -+ for annot in annots: -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])) -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};') -+ self.annot_arrs.append(annot_arr) -+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line)) -+ -+ def end(self): -+ if self.iteration: -+ if len(self.iteration_lines) == 0: -+ self.iteration_n = self.n -+ elif self.iteration_n != self.n: -+ asm_error('number of instructions differs between iterations') -+ self.iteration_lines.append(self.lines) -+ if self.annots: -+ self.iteration_annot_lines.append(self.annot_lines) -+ else: -+ if self.annots: -+ print '( \\' -+ print_lines_lc(self.lines) -+ if self.annots: -+ print '), ( \\' -+ print_lines_lc(self.annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def direct(self, line): -+ print line -+ -+class asvc_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '.align 8' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '%s::' % name[1:] -+ else: -+ print '%s:' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple): -+ location, label, rel, offset = ls[:4] -+ if rel: -+ ls = '%s + %d - (. + 32)' % (label, offset) -+ else: -+ ls = '%s + %d' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ print '.word %s, 0x%08x ; %s' % (ls, ms, line) -+ -+def is_ra_or_rb(val): -+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B)) -+ -+class aliases_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '#ifndef JUST_DQASM_ARGS' -+ -+ def label(self, pc, name): -+ if not name[0].isdigit(): -+ if name[0] == ':': -+ name = name[1:] -+ print '"bs%s", "bs%x",' % (name, pc * 8) -+ print '"bu%s", "bu%x",' % (name, pc * 8) -+ -+ def end(self): -+ print '#endif' -+ -+ # todo: handle things other than ra and rb? 
dqasm only allows ra and rb atm -+ def sets(self, sets): -+ dqasm_args = [] -+ print '#ifndef JUST_DQASM_ARGS' -+ for name in sets: -+ if is_ra_or_rb(sets[name]): -+ dqasm_args.append('-r%s=%s' % (sets[name], name)) -+ print '"%s", "%s",' % (name, sets[name]) -+ elif isinstance(sets[name], list): -+ for i, val in enumerate(sets[name]): -+ if is_ra_or_rb(val): -+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i)) -+ print '"%s[%d]", "%s",' % (name, i, val) -+ print '#endif' -+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) -+ -+def dump(dumper): -+ if (len(prog) != 0) or (len(labels) != 0): -+ dumper.begin() -+ -+ sorted_labels = [] -+ for name in labels: -+ if name[0].isdigit(): -+ for pc in labels[name]: -+ sorted_labels.append((pc, name)) -+ else: -+ sorted_labels.append((labels[name], name)) -+ sorted_labels.sort(reverse = True) -+ -+ first = True -+ for pc in xrange(len(prog)): -+ ls, ms, line, annots = prog[pc] -+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc): -+ dumper.label(*sorted_labels.pop()) -+ dumper.line(pc, ls, ms, line, annots, first) -+ first = False -+ for sorted_label in sorted_labels: -+ assert sorted_label[0] == len(prog) -+ dumper.label(*sorted_label) -+ -+ dumper.end() -+ -+############################################################################### -+# preprocessing -+############################################################################### -+ -+def preprocess_inline_c(dumper): -+ def preprocess(file): -+ ls = None -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ while True: -+ if ls is None: -+ l = line.split('%[', 1) -+ if len(l) == 1: -+ dumper.direct(l[0].rstrip()) -+ break -+ dumper.direct('%s \\' % l[0].rstrip()) -+ line = l[1] -+ ls = [] -+ else: -+ l = line.split('%]', 1) -+ ls.append((line_number, l[0])) -+ if len(l) == 1: -+ break -+ line = l[1] -+ l = ls[-1][1].split('%|', 1) -+ if len(l) == 1: -+ for l_number, l in ls: -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ else: -+ ls[-1] = (ls[-1][0], l[0]) -+ if hasattr(dumper, 'begin_iteration'): -+ dumper.begin_iteration() -+ for repls in l[1].split('%,'): -+ repls = [repl.strip() for repl in repls.split('%/')] -+ for l_number, l in ls: -+ for i, repl in enumerate(repls): -+ l = l.replace('%' + str(i), repl) -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if hasattr(dumper, 'end_iteration'): -+ dumper.end_iteration() -+ ls = None -+ return preprocess -+ -+def preprocess_clif(dumper): -+ def preprocess(file): -+ in_asm = False -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ if in_asm: -+ if line.strip() == '%]': -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ in_asm = False -+ else: -+ yield line_number, line -+ else: -+ if line.strip() == '%[': -+ in_asm = True -+ elif (line[:1] == '%') and (line[:2] != '%@'): -+ yield line_number, line[1:] -+ else: -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if line[:2] == '%@': -+ if hasattr(dumper, 'parse_annot_mode'): -+ dumper.parse_annot_mode(line[2:]) -+ else: -+ dumper.direct(line.rstrip()) -+ return preprocess -+ -+############################################################################### -+# main -+############################################################################### -+ -+def main(): -+ global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5 -+ global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate -+ -+ asm_init() # do this first so we can use asm_error without having to 
pass a location and so asm_warning will work
-+
-+    # parse command line
-+    parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
-+    parser.add_option('-m', '--mode', dest = 'mode',
-+        help = '<mode> should be clif, plain, ' +
-+        'c_c:<header name>,<full header name>,<array name>, ' +
-+        'c_h:<header name>,<full header name>,<array name>, ' +
-+        'ml_c:<header name>,<full header name>,<name>[,annots], ' +
-+        'ml_h:<header name>,<full header name>,<name>[,annots], ' +
-+        'inline_c[:annots], asvc, or aliases[:inline_c]', metavar = '<mode>')
-+    parser.add_option('-t', '--target', dest = 'target',
-+        help = '<target> should be a0, b0, or hera', metavar = '<target>')
-+    parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
-+    parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
-+    parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
-+    parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
-+    parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
-+    options, args = parser.parse_args()
-+    if len(args) == 0:
-+        filename = None
-+    elif len(args) == 1:
-+        filename = args[0]
-+    else:
-+        parser.print_help()
-+        sys.exit(-1)
-+
-+    # handle mode
-+    mode = options.mode or 'clif' # assume clif if no mode specified
-+    if mode == 'clif':
-+        dumper = clif_dumper_t()
-+        preprocess = preprocess_clif(dumper)
-+    elif mode == 'plain':
-+        dumper = plain_dumper_t()
-+        preprocess = None
-+    elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
-+        mode_options = mode[4:].split(',')
-+        if len(mode_options) != 3:
-+            asm_error('badly formatted mode on command line')
-+        dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
-+        preprocess = None
-+    elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
-+        mode_options = mode[5:].split(',')
-+        if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
-+            asm_error('badly formatted mode on command line')
-+        dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
-+            }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
-+        preprocess = None
-+    elif mode == 'inline_c':
-+        dumper = inline_c_dumper_t(False)
-+        preprocess = preprocess_inline_c(dumper)
-+    elif mode == 'inline_c:annots':
-+        dumper = inline_c_dumper_t(True)
-+        preprocess = preprocess_inline_c(dumper)
-+    elif mode == 'asvc':
-+        dumper = asvc_dumper_t()
-+        preprocess = None
-+    elif mode == 'aliases':
-+        dumper = aliases_dumper_t()
-+        preprocess = None
-+    elif mode == 'aliases:inline_c':
-+        dumper = aliases_dumper_t()
-+        preprocess = preprocess_inline_c(dumper)
-+    else:
-+        asm_error('invalid mode')
-+    external_link = dumper.external_link()
-+
-+    # handle target
-+    target = options.target or 'b0' # assume b0 if no target specified
-+    if target == 'a0':
-+        have_sema = False
-+        have_am = False
-+        mulw_rotate = False
-+        have_lthrsw = False
-+    elif target == 'b0':
-+        have_sema = True
-+        have_am = True
-+        mulw_rotate = True
-+        have_lthrsw = True
-+    elif target == 'hera':
-+        have_sema = True
-+        have_am = False
-+        mulw_rotate = True
-+        have_lthrsw = True
-+    else:
-+        asm_error('invalid target')
-+    if have_am:
-+        sigs['loadam'] = SIG_LOADAM
-+        arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
-+    if have_lthrsw:
-+        sigs['lthrsw'] = SIG_LTHRSW
-+        del sigs['int']
-+        arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
-+
-+    # handle misc options
-+    allow_xor_0 = options.allow_xor_0
-+    dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
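-+    # these two can also be flipped mid-assembly: pragma_allow_xor_0 and
-+    # pragma_dont_warn_when_mul_rot_inp_r5 are exposed in arg_defs, e.g.
-+    # via a .eval directive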
-+    warnings_are_errors = options.warnings_are_errors
-+    disable_warnings = options.disable_warnings
-+
-+    # make options visible to asm
-+    arg_defs['mode'] = mode
-+    arg_defs['target'] = target
-+
-+    # arg_defs all setup at this point
-+    sets = arg_defs.copy() # todo: see arg_eval
-+
-+    # handle command line sets
-+    re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
-+    for options_set in options.sets:
-+        m = re_options_set.match(options_set)
-+        if not m:
-+            asm_error('badly formatted set on command line')
-+        sets[m.group('name')] = arg_eval(m.group('val'), sets)
-+
-+    # assemble input file and dump
-+    asm_file(sets, filename, filename, preprocess)
-+    asm_end_prog()
-+    dump(dumper)
-+    for name in arg_defs: # todo: see arg_eval
-+        del sets[name]
-+    dumper.sets(sets)
-+
-+if __name__ == '__main__':
-+    main()
-diff --git b/pi-util/qem.sh a/pi-util/qem.sh
-new file mode 100644
-index 0000000..20ce7ee
---- /dev/null
-+++ a/pi-util/qem.sh
-@@ -0,0 +1,8 @@
++++ b/pi-util/qem.sh
+@@ -0,0 +1,9 @@
+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+QASM=python\ pi-util/qasm.py
+SRC_FILE=libavcodec/rpi_shader.qasm
+DST_BASE=shader
+
++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR
+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+
-diff --git b/pi-util/rebase_liblinks.py a/pi-util/rebase_liblinks.py
-new file mode 100755
-index 0000000..6a9a33f
---- /dev/null
-+++ a/pi-util/rebase_liblinks.py
-@@ -0,0 +1,37 @@
-+#!/usr/bin/env python
-+
-+import os, sys
-+from stat import *
-+
-+def walktree(top, callback, n, prefix):
-+    '''recursively descend the directory tree rooted at top,
-+    calling the callback function for each regular file'''
-+
-+    for f in os.listdir(top):
-+        pathname = os.path.join(top, f)
-+        mode = os.lstat(pathname).st_mode
-+        if S_ISDIR(mode):
-+            # It's a directory, recurse into it
-+            walktree(pathname, callback, n+1, prefix)
-+        elif S_ISLNK(mode):
-+            # It's a symlink, call the callback function
-+            callback(pathname, os.readlink(pathname), n, prefix)
-+
-+def visitfile(file, linkname, n, prefix):
-+    if (linkname.startswith(prefix + 'lib/')):
-+        newlink = "../" * n + linkname[len(prefix):]
-+        print 'relinking', file, "->", newlink
-+        os.remove(file)
-+        os.symlink(newlink, file)
-+
-+if __name__ == '__main__':
-+    argc = len(sys.argv)
-+    if argc == 2:
-+        walktree(sys.argv[1], visitfile, 0, "/")
-+    elif argc == 3:
-+        walktree(sys.argv[1], visitfile, 0, sys.argv[2])
-+    else:
-+        print "rebase_liblinks.py <dir> [<prefix>]"
-+
-+
-+
-diff --git b/pi-util/syncroot.sh a/pi-util/syncroot.sh
-new file mode 100755
-index 0000000..d8bdd91
---- /dev/null
-+++ a/pi-util/syncroot.sh
-@@ -0,0 +1,43 @@
-+set -e
-+
-+if [ "$1" == "" ]; then
-+  echo Usage: $0 \<src_dir\> [\<rootname\>]
-+  echo src_dir is a source for rsync so may contain m/c name.
-+ echo rootname will be set to \"raspian_jessie_pi1\" if missing -+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1 -+ exit 1 -+fi -+ -+SYSROOT_NAME=$2 -+if [ "$SYSROOT_NAME" == "" ]; then -+ SYSROOT_NAME=raspian_jessie_pi1 -+fi -+ -+DST_ROOT=`pwd` -+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot -+SRC=$1 -+ -+echo Sync src: $SRC -+echo Sync dest: $DST -+ -+mkdir -p $DST/lib -+mkdir -p $DST/opt/vc/include -+mkdir -p $DST/usr/lib/pkgconfig -+mkdir -p $DST/usr/bin -+mkdir -p $DST/usr/share -+ -+#### MUST NOT include /opt/vc/include/*GL* -+# Creates conflicts with GL includes inside Chrome -+ -+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib -+rsync -rl $SRC/opt/vc/lib $DST/opt/vc -+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include -+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib -+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib -+rsync -rl $SRC/usr/include $DST/usr -+ -+pi-util/rebase_liblinks.py $DST -+ -+ -diff --git b/pi-util/v3dusage.py a/pi-util/v3dusage.py -new file mode 100644 -index 0000000..7e336a9 ---- /dev/null -+++ a/pi-util/v3dusage.py -@@ -0,0 +1,75 @@ ++++ b/pi-util/v3dusage.py +@@ -0,0 +1,128 @@ +#!/usr/bin/env python + +import sys +import argparse +import re + -+def main(): -+ argp = argparse.ArgumentParser(description="QPU/VPU perf summary") -+ argp.add_argument("logfile") -+ args = argp.parse_args() -+ ++def do_logparse(logname): + + rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ') ++ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$') ++ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$') ++ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$') + + ttotal = {'idle':0.0} + tstart = {} ++ qctotal = {} ++ qtstotal = {} ++ l2hits = {} ++ l2total = {} + time0 = None + idle_start = None + qpu_op_no = 0 + op_count = 0 + -+ with open(args.logfile, "rt") as infile: ++ with open(logname, "rt") as infile: + for line in infile: + match = rmatch.match(line) + if match: @@ -18387,6 +20055,31 @@ index 0000000..7e336a9 + ttotal['idle'] += time - idle_start + idle_start = None + ++ match = rqcycle.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in qctotal: ++ qctotal[unit] = 0 ++ qctotal[unit] += int(match.group(2)) ++ ++ match = rqtscycle.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in qtstotal: ++ qtstotal[unit] = 0 ++ qtstotal[unit] += int(match.group(2)) ++ ++ match = rl2hits.match(line) ++ if match: ++ unit = "qpu1." 
+ str(qpu_op_no)
++                if not unit in l2total:
++                    l2total[unit] = 0
++                    l2hits[unit] = 0
++                l2total[unit] += int(match.group(3))
++                if match.group(2) == "hits":
++                    l2hits[unit] += int(match.group(3))
++
++
+    if not time0:
+        print "No v3d profile records found"
+    else:
+        tlogged = time - time0
+
+        print "Logged time:", tlogged, " Op count:", op_count
+        for unit in sorted(ttotal):
+            print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++        print
++        for unit in sorted(qctotal):
++            if not unit in qtstotal:
++                qtstotal[unit] = 0
++            print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
++            if unit in l2total:
++                print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
++
+
+
+if __name__ == '__main__':
-+    main()
++    argp = argparse.ArgumentParser(
++        formatter_class=argparse.RawDescriptionHelpFormatter,
++        description="QPU/VPU perf summary from VC logging",
++        epilog = """
++Will also summarise TMU stalls if logging requests are set in the qpu
++noflush param in the profiled code.
+
++Example use:
++  vcgencmd set_logging level=0xc0
++
++  sudo vcdbg log msg >& t.log
++  v3dusage.py t.log
++""")
++
++    argp.add_argument("logfile")
++    args = argp.parse_args()
++
++    do_logparse(args.logfile)
++
+
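A quick sanity check for the profile-record regex used by do_logparse() above
(a minimal Python 2 sketch in the tool's own style; the timestamp, opcode name
and cb address below are invented values, not taken from a real vcdbg log):

    import re
    # same pattern as rmatch in do_logparse()
    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
    m = rmatch.match('1234.567: done qpu1 HEVC_XFORM cb:beef0180 ')
    assert m is not None
    print m.group(3), m.group(7), m.group(8) # unit, opcode name, cb address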