From b27b2a95cc368141e9604cf0cbdff755a49cdd5b Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Mon, 23 Oct 2017 10:07:42 +0100 Subject: [PATCH] ffmpeg: update to ffmpeg-9702d0d --- packages/multimedia/ffmpeg/package.mk | 2 +- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 25774 ++++++++++------ 2 files changed, 17108 insertions(+), 8668 deletions(-) diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 044bf59c51..43d136a1b0 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -18,7 +18,7 @@ PKG_NAME="ffmpeg" # Current branch is: release/3.1-xbmc -PKG_VERSION="f58e5b9" +PKG_VERSION="9702d0d" PKG_ARCH="any" PKG_LICENSE="LGPLv2.1+" PKG_SITE="https://ffmpeg.org" diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 96cfa9ae30..5b3fc489a5 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -1,8 +1,16 @@ diff --git a/.gitignore b/.gitignore -index 524fb73..305632b 100644 +index 524fb73c16..bcc983739f 100644 --- a/.gitignore +++ b/.gitignore -@@ -23,6 +23,7 @@ +@@ -1,6 +1,7 @@ + *.a + *.o + *.o.* ++*.bin + *.d + *.def + *.dll +@@ -23,6 +24,7 @@ .\#* /.config /.version @@ -11,7 +19,7 @@ index 524fb73..305632b 100644 /ffplay /ffprobe diff --git a/ffmpeg.c b/ffmpeg.c -index 9ffd833..e2474e5 100644 +index cdded8673f..5eee7dfd40 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -23,6 +23,11 @@ @@ -20,13 +28,21 @@ index 9ffd833..e2474e5 100644 +#ifdef RPI +#define RPI_DISPLAY -+#define RPI_ZERO_COPY ++#define RPI_DISPLAY_ALL 0 +#endif + #include "config.h" #include #include -@@ -66,6 +71,25 @@ +@@ -42,6 +47,7 @@ + #include "libavformat/avformat.h" + #include "libavdevice/avdevice.h" + #include "libswresample/swresample.h" ++#include "libavutil/atomic.h" + #include "libavutil/opt.h" + #include "libavutil/channel_layout.h" + #include "libavutil/parseutils.h" +@@ -66,6 +72,25 @@ # include "libavfilter/buffersrc.h" # include "libavfilter/buffersink.h" @@ -38,21 +54,21 @@ index 9ffd833..e2474e5 100644 +#include +#include +#include ++#include +#include +#include +#include +#include +#pragma GCC diagnostic pop -+#ifdef RPI_ZERO_COPY +#include "libavcodec/rpi_qpu.h" -+#endif ++#include "libavutil/rpi_sand_fns.h" +#include "libavcodec/rpi_zc.h" +#endif + #if HAVE_SYS_RESOURCE_H #include #include -@@ -158,6 +182,182 @@ static int restore_tty; +@@ -158,6 +183,241 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -60,39 +76,36 @@ index 9ffd833..e2474e5 100644 + +#define NUM_BUFFERS 4 + -+static MMAL_COMPONENT_T* rpi_display = NULL; -+static MMAL_POOL_T *rpi_pool = NULL; -+static volatile int rpi_display_count = 0; + -+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h) ++typedef struct rpi_display_env_s ++{ ++ MMAL_COMPONENT_T* display; ++ MMAL_COMPONENT_T* isp; ++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup ++ MMAL_CONNECTION_T * conn; ++ ++ MMAL_POOL_T *rpi_pool; ++ volatile int rpi_display_count; ++ enum AVPixelFormat avfmt; ++} rpi_display_env_t; ++ ++static rpi_display_env_t * rpi_display_env = NULL; ++ ++ ++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port) +{ + MMAL_POOL_T* pool; -+ size_t i; -+ size_t size = (w*h*3)/2; -+#ifdef RPI_ZERO_COPY + 
mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? + pool = mmal_port_pool_create(port, NUM_BUFFERS, 0); + assert(pool); -+#else -+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size); -+ -+ for (i = 0; i < NUM_BUFFERS; ++i) -+ { -+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i]; -+ char * bufPtr = buffer->data; -+ memset(bufPtr, i*30, w*h); -+ memset(bufPtr+w*h, 128, (w*h)/2); -+ } -+#endif + + return pool; +} + +static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { -+#ifdef RPI_ZERO_COPY ++ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata; + av_rpi_zc_unref(buffer->user_data); -+ --rpi_display_count; -+#endif ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1); + mmal_buffer_header_release(buffer); +} + @@ -100,9 +113,12 @@ index 9ffd833..e2474e5 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) ++#define DISPLAY_PORT_DEPTH 4 ++ ++static rpi_display_env_t * ++display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h) +{ -+ MMAL_COMPONENT_T* display; ++ MMAL_STATUS_T err; + MMAL_DISPLAYREGION_T region = + { + .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, @@ -111,51 +127,113 @@ index 9ffd833..e2474e5 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; ++#if RPI_ZC_SAND_8_IN_10_BUF ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt; ++#else ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt; ++#endif + const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); ++ rpi_display_env_t * de; ++ int isp_req = (fmt == AV_PIX_FMT_SAND64_10); + -+ bcm_host_init(); // TODO is this needed? -+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); -+ assert(display); ++ bcm_host_init(); // Needs to be done by someone... + -+ mmal_port_parameter_set(display->input[0], ®ion.hdr); ++ if ((de = av_mallocz(sizeof(*de))) == NULL) { ++ return NULL; ++ } ++ ++ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display); ++ av_assert0(de->display); ++ de->port_in = de->display->input[0]; ++ ++ if (isp_req) ++ { ++ mmal_component_create("vc.ril.isp", &de->isp); ++ de->port_in = de->isp->input[0]; ++ } ++ ++ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); + + { -+ MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; ++ MMAL_PORT_T * const port = de->port_in; ++ MMAL_ES_FORMAT_T* const format = port->format; ++ port->userdata = (struct MMAL_PORT_USERDATA_T *)de; ++ port->buffer_num = DISPLAY_PORT_DEPTH; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : ++ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 : ++ MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; -+ format->es->video.height = geo.height_y; ++ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ? 
++ (h + 15) & ~15 : geo.height_y; // Magic + format->es->video.crop.x = 0; + format->es->video.crop.y = 0; + format->es->video.crop.width = w; + format->es->video.crop.height = h; -+ mmal_port_format_commit(display->input[0]); ++ mmal_port_format_commit(port); + } + -+ mmal_component_enable(display); ++ de->rpi_pool = display_alloc_pool(de->port_in); ++ mmal_port_enable(de->port_in,display_cb_input); + -+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y); ++ if (isp_req) { ++ MMAL_PORT_T * const port_out = de->isp->output[0]; ++ mmal_log_dump_port(de->port_in); ++ mmal_format_copy(port_out->format, de->port_in->format); ++ if (fmt == AV_PIX_FMT_SAND64_10) { ++ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS || ++ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n"); ++ } ++ else ++ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n"); + -+ mmal_port_enable(display->input[0],display_cb_input); -+ mmal_port_enable(display->control,display_cb_control); ++ } ++ port_out->format->encoding = MMAL_ENCODING_I420; ++ mmal_log_dump_port(port_out); ++ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n"); ++ goto fail; ++ } ++ mmal_port_enable(de->isp->control,display_cb_control); ++ mmal_component_enable(de->isp); ++ } ++ ++ mmal_component_enable(de->display); ++ mmal_port_enable(de->display->control,display_cb_control); ++ de->avfmt = fmt; + + printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + -+ return display; ++ return de; ++ ++fail: ++ // **** Free stuff ++ return NULL; +} + -+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr) ++static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) +{ + MMAL_BUFFER_HEADER_T* buf; + -+ if (!display || !rpi_pool) ++ if (de == NULL) + return; + -+ if (rpi_display_count >= 3) { ++ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); + return; + } + -+ buf = mmal_queue_get(rpi_pool->queue); ++ buf = mmal_queue_get(de->rpi_pool->queue); + if (!buf) { + // Running too fast so drop the frame + printf("Q alloc failure\n"); @@ -165,67 +243,64 @@ index 9ffd833..e2474e5 100644 + buf->cmd = 0; + buf->offset = 0; // Offset to valid data + buf->flags = 0; -+#ifdef RPI_ZERO_COPY -+{ -+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); -+ if (fr_buf == NULL) { -+ mmal_buffer_header_release(buf); -+ return; -+ } -+ -+ buf->user_data = fr_buf; -+ buf->data = av_rpi_zc_vc_handle(fr_buf); -+ buf->offset = av_rpi_zc_offset(fr_buf); -+ buf->length = av_rpi_zc_length(fr_buf); -+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+#if 0 + { -+ unsigned int n; -+ for (n = 0; n < fr->width; n += 128) { -+ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ const AVRpiZcRefPtr 
fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1); ++ if (fr_buf == NULL) { ++ mmal_buffer_header_release(buf); ++ return; + } ++ ++ buf->user_data = fr_buf; ++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1); + } -+#endif -+ ++rpi_display_count; -+} -+#else -+{ -+#error YYY -+ int w = fr->width; -+ int h = fr->height; -+ int w2 = (w+31)&~31; -+ int h2 = (h+15)&~15; -+ -+ buf->length = (w2 * h2 * 3)/2; -+ buf->user_data = NULL; -+ -+ //mmal_buffer_header_mem_lock(buf); -+ memcpy(buf->data, fr->data[0], w2 * h); -+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4); -+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4); -+ //mmal_buffer_header_mem_unlock(buf); -+} -+#endif -+ -+ while (rpi_display_count >= 3) { ++#if RPI_DISPLAY_ALL ++ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + usleep(5000); + } ++#endif + -+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS) ++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) + { -+ printf("** send failed: depth=%d\n", rpi_display_count); -+ display_cb_input(NULL, buf); ++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); ++ display_cb_input(de->port_in, buf); + } +} + -+static void display_exit(MMAL_COMPONENT_T* display) ++static void display_exit(rpi_display_env_t ** const pde) +{ ++ rpi_display_env_t * const de = *pde; ++ *pde = NULL; ++ ++ if (de != NULL) { +// sleep(120); -+ if (display) { -+ mmal_component_destroy(display); -+ } -+ if (rpi_pool) { -+ mmal_port_pool_destroy(display->input[0], rpi_pool); ++ ++ if (de->port_in != NULL) { ++ mmal_port_disable(de->port_in); ++ } ++ ++ // The above disable should kick out all buffers - check that ++ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) { ++ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count)); ++ } ++ ++ if (de->conn != NULL) { ++ mmal_connection_destroy(de->conn); ++ } ++ if (de->isp != NULL) { ++ mmal_component_destroy(de->isp); ++ } ++ if (de->display != NULL) { ++ mmal_component_destroy(de->display); ++ } ++ if (de->rpi_pool != NULL) { ++ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); ++ } ++ ++ av_free(de); + } +} + @@ -235,29 +310,29 @@ index 9ffd833..e2474e5 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. 
-@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret) +@@ -540,6 +800,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } + +#ifdef RPI_DISPLAY -+ display_exit(rpi_display); ++ display_exit(&rpi_display_env); +#endif + for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret) +@@ -551,6 +816,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->filters); av_freep(&ist->hwaccel_device); -+#ifdef RPI_ZERO_COPY ++#ifdef RPI_DISPLAY + av_rpi_zc_uninit(ist->dec_ctx); +#endif avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret) +@@ -581,6 +849,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -265,28 +340,28 @@ index 9ffd833..e2474e5 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s, +@@ -944,6 +1213,15 @@ static void do_video_out(AVFormatContext *s, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; +#ifdef RPI_DISPLAY + if (next_picture && ist != NULL) + { -+ if (!rpi_display) -+ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); -+ display_frame(ist->dec_ctx, rpi_display, next_picture); ++ if (rpi_display_env == NULL) ++ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); ++ display_frame(ist->dec_ctx, rpi_display_env, next_picture); + } +#endif + if (filter->inputs[0]->frame_rate.num > 0 && filter->inputs[0]->frame_rate.den > 0) duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base)); -@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2544,6 +2822,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; + -+#ifdef RPI_ZERO_COPY ++#ifdef RPI_DISPLAY + // Overrides the above get_buffer2 + av_rpi_zc_init(ist->dec_ctx); +#endif @@ -295,66 +370,74 @@ index 9ffd833..e2474e5 100644 av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index fd0d1f0..1740768 100644 +index bb28aea1e2..741aa0bdc4 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -5,6 +5,12 @@ NAME = avcodec +@@ -5,6 +5,16 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ ++ rpi_opts.h \ + rpi_qpu.h \ + rpi_shader.h \ -+ rpi_shader_cmd.h \ ++ rpi_shader_cmd.h \ ++ rpi_shader_template.h \ ++ rpi_shader_template_fn.h \ + rpi_mailbox.h \ -+ rpi_hevc_transform.h \ ++ rpi_hevc_transform8.h \ ++ rpi_hevc_transform10.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -43,6 +49,10 @@ OBJS = allcodecs.o \ +@@ -43,6 +53,11 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ + rpi_qpu.o \ + rpi_shader.o \ ++ rpi_shader_template.o \ + rpi_mailbox.o \ + rpi_zc.o \ vorbis_parser.o \ xiph.o \ -@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1079,3 +1094,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + -+QASM := $(SUBDIR)../pi-util/qasm.py ++QASM_PY := ../local/bin/qasm.py ++VASMVIDCORE := ../local/bin/vasmvidcore_std + -+ifneq ("$(wildcard $(QASM))","") ++ifneq ("$(wildcard $(QASM_PY))","") 
+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +endif + -+$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h -diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index 54efaad..02a89c3 100644 ---- a/libavcodec/allcodecs.c -+++ b/libavcodec/allcodecs.c -@@ -667,6 +667,7 @@ void avcodec_register_all(void) - REGISTER_PARSER(H261, h261); - REGISTER_PARSER(H263, h263); - REGISTER_PARSER(H264, h264); -+ REGISTER_PARSER(H264_MVC, h264_mvc); - REGISTER_PARSER(HEVC, hevc); - REGISTER_PARSER(MJPEG, mjpeg); - REGISTER_PARSER(MLP, mlp); ++ifneq ("$(wildcard $(VASMVIDCORE))","") ++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ ++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ ++ ++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin ++ python pi-util/make_array.py $< ++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin ++ python pi-util/make_array.py $< ++ ++endif ++ ++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h ++$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index a4ceca7..cafd25d 100644 +index a4ceca7f46..f8229a80e2 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -131,9 +131,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -363,13 +446,15 @@ index a4ceca7..cafd25d 100644 + arm/hevcdsp_epel_neon.o \ arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_qpel_neon.o ++ arm/hevcdsp_cres_neon.o \ ++ arm/hevcdsp_res16_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h -index fdbf86b..0a3980a 100644 +index fdbf86b45e..0a3980a1ef 100644 --- a/libavcodec/arm/cabac.h +++ b/libavcodec/arm/cabac.h @@ -26,13 +26,34 @@ @@ -552,7 +637,7 @@ index fdbf86b..0a3980a 100644 #endif /* AVCODEC_ARM_CABAC_H */ diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h new file mode 100644 -index 0000000..31d3c59 +index 0000000000..31d3c59205 --- /dev/null +++ b/libavcodec/arm/hevc_cabac.h @@ -0,0 +1,491 @@ @@ -1047,9 +1132,239 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S +new file mode 100644 +index 0000000000..380d3c8d3b +--- /dev/null ++++ b/libavcodec/arm/hevc_idct_fn_neon.S +@@ -0,0 +1,224 @@ ++@ Included multiple times from hevc_idct_neon.S ++@ Macros defined there ++ ++#define DC_SHIFT (15 - BIT_DEPTH) ++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) ++#define TRN_SHIFT (20 - BIT_DEPTH) ++ ++function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ 
add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q0, r1 ++ vdup.16 q1, r1 ++ vst1.16 {q0, q1}, [r0] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ mov r3, #16 ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++1: subs r3, #1 ++ vstm r0!, {q8-q15} ++ bne 1b ++ bx lr ++endfunc ++ ++ ++function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x00240053 // 36 and 83 ++ vmov.32 d0[0], r3 ++ ++ tr4_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x4a // 74 ++ vmov.32 d0[0], r3 ++ ldr r3, =0x1d // 29 ++ vmov.32 d0[1], r3 ++ ldr r3, =0x37 // 55 ++ vmov.32 d1[0], r3 ++ ++ tr4_luma_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1 ++ push {r4-r8} ++ vpush {d8-d15} ++ mov r5, #16 ++ ++ adrl r3, tr4f ++ vld1.16 {d0, d1}, [r3] ++ ++ // left half ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #128 ++ //skip right half if col_limit in r1 is less than 4 ++ cmp r1, #4 ++ blt 1f ++ //right half ++ add r0, #8 ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #136 
++1: ++ // top half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ vstm r0!, {q1-q4} ++ ++ // bottom half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ //vstm r0, {q1-q4} ++ vst1.16 {q1-q2}, [r0] ++ add r0, #32 ++ vst1.16 {q3-q4}, [r0] ++ sub r0, #32 ++ vpop {d8-d15} ++ pop {r4-r8} ++ bx lr ++endfunc ++ ++#undef DC_SHIFT ++#undef DC_ADD ++#undef TRN_SHIFT ++ diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S new file mode 100644 -index 0000000..373576b +index 0000000000..373576b4cb --- /dev/null +++ b/libavcodec/arm/hevc_misc_neon.S @@ -0,0 +1,62 @@ @@ -1115,8 +1430,310 @@ index 0000000..373576b + +endfunc + +diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S +new file mode 100644 +index 0000000000..bafefd4318 +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_cres_neon.S +@@ -0,0 +1,296 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ General notes: ++@ ++@ Residual is only guaranteed to be cliped to 16 bits ++@ This means that we do need to do movul, qadd, qmovun ++@ rather than addw, qmovun (if we were clipped to 15 then we could get away ++@ with this) ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_4x4_u_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q0, q1}, [r1] ++ vdup.16 q2, r3 ++ vdup.16 q3, r3 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_8x8_u_neon_8, export=1 ++ mov r12, #4 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! 
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ sub r0, r2 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q2 ++ vqmovun.s16 d18, q1 ++ vqmovun.s16 d19, q3 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_16x16_u_neon_8, export=1 ++ mov r12, #16 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q1 ++ vqmovun.s16 d18, q2 ++ vqmovun.s16 d19, q3 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_v_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q2, q3}, [r1] ++ vdup.16 q0, r3 ++ vdup.16 q1, r3 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_v_neon_8, export=1 ++ mov r12, #4 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d17 ++ sub r0, r2 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d18 ++ vqmovun.s16 d17, q0 ++ vqmovun.s16 d16, q2 ++ vqmovun.s16 d19, q1 ++ vqmovun.s16 d18, q3 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_v_neon_8, export=1 ++ mov r12, #16 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! 
++ subs r12, #1 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q11, d19 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q2 ++ vqmovun.s16 d17, q3 ++ vqmovun.s16 d18, q0 ++ vqmovun.s16 d19, q1 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_c_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_c_neon_8, export=1 ++ mov r12, #8 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.8 {d16, d17}, [r0, :128] ++ vld1.16 {q0}, [r1, :128]! ++ vld1.16 {q1}, [r3, :128]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst2.8 {d0, d1}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_c_neon_8, export=1 ++ mov r12, #16 ++ add r3, r1, #(16*16*2) @ Offset to V ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ vld1.16 {q2, q3}, [r3, :256]! 
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst2.8 {q0, q1}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ 32x32 chroma never occurs so NIF ++ ++@ ============================================================================ diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S -index 166bddb..9bd0a42 100644 +index 166bddb104..15c4329cdb 100644 --- a/libavcodec/arm/hevcdsp_deblock_neon.S +++ b/libavcodec/arm/hevcdsp_deblock_neon.S @@ -15,7 +15,7 @@ @@ -1128,66 +1745,204 @@ index 166bddb..9bd0a42 100644 */ -@@ -31,6 +31,9 @@ +@@ -24,70 +24,238 @@ + + .macro hevc_loop_filter_chroma_start + ldr r12, [r2] +- ldr r3, [r2, #4] +- add r2, r3, r12 +- cmp r2, #0 ++ ldr r2, [r2, #4] ++ orrs r2, r12, r2, lsl #16 + it eq bxeq lr .endm +-.macro hevc_loop_filter_chroma_body +- vsubl.u8 q3, d4, d2 +- vsubl.u8 q11, d18, d19 +- vshl.i16 q3, #2 +- vadd.i16 q11, q3 +- vdup.16 d0, r12 +- vdup.16 d1, r3 +- vrshr.s16 q11, q11, #3 +- vneg.s16 q12, q0 +@ Uses: d2, d4, d18, d19 +@ Returns: d2, d4 -+@ Modifies: d0-d7, d22-d25 - .macro hevc_loop_filter_chroma_body - vsubl.u8 q3, d4, d2 - vsubl.u8 q11, d18, d19 -@@ -49,6 +52,33 @@ - vqmovun.s16 d4, q2 - .endm - ++@ Modifies: d0-d7, d22-d25, r12 + -+@ Uses r2[0:7], r2[8:15] -+@ Modifies: d0-d7, d22-d25 -+.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 -+ vsubl.u8 q3, \Q0, \P0 -+ vsubl.u8 q11, \P1, \Q1 -+ vshl.i16 q3, #2 -+ vadd.i16 q11, q3 ++.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1 ++ vsubl.u8 q0, \Q0, \P0 ++ vsubl.u8 q1, \P1, \Q1 ++ vdup.16 d4, r2 ++ lsr r2, r2, #16 ++ vshl.i16 q0, #2 ++ ldr r12, [sp, #0] @ r12 = &no_q ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1] ++ vdup.16 d5, r2 + -+ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) -+ vdup.16 d0, r2 -+ vmovl.u8 q0, d0 -+ vuzp.16 d0, d1 -+ -+ vrshr.s16 q11, q11, #3 -+ vneg.s16 q12, q0 ++ vrshr.s16 q0, q0, #3 ++ ldrh r12, [r12] ++ vneg.s16 q3, q2 ++ vmin.s16 q0, q0, q2 + vmovl.u8 q2, \Q0 -+ vmin.s16 q11, q11, q0 -+ vmax.s16 q11, q11, q12 -+ vaddw.u8 q1, q11, \P0 -+ vsub.i16 q2, q11 ++ vmax.s16 q0, q0, q3 ++ vaddw.u8 q1, q0, \P0 ++ vsub.i16 q2, q0 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + vqmovun.s16 \P0, q1 + vqmovun.s16 \Q0, q2 +.endm + ++@ Uses r2 (tc a;b) ++@ Modifies: q0-q3 ++@ On exit ++@ r12 (and flags) contain no_p;no_q ++.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth ++ vsub.i16 q0, \Q0, \P0 ++ lsl r12, r2, #(\bit_depth - 8) ++ vsub.i16 q1, \P1, \Q1 ++ vshl.i16 q0, #2 ++ vdup.16 d4, r12 ++ lsr r12, r12, #16 ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] ++ vdup.16 d5, r12 ++ ++ vrshr.s16 q0, q0, #3 ++ vneg.s16 q3, q2 ++ movw r12, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q0, q2 ++ vmax.s16 q0, q0, q3 ++ vdup.i16 q3, r12 ++ ldr r12, [sp, #0] ++ ++ vadd.i16 \P0, q0, \P0 ++ vsub.i16 \Q0, q0 ++ ++ vmov.i64 q2, #0 ++ ldrh r12, [r12] ++ vmin.s16 \P0, q3 ++ vmin.s16 \Q0, q3 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] ++ vmax.s16 \P0, q2 ++ vmax.s16 \Q0, q2 ++.endm ++ ++ ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v ++ vsubl.u8 q0, \Q0u, \P0u ++ vsubl.u8 q1, \Q0v, \P0v ++ vsubl.u8 q2, \P1u, \Q1u ++ vsubl.u8 q3, \P1v, \Q1v ++ vshl.i16 q0, 
#2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 + vmovl.u8 q2, d4 +- vmin.s16 q11, q11, q0 +- vmax.s16 q11, q11, q12 +- vaddw.u8 q1, q11, d2 +- vsub.i16 q2, q11 +- vqmovun.s16 d2, q1 +- vqmovun.s16 d4, q2 ++ vmovl.u8 q3, d6 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vaddw.u8 q2, q0, \P0u ++ vmax.s16 q1, q3 ++ vaddw.u8 q3, q1, \P0v ++ ++ vqmovun.s16 \P0u, q2 ++ vmovl.u8 q2, \Q0u ++ vqmovun.s16 \P0v, q3 ++ vmovl.u8 q3, \Q0v ++ vsub.i16 q2, q0 ++ vsub.i16 q3, q1 ++ ++ vqmovun.s16 \Q0u, q2 ++ vqmovun.s16 \Q0v, q3 + .endm + ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth ++ vsub.i16 q0, \Q0u, \P0u ++ vsub.i16 q1, \Q0v, \P0v ++ vsub.i16 q2, \P1u, \Q1u ++ vsub.i16 q3, \P1v, \Q1v ++ vshl.i16 q0, #2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 ++ vshll.u8 q2, d4, #\bit_depth - 8 ++ vshll.u8 q3, d6, #\bit_depth - 8 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ movw r2, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vmov.i64 q2, #0 ++ vmax.s16 q1, q3 ++ vdup.i16 q3, r2 ++ vadd.i16 \P0u, q0 ++ vsub.i16 \Q0u, q0 ++ vadd.i16 \P0v, q1 ++ vsub.i16 \Q0v, q1 ++ ++ vmax.s16 \P0u, q2 ++ vmax.s16 \Q0u, q2 ++ vmax.s16 \P0v, q2 ++ vmax.s16 \Q0v, q2 ++ vmin.s16 \P0u, q3 ++ vmin.s16 \Q0u, q3 ++ vmin.s16 \P0v, q3 ++ vmin.s16 \Q0v, q3 ++.endm ++ + + .macro hevc_loop_filter_luma_start ldr r12, [r3] ldr r3, [r3, #4] -@@ -60,15 +90,17 @@ - lsr r3, #16 +- lsl r3, #16 +- orr r3, r12 +- cmp r3, #0 ++ orrs r3, r12, r3, lsl #16 + it eq + bxeq lr +- lsr r3, #16 .endm -.macro hevc_loop_filter_luma_body -+@ Uses: r2, r3, r12 -+@ Modifies: r5, r6, r7, r8, r9 -+function hevc_loop_filter_luma_body -+ vmovl.u8 q15, d23 -+ vmovl.u8 q14, d22 -+ vmovl.u8 q13, d21 -+ vmovl.u8 q12, d20 -+ vmovl.u8 q11, d19 -+ vmovl.u8 q10, d18 -+ vmovl.u8 q9, d17 - vmovl.u8 q8, d16 +- vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 @@ -1195,46 +1950,103 @@ index 166bddb..9bd0a42 100644 - vmovl.u8 q13, d26 - vmovl.u8 q14, d28 - vmovl.u8 q15, d30 ++@ Uses: r2, r3, r12 ++@ Modifies: r5, r6, r7, r8, r9 ++ ++@ Input: ++@ r2 beta (raw: needs shift for bitdepth > 8) ++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) ++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) ++@ [sp,#96] &no_p[0] ++@ [sp,#100] &no_q[0] ++@ ++@ Input & output ++@ 8-bit: d16-d23 ++@ 16-bit: q8-q15 ++@ ++@ Output ++@ Z r10==0 ++@ r10[ 0:7 ] no_p[0] ++@ r10[ 8:15] no_p[1] ++@ r10[16:23] no_q[0] ++@ r10[24:31] no_q[1] ++ ++.macro m_filter_luma bit_depth ++.if \bit_depth == 8 ++ vmovl.u8 q15, d23 ++ vmovl.u8 q14, d22 ++ vmovl.u8 q13, d21 ++ vmovl.u8 q12, d20 ++ vmovl.u8 q11, d19 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q9, d17 ++ vmovl.u8 q8, d16 ++.endif vadd.i16 q7, q9, q11 ++.if \bit_depth > 8 ++ lsl r2, r2, #(\bit_depth - 8) ++.endif vadd.i16 q6, q14, q12 -@@ -77,7 +109,6 @@ ++.if \bit_depth > 8 ++ lsl r3, r3, #(\bit_depth - 8) ++.endif + vsub.i16 q7, q10 ++ ldr r5, [sp, #96] @ Bolt no_x values together into r10 + vsub.i16 q6, q13 vabd.s16 q7, q7, q10 vabd.s16 q6, q6, q13 - - ++ ldrh r10, [r5] + vdup.16 q0, r2 
vmov q4, q7 vmov q5, q6 -@@ -152,7 +183,7 @@ +- vdup.16 d4, r12 ++ ldr r5, [sp, #100] ++ vdup.16 d4, r3 ++ lsr r3, r3, #16 + vtrn.16 q7, q4 ++ ldrh r5, [r5] + vtrn.16 q6, q5 + + vshl.u64 q7, #32 + vshr.u64 q4, #32 + vshl.u64 q6, #32 ++ orr r10, r10, r5, lsl #16 + vshr.u64 q5, #32 + vshr.u64 q7, #32 + vshr.u64 q6, #32 +@@ -152,7 +320,7 @@ and r9, r8, r7 cmp r9, #0 - beq weakfilter_\@ -+ beq weakfilter_ ++ beq 1f vadd.i16 q2, q11, q12 vadd.i16 q4, q9, q8 -@@ -210,11 +241,11 @@ +@@ -210,11 +378,11 @@ vbit q13, q3, q5 vbit q14, q2, q5 -weakfilter_\@: -+weakfilter_: ++1: mvn r8, r8 and r9, r8, r7 cmp r9, #0 - beq ready_\@ -+ beq ready_ ++ beq 2f vdup.16 q4, r2 -@@ -275,75 +306,345 @@ weakfilter_\@: +@@ -275,111 +443,1041 @@ weakfilter_\@: vbit q11, q0, q5 vbit q12, q4, q5 -ready_\@: -+ready_: ++2: ++.if \bit_depth == 8 vqmovun.s16 d16, q8 - vqmovun.s16 d18, q9 - vqmovun.s16 d20, q10 @@ -1243,7 +2055,7 @@ index 166bddb..9bd0a42 100644 - vqmovun.s16 d26, q13 - vqmovun.s16 d28, q14 - vqmovun.s16 d30, q15 --.endm ++ cmp r10, #0 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 @@ -1251,7 +2063,30 @@ index 166bddb..9bd0a42 100644 + vqmovun.s16 d21, q13 + vqmovun.s16 d22, q14 + vqmovun.s16 d23, q15 ++.else ++ movw r12, #(1 << \bit_depth - 1) ++ vmov.i64 q0, #0 ++ vdup.i16 q1, r12 ++ @ q8 & q15 should be unaltered and so don't require clipping ++ vmax.s16 q9, q0 ++ cmp r10, #0 ++ vmax.s16 q10, q0 ++ vmax.s16 q11, q0 ++ vmax.s16 q12, q0 ++ vmax.s16 q13, q0 ++ vmax.s16 q14, q0 ++ vmin.s16 q9, q1 ++ vmin.s16 q10, q1 ++ vmin.s16 q11, q1 ++ vmin.s16 q12, q1 ++ vmin.s16 q13, q1 ++ vmin.s16 q14, q1 ++.endif + mov pc, lr + .endm + ++function hevc_loop_filter_luma_body ++ m_filter_luma 8 +endfunc + +@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) @@ -1263,7 +2098,16 @@ index 166bddb..9bd0a42 100644 + b v_loop_luma_common +endfunc + - ++ ++@ void ff_hevc_v_loop_filter_luma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int *_tc, [r3] ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++ function ff_hevc_v_loop_filter_luma_neon, export=1 hevc_loop_filter_luma_start - push {r5-r11} @@ -1271,14 +2115,6 @@ index 166bddb..9bd0a42 100644 + + sub r4, r0, #4 +v_loop_luma_common: -+ @ Why this isn't a bitmask to start with I have no idea... 
-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] -+ vpush {d8-d15} - sub r0, #4 - vld1.8 {d16}, [r0], r1 @@ -1335,44 +2171,38 @@ index 166bddb..9bd0a42 100644 + + @ no_p[1] + tst r10, #0xff00 -+ itt ne -+ addne r4, r4, r1, lsl #2 ++ add r2, r4, r1, lsl #2 + bne 1f + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 + vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 -+ ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32] ++1: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32] +1: + @ no_q[1] + tst r10, #0xff000000 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f ++ add r2, r0, r1, lsl #2 ++ bne 1f + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 + vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 -+ -+2: -+ @ no_p[0] -+ tst r10, #0xff -+ bne 3f -+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 -+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] -+ -+3: ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32] ++1: + @ no_q[0] + tst r10, #0xff0000 -+ bne 4f -+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 -+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] -+ -+4: ++ bne 1f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] ++1: +bypasswrite: vpop {d8-d15} - pop {r5-r11} @@ -1380,6 +2210,81 @@ index 166bddb..9bd0a42 100644 + pop {r4-r10,pc} endfunc ++.macro m_filter_v_luma_common_16 bit_depth ++ vpush {d8-d15} ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. 
This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ neg r1, r1 ++ ++ @ p[1] ++ tst r10, #0xff00 ++ add r2, r4, r1, lsl #2 ++ bne 1f ++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4] ++1: ++ @ p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1 ++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1 ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2] ++1: ++ @ q[1] ++ tst r10, #0xff000000 ++ add r2, r0, r1, lsl #2 ++ bne 1f ++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0] ++1: ++ @ q[0] ++ tst r10, #0xff0000 ++ bne 1f ++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 ++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1 ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] ++1: ++ vpop {d8-d15} ++ pop {r4-r10,pc} ++.endm ++ ++ ++ ++ +@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] +@ ptrdiff_t stride, [r1] +@ int beta, [r2] @@ -1429,13 +2334,6 @@ index 166bddb..9bd0a42 100644 + neg r1, r1 + add r0, r0, r1 + -+ @ Why this isn't a bitmask to start with I have no idea... 
-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + bne 1f + + vst1.8 {d22}, [r0], r1 @@ -1486,8 +2384,81 @@ index 166bddb..9bd0a42 100644 + + pop {r4-r10,pc} + - endfunc - ++endfunc ++ ++ ++.macro m_filter_h_luma_16 bit_depth ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ vpush {d8-d15} ++ sub r0, r0, r1, lsl #2 ++ ++ vld1.16 { q8}, [r0], r1 ++ vld1.16 { q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0], r1 ++ vld1.16 {q12}, [r0], r1 ++ vld1.16 {q13}, [r0], r1 ++ vld1.16 {q14}, [r0], r1 ++ vld1.16 {q15}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ vpop {d8-d15} ++ ++ sub r0, r1 ++ neg r1, r1 ++ bne 1f ++ ++ vst1.16 {q14}, [r0], r1 ++ vst1.16 {q13}, [r0], r1 ++ vst1.16 {q12}, [r0], r1 ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r0], r1 ++ vst1.16 { q9}, [r0] ++ pop {r4-r10,pc} ++ ++@ Partial write ++1: ++ tst r10, #0xff0000 ++ mov r2, r0 ++ bne 1f ++ vst1.16 {d28}, [r2], r1 ++ vst1.16 {d26}, [r2], r1 ++ vst1.16 {d24}, [r2] ++ ++1: ++ tst r10, #0xff000000 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d29}, [r2], r1 ++ vst1.16 {d27}, [r2], r1 ++ vst1.16 {d25}, [r2] ++ ++1: ++ tst r10, #0xff ++ @ r0 = r0 + r1 * 3 ++ add r0, r0, r1 ++ add r0, r0, r1, lsl # 1 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d22}, [r0], r1 ++ vst1.16 {d20}, [r0], r1 ++ vst1.16 {d18}, [r0] ++ ++1: ++ tst r10, #0xff00 ++ bne 1f ++ vst1.16 {d23}, [r2], r1 ++ vst1.16 {d21}, [r2], r1 ++ vst1.16 {d19}, [r2] ++ ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ +@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 @@ -1501,9 +2472,7 @@ index 166bddb..9bd0a42 100644 + vld2.8 {d26,d27}, [r0], r1 + vld2.8 {d28,d29}, [r0] + sub r0, r0, r1, lsl #1 -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29 + cmp r3, #0 + bne 1f + vst2.8 {d18,d19}, [r0], r1 @@ -1513,122 +2482,509 @@ index 166bddb..9bd0a42 100644 + @ At least one no_f bit is set + @ Which means we need to break this apart in an ugly fashion +1: vzip.8 d18, d19 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C + vzip.8 d26, d27 + sub r1, r1, #8 + -+ tst r3, #1 -+ bne 1f ++ bmi 1f + vst1.8 {d18}, [r0] +1: add r0, r0, #8 -+ tst r3, #2 -+ bne 2f ++ bcs 2f + vst1.8 {d19}, [r0] -+2: add r0, r0, r1 ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 + -+ tst r3, #4 -+ bne 1f ++ bmi 1f + vst1.8 {d26}, [r0] -+1: add r0, r0, #8 -+ tst r3, #8 -+ it ne -+ bxne lr ++1: it cs ++ bxcs lr ++ add r0, r0, #8 + vst1.8 {d27}, [r0] + bx lr + +endfunc + + ++@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ ++@ Macro here actual function near bottom ++ ++.macro m_filter_h_uv_16 bit_depth ++ sub r0, r0, r1, lsl #1 ++ vld2.16 {q8, q9 }, [r0], r1 ++ vld2.16 {q10, q11}, [r0], r1 ++ vld2.16 {q12, q13}, [r0], r1 ++ vld2.16 {q14, q15}, [r0] ++ sub r0, r0, r1, lsl #1 ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ ++ cmp r3, #0 ++ bne 1f ++ vst2.16 {q10, q11}, [r0], r1 ++ vst2.16 {q12, q13}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.16 
q10, q11 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C ++ vzip.16 q12, q13 ++ sub r1, r1, #16 ++ ++ bmi 1f ++ vst1.16 {q10}, [r0] ++1: add r0, r0, #16 ++ bcs 2f ++ vst1.16 {q11}, [r0] ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 ++ ++ bmi 1f ++ vst1.16 {q12}, [r0] ++1: it cs ++ bxcs lr ++ add r0, r0, #16 ++ vst1.16 {q13}, [r0] ++ bx lr ++.endm ++ ++ +@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 +@ unsigned int no_f); // sp[0] +@ -+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++ +function ff_hevc_v_loop_filter_uv2_neon_8, export=1 + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 -+ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1 ++ sub r12, r0, r3 + + vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1 ++ cmp r12, #4 + + vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1 + + vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1 + + vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 -+ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1 + + vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1 + + vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1 + + vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] -+ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] -+ -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] + ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 ++ cmp r12, #0 ++ add r3, #2 + neg r1, r1 -+ -+ ldr r2, [sp, #0] -+ -+ @ p[1] -+ tst r2, #2 -+ itt ne -+ addne r3, r3, r1, lsl #2 + bne 1f -+ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 -+ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 + ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1 ++ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1 ++ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1 ++ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1 ++ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1 ++ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1 ++ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1 ++ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3] ++ bx lr ++ ++@ Either split or partial +1: -+ @ q[1] -+ tst r2, #8 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f -+ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 -+ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 -+ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 -+ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.8 {d20[7], d21[7]}, [r0], r1 
++ vst2.8 {d20[6], d21[6]}, [r0], r1 ++ vst2.8 {d20[5], d21[5]}, [r0], r1 ++ vst2.8 {d20[4], d21[4]}, [r0] ++1: ++ bmi 2f ++ vst2.8 {d20[3], d21[3]}, [r2], r1 ++ vst2.8 {d20[2], d21[2]}, [r2], r1 ++ vst2.8 {d20[1], d21[1]}, [r2], r1 ++ vst2.8 {d20[0], d21[0]}, [r2] + +2: -+ @ p[0] -+ tst r2, #1 -+ bne 3f -+ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] -+ ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ bcs 3f ++ vst2.8 {d18[7], d19[7]}, [r3], r1 ++ vst2.8 {d18[6], d19[6]}, [r3], r1 ++ vst2.8 {d18[5], d19[5]}, [r3], r1 ++ vst2.8 {d18[4], d19[4]}, [r3] +3: -+ @ q[0] -+ tst r2, #4 -+ it ne -+ bxne lr -+ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 -+ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 -+ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 -+ vst4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0] -+ ++ it mi ++ bxmi lr ++ vst2.8 {d18[3], d19[3]}, [r2], r1 ++ vst2.8 {d18[2], d19[2]}, [r2], r1 ++ vst2.8 {d18[1], d19[1]}, [r2], r1 ++ vst2.8 {d18[0], d19[0]}, [r2] + bx lr -+endfunc + endfunc + ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++.macro m_filter_v_uv2_16 bit_depth ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ sub r12, r0, r3 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ cmp r12, #8 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ cmp r12, #0 ++ add r3, #4 ++ neg r1, r1 ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1 ++ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1 ++ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1 ++ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1 ++ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1 ++ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1 ++ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1 ++ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1 ++ bx lr ++ ++@ Either split or partial ++1: ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.16 {d25[3], d27[3]}, [r0], r1 ++ vst2.16 {d25[2], d27[2]}, [r0], r1 ++ vst2.16 {d25[1], d27[1]}, [r0], r1 ++ vst2.16 {d25[0], d27[0]}, [r0] ++1: ++ bmi 2f ++ vst2.16 {d24[3], d26[3]}, [r2], r1 ++ vst2.16 {d24[2], d26[2]}, [r2], r1 ++ vst2.16 {d24[1], d26[1]}, [r2], r1 ++ vst2.16 {d24[0], d26[0]}, [r2] ++ ++2: ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ 
bcs 3f ++ vst2.16 {d21[3], d23[3]}, [r3], r1 ++ vst2.16 {d21[2], d23[2]}, [r3], r1 ++ vst2.16 {d21[1], d23[1]}, [r3], r1 ++ vst2.16 {d21[0], d23[0]}, [r3] ++3: ++ it mi ++ bxmi lr ++ vst2.16 {d20[3], d22[3]}, [r2], r1 ++ vst2.16 {d20[2], d22[2]}, [r2], r1 ++ vst2.16 {d20[1], d22[1]}, [r2], r1 ++ vst2.16 {d20[0], d22[0]}, [r2] ++ bx lr ++.endm ++ + + function ff_hevc_v_loop_filter_chroma_neon, export=1 hevc_loop_filter_chroma_start ++ ++ sub r0, #2 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1 ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0], r1 ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1 ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1 ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1 ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1 ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1 ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f ++ ++ vst2.8 {d17[0], d18[0]}, [r0], r1 ++ vst2.8 {d17[1], d18[1]}, [r0], r1 ++ vst2.8 {d17[2], d18[2]}, [r0], r1 ++ vst2.8 {d17[3], d18[3]}, [r0], r1 ++ vst2.8 {d17[4], d18[4]}, [r0], r1 ++ vst2.8 {d17[5], d18[5]}, [r0], r1 ++ vst2.8 {d17[6], d18[6]}, [r0], r1 ++ vst2.8 {d17[7], d18[7]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.8 {d17[0]}, [r0], r1 ++ vst1.8 {d17[1]}, [r0], r1 ++ vst1.8 {d17[2]}, [r0], r1 ++ vst1.8 {d17[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.8 {d18[0]}, [r0], r1 ++ vst1.8 {d18[1]}, [r0], r1 ++ vst1.8 {d18[2]}, [r0], r1 ++ vst1.8 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.8 {d18[4]}, [r0], r1 ++ vst1.8 {d18[5]}, [r0], r1 ++ vst1.8 {d18[6]}, [r0], r1 ++ vst1.8 {d18[7]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.8 {d17[4]}, [r0], r1 ++ vst1.8 {d17[5]}, [r0], r1 ++ vst1.8 {d17[6]}, [r0], r1 ++ vst1.8 {d17[7]}, [r0], r1 ++ bx lr ++ ++endfunc ++ ++ ++.macro m_filter_v_chroma_16 bit_depth ++ hevc_loop_filter_chroma_start ++ sub r0, #4 -@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 - vst1.8 {d4}, [r0] ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1 ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1 ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1 ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1 ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1 ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1 ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1 ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #2 ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f ++ ++ vst2.16 {d18[0], d20[0]}, [r0], r1 ++ vst2.16 {d18[1], d20[1]}, [r0], r1 ++ vst2.16 {d18[2], d20[2]}, [r0], r1 ++ vst2.16 {d18[3], d20[3]}, [r0], r1 ++ vst2.16 {d19[0], d21[0]}, [r0], r1 ++ vst2.16 {d19[1], d21[1]}, [r0], r1 ++ vst2.16 {d19[2], d21[2]}, [r0], r1 ++ vst2.16 {d19[3], d21[3]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.16 {d18[0]}, [r0], r1 ++ vst1.16 {d18[1]}, [r0], r1 ++ vst1.16 {d18[2]}, [r0], r1 ++ vst1.16 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.16 {d20[0]}, [r0], r1 ++ vst1.16 {d20[1]}, [r0], r1 ++ vst1.16 {d20[2]}, [r0], r1 ++ vst1.16 {d20[3]}, [r0], r1 ++ sub r0, r0, r1, 
lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.16 {d21[0]}, [r0], r1 ++ vst1.16 {d21[1]}, [r0], r1 ++ vst1.16 {d21[2]}, [r0], r1 ++ vst1.16 {d21[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.16 {d19[0]}, [r0], r1 ++ vst1.16 {d19[1]}, [r0], r1 ++ vst1.16 {d19[2]}, [r0], r1 ++ vst1.16 {d19[3]}, [r0], r1 ++ bx lr ++.endm ++ ++ ++@ void ff_hevc_h_loop_filter_chroma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int *_tc, [r2] ++@ uint8_t *_no_p, [r3] ++@ uint8_t *_no_q); [sp+0] ++ ++function ff_hevc_h_loop_filter_chroma_neon, export=1 ++ hevc_loop_filter_chroma_start ++ sub r0, r0, r1, lsl #1 + vld1.8 {d16}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d21}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- hevc_loop_filter_chroma_body +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- vst1.8 {d16}, [r0], r1 ++ vld1.8 {d19}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f @ Partial write + vst1.8 {d17}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0], r1 +- vst1.8 {d19}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d21}, [r0] ++ vst1.8 {d18}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ vmov r2, r3, d17 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff00 ++ it eq ++ streq r3, [r0, #4] ++ ++ add r0, r1 ++ tst r12, #0xff0000 ++ vmov r2, r3, d18 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff000000 ++ it eq ++ streq r3, [r0, #4] ++ bx lr endfunc + +-function ff_hevc_h_loop_filter_chroma_neon, export=1 ++.macro m_filter_h_chroma_16 bit_depth + hevc_loop_filter_chroma_start + sub r0, r0, r1, lsl #1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0] ++ vld1.16 {q8}, [r0], r1 ++ vld1.16 {q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0] + sub r0, r0, r1, lsl #1 +- hevc_loop_filter_chroma_body +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0] ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f @ Partial write ++ vst1.16 {q9}, [r0], r1 ++ vst1.16 {q10}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ bne 2f ++ vst1.16 {d18}, [r0] ++2: ++ tst r12, #0xff00 ++ bne 3f ++ add r0, #8 ++ vst1.16 {d19}, [r0] ++ sub r0, #8 ++3: ++ tst r12, #0xff0000 ++ add r0, r1 ++ bne 4f ++ vst1.16 {d20}, [r0] ++4: ++ tst r12, #0xff000000 ++ it ne ++ bxne lr ++ add r0, #8 ++ vst1.16 {d21}, [r0] ++ + bx lr ++.endm ++ + +/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i + * int *curr_rpl0, int *curr_ @@ -1754,9 +3110,54 @@ index 166bddb..9bd0a42 100644 + b 11b +endfunc + ++@ ============================================================================= ++@ ++@ 10 bit ++ ++function hevc_loop_filter_luma_body_10 ++ m_filter_luma 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_luma_neon_10, export=1 ++ m_filter_h_luma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma2_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} @ 8 regs = 32 bytes ++ ++ ldr r4, [sp, #40] ++ b v_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ sub r4, r0, #8 ++v_loop_luma_common_10: ++ m_filter_v_luma_common_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_uv_neon_10, export=1 ++ 
m_filter_h_uv_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_uv2_neon_10, export=1 ++ m_filter_v_uv2_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_chroma_neon_10, export=1 ++ m_filter_h_chroma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_chroma_neon_10, export=1 ++ m_filter_v_chroma_16 10 + endfunc ++ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 -index 0000000..00eab9e +index 0000000000..00eab9eeee --- /dev/null +++ b/libavcodec/arm/hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ @@ -2097,11 +3498,399 @@ index 0000000..00eab9e + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 +diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S +index 13d540e5ff..9b6d745556 100644 +--- a/libavcodec/arm/hevcdsp_idct_neon.S ++++ b/libavcodec/arm/hevcdsp_idct_neon.S +@@ -21,82 +21,6 @@ + #include "libavutil/arm/asm.S" + #include "neon.S" + +-function ff_hevc_idct_4x4_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q0, r1 +- vdup.16 q1, r1 +- vst1.16 {q0, q1}, [r0] +- bx lr +-endfunc +- +-function ff_hevc_idct_8x8_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_16x16_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_32x32_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- mov r3, #16 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +-1: subs r3, #1 +- vstm r0!, {q8-q15} +- bne 1b +- bx lr +-endfunc +- + function ff_hevc_transform_add_4x4_neon_8, export=1 + vldm r1, {q0-q1} + vld1.32 d4[0], [r0], r2 +@@ -168,6 +92,131 @@ function ff_hevc_transform_add_32x32_neon_8, export=1 + bx lr + endfunc + ++ ++@ ff_hevc_add_residual_4x4_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ ++ vld1.32 d4[0], [r0], r1 ++ vld1.32 d4[1], [r0], r1 ++ vld1.32 d5[0], [r0], r1 ++ vld1.32 d5[1], [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ vaddw.u8 q0, q15, d4 ++ vaddw.u8 q1, q15, d5 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r1 ++ vst1.32 d0[1], [r0], r1 ++ vst1.32 d1[0], [r0], r1 ++ vst1.32 d1[1], [r0], r1 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_4x4_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #4 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_8x8_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #8 ++ ++1: subs r3, #1 ++ vld1.8 d16, [r0] ++ vaddw.u8 q0, q15, d16 ++ 
vqmovun.s16 d0, q0 ++ vst1.32 d0, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_8x8_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #8 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_16x16_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #16 ++ ++1: subs r3, #1 ++ vld1.8 {q8}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.8 {q0}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_16x16_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #16 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_32x32_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_32x32_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #32 ++ ++1: subs r3, #1 ++ vld1.8 {q8, q9}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst1.8 {q0, q1}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++ + .macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.64 \r0, \r4 + vtrn.64 \r1, \r5 +@@ -263,55 +312,6 @@ endfunc + vqrshrn.s32 \r3, q3, \shift + .endm + +-function ff_hevc_transform_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x00240053 // 36 and 83 +- vmov.32 d0[0], r3 +- +- tr4_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- +-function ff_hevc_transform_luma_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x4a // 74 +- vmov.32 d0[0], r3 +- ldr r3, =0x1d // 29 +- vmov.32 d0[1], r3 +- ldr r3, =0x37 // 55 +- vmov.32 d1[0], r3 +- +- tr4_luma_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_luma_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- + .macro tr8_begin in0, in1, in2, in3 + vmull.s16 q7, \in0, d1[1] // 89 * src1 + vmull.s16 q8, \in0, d1[0] // 75 * src1 +@@ -356,100 +356,6 @@ endfunc + vqrshrn.s32 d8, q5, \shift + .endm + +-function ff_hevc_transform_8x8_neon_8, export=1 +- push {r4-r8} +- vpush {d8-d15} +- mov r5, #16 +- +- adr r3, tr4f +- vld1.16 {d0, d1}, [r3] +- +- // left half +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #128 +- //skip right half if col_limit in r1 is less than 4 +- 
cmp r1, #4 +- blt 1f +- //right half +- add r0, #8 +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #136 +-1: +- // top half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- vstm r0!, {q1-q4} +- +- // bottom half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- //vstm r0, {q1-q4} +- vst1.16 {q1-q2}, [r0] +- add r0, #32 +- vst1.16 {q3-q4}, [r0] +- sub r0, #32 +- vpop {d8-d15} +- pop {r4-r8} +- bx lr +-endfunc + + .align 4 + tr4f: +@@ -463,3 +369,11 @@ tr16: + .word 0x00500046 // 80, d2[2] = 70 + .word 0x0039002b // 57, d2[0] = 43 + .word 0x00190009 // 25, d2[2] = 9 ++ ++#define BIT_DEPTH 8 ++#include "hevc_idct_fn_neon.S" ++ ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "hevc_idct_fn_neon.S" ++ diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 5591807..b6c48ee 100644 +index 55918077e2..e708b7c074 100644 --- a/libavcodec/arm/hevcdsp_init_neon.c +++ b/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,11 +22,26 @@ +@@ -22,11 +22,41 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -2113,6 +3902,11 @@ index 5591807..b6c48ee 100644 void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + ++void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ +#ifdef RPI +void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, const int32_t tc[2], @@ -2123,44 +3917,201 @@ index 5591807..b6c48ee 100644 +void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, + uint8_t * src_l, + unsigned int no_f); ++ ++void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void 
ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); +#endif + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); -@@ -43,6 +58,31 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +@@ -34,14 +64,174 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs); + void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs); ++ + void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); + void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); + void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); - -+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +- ptrdiff_t stride); ++ ptrdiff_t stride); + -+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); + -+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); 
-+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); + -+void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, -+ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); + -+void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++#if RPI_HEVC_SAND ++void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, 
int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++#endif ++ ++void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++#if RPI_HEVC_SAND ++void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); + ++void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int 
sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++#endif + ++void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ - ptrdiff_t srcstride, int height, \ -@@ -58,6 +98,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -2176,227 +4127,110 @@ index 5591807..b6c48ee 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,25 +341,181 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } -+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, ++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, ++ MvField *curr, MvField *neigh, uint8_t *bs); ++ ++ ++static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t 
offset_table[32] = { 0 }; -+ int k, y, x; -+ int shift = 3; // BIT_DEPTH - 5 -+ int cwidth = 0; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ switch(cwidth){ -+ case 8: -+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 16: -+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 32: -+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 64: -+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ default: -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } ++ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); ++} ++static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); +} + -+static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++ ++#if SAO_FILTER_N == 6 ++static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++} ++static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, 
sao_offset_val, sao_left_class, 8, height); ++} ++static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++ ++#if RPI_HEVC_SAND ++static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height) +{ -+ // Width 32 already dealt with -+ // width 16 code works in double lines -+ if (width == 16 && (height & 1) == 0) { -+ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, -+ sao_offset_val_u, sao_left_class_u, -+ sao_offset_val_v, sao_left_class_v, -+ width, height); -+ } -+ else -+ { -+ const int shift = 3; // BIT_DEPTH - 5 -+ int k, y, x; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t offset_table_u[32] = { 0 }; -+ int8_t offset_table_v[32] = { 0 }; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ for (k = 0; k < 4; k++) -+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) -+ { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} -+ -+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 
0 : -1)) -+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, -+ int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) +{ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val[8]; // padding of 3 for vld -+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE); -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ int cwidth = 0; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]]; -+ } -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ switch (cwidth) { -+ case 32: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ case 64: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ default: -+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ int diff0 = CMP(src[x], src[x + a_stride]); -+ int diff1 = CMP(src[x], src[x + b_stride]); -+ int idx = diff0 + diff1; -+ if (idx) -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} ++#endif ++#endif + + -+static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); + -+ if (width == 32 && (height & 7) == 0) { -+ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); -+ } -+ else -+ { -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 
4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val_u[8]; // padding of 3 for vld -+ int8_t sao_offset_val_v[8]; // padding of 3 for vld -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; -+ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; -+ } -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } -+} -+#undef CMP -+ -+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ MvField *curr, MvField *neigh, uint8_t *bs); ++#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160 ++#error SAO edge src stride not 160 - value used in .S ++#endif + av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { @@ -2407,7 +4241,9 @@ index 5591807..b6c48ee 100644 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon; +#ifdef RPI + c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; + c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; @@ -2416,21 +4252,68 @@ index 5591807..b6c48ee 100644 c->idct[0] = ff_hevc_transform_4x4_neon_8; c->idct[1] = ff_hevc_transform_8x8_neon_8; c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; -@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) - c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; - c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8; +- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; +- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; +- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; +- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; ++ c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; ++ c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; ++ c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; ++ c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = 
ff_hevc_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8; ++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8; ++#endif c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; -+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { -+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; -+ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; -+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; -+ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; -+ } -+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8; ++#endif ++#endif put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -2452,22 +4335,711 @@ index 5591807..b6c48ee 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +516,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } ++ else if 
(bit_depth == 10) { ++ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10; ++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10; ++#endif ++ c->idct[0] = ff_hevc_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10; ++ c->transform_add[0] = ff_hevc_add_residual_4x4_neon_10; ++ c->transform_add[1] = ff_hevc_add_residual_8x8_neon_10; ++ c->transform_add[2] = ff_hevc_add_residual_16x16_neon_10; ++ c->transform_add[3] = ff_hevc_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10; ++#endif ++ c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10; ++ ++#if 
SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10; ++#endif ++#endif ++ } + + assert(offsetof(MvField, mv) == 0); + assert(offsetof(MvField, ref_idx) == 8); + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; } +diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..7cc5cd5e5c +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_res16_neon.S +@@ -0,0 +1,610 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++#define BIT_DEPTH 10 ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ add_residual4x4( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1] ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vld1.16 {d0}, [r0, :64], r2 ++ vld1.16 {d1}, [r0, :64], r2 ++ vld1.16 {d2}, [r0, :64], r2 ++ vld1.16 {d3}, [r0, :64], r2 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0, :64], r2 ++ vst1.16 {d1}, [r0, :64], r2 ++ vst1.16 {d2}, [r0, :64], r2 ++ vst1.16 {d3}, [r0, :64], r2 ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vdup.i16 q9, r3 ++ vld1.16 {d0}, [r0, :64], r1 ++ vld1.16 {d1}, [r0, :64], r1 ++ vdup.16 q15, r2 ++ vld1.16 {d2}, [r0, :64], r1 ++ vld1.16 {d3}, [r0, :64], r1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ sub r0, r0, r1, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0, :64], r1 ++ vst1.16 {d1}, [r0, :64], r1 ++ vst1.16 {d2}, [r0, :64], r1 ++ vst1.16 {d3}, [r0, :64], r1 ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual8x8( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #2 ++1: ++ vldm r1!, {q10-q13} ++ vld1.16 {q0}, [r0, :128], r2 ++ subs r12, #1 ++ vld1.16 {q1}, [r0, :128], r2 ++ vqadd.s16 q0, q10 ++ vld1.16 {q2}, [r0, :128], r2 ++ vqadd.s16 q1, q11 ++ vld1.16 {q3}, [r0, :128], r2 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {q0}, [r0, :128], r2 ++ vmin.s16 q2, q2, q9 ++ vst1.16 {q1}, [r0, :128], r2 ++ vmin.s16 q3, q3, q9 ++ vst1.16 {q2}, [r0, :128], r2 ++ vst1.16 {q3}, [r0, :128], r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #1 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual8x8_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function 
JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 ++ mov r12, #2 ++ vdup.16 q15, r2 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ vld1.16 {q0}, [r0, :128], r1 ++ subs r12, #1 ++ vld1.16 {q1}, [r0, :128], r1 ++ vqadd.s16 q0, q15 ++ vld1.16 {q2}, [r0, :128], r1 ++ vqadd.s16 q1, q15 ++ vld1.16 {q3}, [r0, :128], r1 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r1, lsl #2 ++ vmax.s16 q0, q8 ++ vmax.s16 q1, q8 ++ vmax.s16 q2, q8 ++ vmax.s16 q3, q8 ++ vmin.s16 q0, q9 ++ vmin.s16 q1, q9 ++ vst1.16 {q0}, [r0, :128], r1 ++ vmin.s16 q2, q9 ++ vst1.16 {q1}, [r0, :128], r1 ++ vmin.s16 q3, q9 ++ vst1.16 {q2}, [r0, :128], r1 ++ vst1.16 {q3}, [r0, :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual16x16( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #8 ++1: ++ vldm r1!, {q10-q13} ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. :128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0, :128], r2 ++ subs r12, #1 ++ vld1.16 {q2, q3}, [r0, :128] ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst1.16 {q0, q1}, [r0, :128], r2 ++ vst1.16 {q2, q3}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #4 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual16x16_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r12, #8 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. 
:128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0, :128], r1 ++ subs r12, #1 ++ vld1.16 {q2, q3}, [r0, :128] ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ sub r0, r1 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0, :128], r1 ++ vst1.16 {q2, q3}, [r0, :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual32x32( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #32 ++1: ++ vldm r1!, {q10-q13} ++ vldm r0, {q0-q3} ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vstm r0, {q0-q3} ++ add r0, r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #16 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual32x32_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r12, #32 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ vldm r0, {q0-q3} ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vstm r0, {q0-q3} ++ add r0, r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! 
++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! 
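++ @ vst2 re-interleaves the updated pairs; the first store post-increments
++ @ by 32 bytes and the second adds the stride (reduced by 32 above) so
++ @ that r0 steps to the next row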
++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vldm r1, {q10-q13} ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ add r3, r1, #(16*16*2) @ Offset to V ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, #32 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256]! 
++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..08a021d +index 0000000000..30113d9c93 --- /dev/null +++ b/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,862 @@ +@@ -0,0 +1,1882 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -2491,124 +5063,211 @@ index 0000000..08a021d +#include "libavutil/arm/asm.S" +#include "neon.S" + -+.macro init_sao_band -+ pld [r1] -+ vld1.8 {q0, q1}, [r2] // offset table -+ ldr r2, [sp, #0] // stride_dst -+ ldr r12, [sp, #4] // height -+ vmov.u8 q3, #128 -+.endm ++.set EDGE_SRC_STRIDE, 160 ++ ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 ++ vshr.u8 q13, q9, #3 ++ vadd.s8 q9, \Q_K128 ++ ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 + -+// 128 in q3 -+// input q8 - q11 -+.macro sao_band_64 -+ vtbl.8 d24, {d0, d1, d2, d3}, d24 -+ vadd.s8 q8, q3 -+ vtbl.8 d25, {d0, d1, d2, d3}, d25 -+ vadd.s8 q9, q3 -+ vtbl.8 d26, {d0, d1, d2, d3}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d0, d1, d2, d3}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0, d1, d2, d3}, d28 + vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0, d1, d2, d3}, d29 ++ vshr.u8 q12, q10, #3 ++ vadd.s8 q10, \Q_K128 + vqadd.s8 q9, q13 -+ vtbl.8 d30, {d0, d1, d2, d3}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d0, d1, d2, d3}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 ++ vshr.u8 q13, q11, #3 ++ vadd.s8 q11, \Q_K128 ++ ++ vsub.s8 q8, \Q_K128 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vsub.s8 q9, \Q_K128 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vqadd.s8 q11, q13 ++ vsub.s8 q10, \Q_K128 ++ vsub.s8 q11, \Q_K128 +.endm + -+function ff_hevc_sao_band_w8_neon_8, export=1 -+ init_sao_band -+1: subs r12, #8 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {d18}, [r1, :64], r3 -+ vld1.8 {d19}, [r1, :64], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {d20}, [r1, :64], r3 -+ vld1.8 {d21}, [r1, :64], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {d22}, [r1, :64], r3 -+ vld1.8 {d23}, [r1, :64], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ vst1.8 {d18}, [r0, :64], r2 -+ vst1.8 {d19}, [r0, :64], r2 -+ vst1.8 {d20}, [r0, :64], r2 -+ vst1.8 {d21}, [r0, :64], r2 -+ vst1.8 {d22}, [r0, :64], r2 -+ vst1.8 {d23}, [r0, :64], r2 -+ bne 1b ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 + -+ bx lr ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ ++ vqadd.s8 q8, q12 ++ vsub.s8 q8, \Q_K128 ++.endm ++ ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm ++ ++@ Clobbers q12 
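++@ Applies a band offset to the 16 (16-bit) pixels in Q0/Q1: vshrn by
++@ (bit_depth - 5) reduces each sample to its 5-bit band index, the signed
++@ offset is looked up in the 32-entry byte tables XLAT0/XLAT1 (same table
++@ for luma, separate U/V tables for chroma), widen-added back onto the
++@ pixel and clamped to [Q_MIN, Q_MAX]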
++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++.endm ++ ++ ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma ++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of ++@ precision ++ ++@ This, somewhat nasty, bit of code builds the {d0-d3} translation ++@ array via the stack ++@ Given that sao_left_class > 28 can cause wrap we can't just poke ++@ all 4 bytes in at once ++@ ++@ It also loads other common regs ++ ++function band_load_y ++ vmov.i64 q0, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q1, #0 ++ ldr r12, [sp, #12] @ sao_left_class ++ ++ mov r4, sp ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND ++ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array ++ mov sp, r4 ++ ++ ldr r12, [sp, #20] @ height ++ pld [r1] ++ ++ sub r12, #1 ++ add r4, r1, r3 ++ bx lr +endfunc + -+function ff_hevc_sao_band_w16_neon_8, export=1 -+ init_sao_band -+1: subs r12, #4 -+ vld1.8 {q8}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {q9}, [r1, :128], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {q11}, [r1, :128], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8}, [r0, :128], r2 -+ vst1.8 {q9}, [r0, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r0, :128], r2 -+ bne 1b + -+ bx lr -+endfunc ++function band_load_c ++ vmov.i64 q2, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val1[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q3, #0 ++ ldr r12, [sp, #12] @ sao_left_class + -+function ff_hevc_sao_band_w32_neon_8, export=1 -+ init_sao_band -+1: subs r12, #2 -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128], r2 -+ vst1.8 {q10-q11}, [r0, :128], r2 -+ bne 1b ++ mov r4, sp @ Remember SP ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND + -+ bx lr -+endfunc ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array + -+function ff_hevc_sao_band_w64_neon_8, export=1 -+ init_sao_band ++ @ And again for the 2nd set ++ ldr r12, [r4, #16] @ &sao_offset_val2[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ ldr r12, [r4, #20] @ sao_left_class2 ++ ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! 
++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array ++ ++ mov sp, r4 ++ ++ ldr r12, [sp, #28] @ height ++ pld [r1] + -+ push {r4, lr} + subs r12, #1 -+ mov r4, r1 -+ it ne -+ addne r4, r3 ++ add r4, r1, r3 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_64_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 + +1: subs r12, #1 + vldm r1, {q8-q11} + pld [r4] -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 + add r1, r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ + it ne + addne r4, r3 + vstm r0, {q8-q11} @@ -2618,8 +5277,113 @@ index 0000000..08a021d + pop {r4, pc} +endfunc + ++@ ff_hevc_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+@ ff_hevc_sao_band_c_w64_neon_8( ++function ff_hevc_sao_band_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #2 ++ vld1.8 { q8, q9 }, [r1, :128], r3 ++ vld1.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #4 ++ vld1.8 { q8}, [r1, :128], r3 ++ vld1.8 { q9}, [r1, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 { q9}, [r0, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.32 {d16[0]}, [r1, :32], r3 ++ vld1.32 {d16[1]}, [r1, :32], r3 ++ vld1.32 {d17[0]}, [r1, :32], r3 ++ vld1.32 {d17[1]}, [r1, :32], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.32 {d16[0]}, [r0, :32], r2 ++ vst1.32 {d16[1]}, [r0, :32], r2 ++ vst1.32 {d17[0]}, [r0, :32], r2 ++ vst1.32 {d17[1]}, [r0, :32], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_32_neon_8( +@ uint8_t * dst [r0] +@ uint8_t 
* src [r1] +@ uint32_t dst_stride [r2] @@ -2631,707 +5395,1535 @@ index 0000000..08a021d +@ int width sp[16] +@ int height sp[20] + -+@ As this is often done in-place on the frame buffer it is worth preloading -+@ the pixel values but we want to beware of loading ouside our buffer to avoid -+@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++function ff_hevc_sao_band_c_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c + -+function ff_hevc_sao_band_c_neon_8, export=1 -+ mov r12, sp -+ push {r4-r8, lr} // 24 bytes ++ vmov.i8 q15, #128 ++ sub r3, #32 ++ sub r2, #32 + -+ ldm r12, {r4-r7} ++1: subs r12, #1 ++ vld2.8 { q8, q9 }, [r1, :128]! ++ vld2.8 {q10, q11}, [r1, :128], r3 + -+ add r4, #2 -+ add r6, #2 -+ vld1.16 {d16}, [r4] @ Unaligned -+ lsl r5, r5, #3 -+ vld1.16 {d18}, [r6] -+ pld [r1] -+ vmov.i8 d17, #0 -+ mov r4, r1 -+ vmov.i8 d19, #0 -+ lsl r7, r7, #3 -+ vdup.8 q1, r5 -+ ldr r5, [r12, #16] @ width -+ vdup.8 q2, r7 -+ ldr r12, [r12, #20] -+ vqmovn.s16 d0, q8 -+ cmp r5, #16 @ At some point we may want a table lookup -+ vqmovn.s16 d1, q9 -+ vmov.i8 q3, #128 -+ beq 16f ++ pld [r4] + -+ @ d0 U lookup -+ @ d1 V lookup -+ @ q1 U raw offset -+ @ q2 V raw offset -+ @ q3 #128 ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + -+ @ r4 = r1 = src - Inteded for preload pointer -+ @ r12 = height ++ vst2.8 { q8, q9 }, [r0, :128]! ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ itt ne ++ addne r4, r3 ++ addne r4, #32 ++ ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ vmov.i8 q15, #128 ++ ++1: subs r12, #2 ++ vld2.8 { q8, q9 }, [r1, :128], r3 ++ vld2.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.8 {d16, d17}, [r1, :128], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #1 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ vuzp.8 d16, d17 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vzip.8 d16, d17 ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ vpush 
{q4-q7} ++ ++1: subs r12, #1 ++ vldm r1, {q4-q11} ++ add r1, r3 ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q4-q11} ++ add r0, r2 ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_64_neon_10, export=1 ++ band_64_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_32_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_32_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ add r1, r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_32_neon_10, export=1 ++ band_32_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_16_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ++1: subs r12, #2 ++ vld1.16 { q8, q9 }, [r1, :128], r3 ++ vld1.16 {q10, q11}, [r1, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8, q9 }, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_16_neon_10, export=1 ++ band_16_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_8_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ldr lr, [sp, #16] ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.16 { q8}, [r1, :128], r3 ++ vld1.16 { q9}, [r1, :128], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8}, [r0, :128], r2 ++ vst1.16 { q9}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.16 {d16}, [r1, :64], r3 ++ vld1.16 {d17}, [r1, :64], r3 ++ vld1.16 {d18}, [r1, :64], r3 ++ vld1.16 {d19}, [r1, :64], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 {d16}, [r0, :64], r2 ++ vst1.16 {d17}, [r0, :64], r2 ++ vst1.16 {d18}, [r0, :64], r2 ++ vst1.16 {d19}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_8_neon_10, export=1 ++ band_8_16 10 ++endfunc ++ ++ ++@ ff_hevc_sao_band_c_32_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_32_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ vpush {q4-q7} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ 
vdup.i16 q15, lr ++ sub r2, #96 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q4, q5 }, [r1, :128]! ++ vld2.16 { q6, q7 }, [r1, :128]! ++ vld2.16 { q8, q9 }, [r1, :128]! ++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ pld [r4] ++ sub r1, #96 ++ ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + -+ @ Might (unlikely) be called with height == 1 -+ subs r12, #1 + it ne + addne r4, r3 + -+1: -+ subs r12, #1 -+ vld2.8 {q8-q9}, [r1, :128]! -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ sub r1, #32 -+ vsub.u8 q15, q11, q2 -+ pld [r4] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 ++ vst2.16 { q4, q5 }, [r0, :128]! ++ vst2.16 { q6, q7 }, [r0, :128]! ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 + -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r3 @ Do not inc on final pass -+ vst2.8 {q8-q9}, [r0, :128]! -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 + bpl 1b + -+ pop {r4-r8, pc} -+ -+@ -- width 16 (UV pairs) -- -+16: -+ subs r12, #2 -+ it ne -+ addne r4, r4, r3, lsl #1 -+ -+1: -+ subs r12, #2 -+ vld2.8 {q8-q9}, [r1, :128], r3 -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ pld [r4] -+ vsub.u8 q15, q11, q2 -+ pld [r4, r3] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 -+ -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r4, r3, lsl #1 -+ vst2.8 {q8-q9}, [r0, :128], r2 -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ bpl 1b -+ -+ pop {r4-r8, pc} ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm + ++function ff_hevc_sao_band_c_32_neon_10, export=1 ++ band_c_32_16 10 +endfunc + + -+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 -+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 -+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2 -+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 -+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 ++@ ff_hevc_sao_band_c_16_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_16_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ sub r2, #32 ++ sub r3, #32 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q8, q9 }, [r1, :128]! 
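++ @ One row is 16 CbCr pairs (64 bytes): the first vld2 post-increments by
++ @ 32 and the second adds the source stride, which was reduced by 32
++ @ above, leaving r1 at the start of the next row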
++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} +.endm + ++function ff_hevc_sao_band_c_16_neon_10, export=1 ++ band_c_16_16 10 ++endfunc + -+// input -+// a in q0 - q3 -+// c in q4 - q7 -+// b in q8 - q11 -+// offset table r4,r5 and r6,r7 -+// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C -+// output in q0 - q3 -+// clobbers q12 - q15 + -+@ a <- c <- b ++@ ff_hevc_sao_band_c_8_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_8_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ ldr lr, [sp, #24] @ width ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.16 { q8, q9 }, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #2 ++ vld2.16 {d16, d17}, [r1, :128], r3 ++ vld2.16 {d18, d19}, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 {d16, d17}, [r0, :128], r2 ++ vst2.16 {d18, d19}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_c_8_neon_10, export=1 ++ band_c_8_16 10 ++endfunc ++ ++ ++@ ============================================================================= ++@ SAO EDGE ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ [r5] translate values +@ -+@ It appears that Neon can stall if you try and use results too soon so we try to -+@ spread our instruction out ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 + -+.macro edgeidx64 ++function edge_64b_body_8 + -+ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q1 -+ vcgt.u8 q14, q6, q2 -+ vcgt.u8 q15, q7, q3 ++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 + -+ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 -+ vcgt.u8 q1, q1, q5 -+ vcgt.u8 q2, q2, q6 -+ vcgt.u8 q3, q3, q7 ++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q5 ++ vcgt.u8 q2, q6 ++ vcgt.u8 q3, q7 + -+ vsub.s8 q0, q0, q12 // a = sign(c-a) -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 @ a = sign(c-a) ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q9 -+ vcgt.u8 q14, q6, q10 -+ vcgt.u8 q15, q7, q11 ++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 + -+ vsub.s8 q0, q0, q12 -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 -+ vcgt.u8 q13, q9, q5 -+ vcgt.u8 q14, q10, q6 -+ vcgt.u8 q15, q11, q7 ++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 + -+ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) -+ vadd.s8 q1, q1, q13 -+ vmov.u8 q12, #2 -+ vadd.s8 q2, q2, q14 -+ vadd.s8 q3, q3, q15 ++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q14 ++ vadd.s8 q3, q15 + -+ vadd.s8 q0, q0, q12 -+ vadd.s8 q1, q1, q12 -+ @ whilst vmov dn, rm, rn exists it is a vfp instruction -+ @ and causes a stall till neon pipe empty - so don't do that! 
-+ vmov d26[0], r4 -+ vmov d26[1], r5 -+ vmov d27[0], r6 -+ vmov d27[1], r7 -+ vadd.s8 q2, q2, q12 -+ vuzp.8 q0, q1 -+ vmov.u8 q15, #128 -+ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ vadd.s8 q0, q12 ++ vadd.s8 q1, q12 + -+ vtbl.8 d0, {d26}, d0 -+ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ vld1.8 {d26, d27}, [r5] + -+ vtbl.8 d1, {d26}, d1 -+ vadd.s8 q14, q5, q15 ++ vadd.s8 q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) + -+ vtbl.8 d2, {d27}, d2 -+ vuzp.8 q2, q3 ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add + -+ vtbl.8 d3, {d27}, d3 ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 + -+ vtbl.8 d4, {d26}, d4 -+ vzip.8 q0, q1 ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 + -+ vtbl.8 d5, {d26}, d5 -+ vqadd.s8 q0, q0, q12 -+ vqadd.s8 q1, q1, q14 -+ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d3, {d27}, d3 + -+ vtbl.8 d6, {d27}, d6 -+ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 + -+ vtbl.8 d7, {d27}, d7 -+ vzip.8 q2, q3 ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q12 ++ vqadd.s8 q1, q14 ++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add + -+ vsub.s8 q0, q0, q15 -+ vqadd.s8 q2, q2, q12 -+ vqadd.s8 q3, q3, q14 -+ vsub.s8 q1, q1, q15 -+ vsub.s8 q2, q2, q15 -+ vsub.s8 q3, q3, q15 ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add + -+.endm ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q15 ++ vqadd.s8 q2, q12 ++ vqadd.s8 q3, q14 ++ vsub.s8 q1, q15 ++ vsub.s8 q2, q15 ++ vsub.s8 q3, q15 ++ ++ bx lr ++endfunc ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ r4 upper clip value ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_16 ++ ++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q1 ++ vcgt.u16 q14, q6, q2 ++ vcgt.u16 q15, q7, q3 ++ ++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u16 q1, q1, q5 ++ vcgt.u16 q2, q2, q6 ++ vcgt.u16 q3, q3, q7 ++ ++ vsub.s16 q0, q0, q12 // a = sign(c-a) ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q9 ++ vcgt.u16 q14, q6, q10 ++ vcgt.u16 q15, q7, q11 ++ ++ vsub.s16 q0, q0, q12 ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u16 q13, q9, q5 ++ vcgt.u16 q14, q10, q6 ++ vcgt.u16 q15, q11, q7 ++ ++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s16 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s16 q2, q2, q14 ++ vadd.s16 q3, q3, q15 ++ ++ vmovn.s16 d0, q0 ++ vmovn.s16 d1, q1 ++ vmovn.s16 d2, q2 ++ vmovn.s16 d3, q3 ++ ++ vuzp.8 q0, q1 ++ ++ vld1.8 {d26, d27}, [r5] ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ ++ vtbl.8 d0, {d26}, d0 ++ vtbl.8 d1, {d26}, d1 ++ vtbl.8 d2, {d27}, d2 ++ vtbl.8 d3, {d27}, d3 ++ ++ vmov.i64 q12, #0 ++ ++ vzip.8 q0, q1 ++ ++ vdup.i16 q13, r4 ++ ++ @ Avoid overwrite whilst widening ++ vaddw.s8 q2, q6, d2 ++ vaddw.s8 q3, q7, d3 ++ vaddw.s8 q1, q5, d1 ++ vaddw.s8 q0, q4, d0 ++ ++ @ now clip ++ clip16_4 q2, q3, q1, q0, q12, q13 + -+function edge_w64_body -+ edgeidx64 -+ vstm r0, {q0-q3} -+ add r0, r0, r2 + bx lr +endfunc + -+.macro init_edge_64 -+ push {r4-r8,lr} -+ ldr r12, [sp, #24] // height -+ ldr r5, [sp, #28] // sao_offset_val_table -+ ldrd r4, r5, [r5] -+ mov r6, r4 -+ mov r7, r5 -+.endm + -+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #15 -+ vext.8 q1, q4, q5, #15 -+ vext.8 q2, q5, q6, #15 -+ vext.8 q3, q6, q7, #15 -+ // load b -+ vext.8 q8, q4, q5, #1 -+ vext.8 q9, q5, q6, #1 -+ vext.8 q10, q6, q7, #1 -+ vext.8 q11, q7, q12, #1 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3, q9, q10 ++@ ++@ d16, d17 (q8) xlat U, V ++@ q14.u8 #2 ++@ q15.u8 #128 ++ ++function edge_16b_body_8 ++ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0 ++ ++ vsub.s8 q0, q3 ++ vsub.s8 q10, q9 ++ vadd.s8 q0, q10 @ a = sign(c-a) ++ ++ vadd.s8 q0, q14 ++ vuzp.8 d0, d1 ++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ vqadd.s8 q0, q3 ++ vsub.s8 q0, q15 ++ ++ bx lr +endfunc + -+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, r3 ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3 ++@ ++@ q12, #0 ++@ d16, d17 xlat U, V ++@ q14.u8 #2 ++@ q15.u16 max ++function edge_16b_body_16 ++ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0 ++ vsub.s16 q0, q3 @ a = sign(c-a) ++ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.s16 q0, q3 ++ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b) ++ ++ vmovn.s16 d0, q0 ++ @ d1 will have random contents that we transform but ++ @ that doesn't matter as we then discard them ++ vuzp.8 d0, d1 ++ ++ vadd.s8 q0, q0, q14 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ ++ vaddw.s8 q0, q1, d0 ++ ++ @ now clip ++ vmax.s16 q0, q12 ++ vmin.s16 q0, q15 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_edge_[c_]xx_neon( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only ++@ int eo, [sp, #sp_base + 0] ++@ int width, [sp, #sp_base + 4] ++@ int height) [sp, #sp_base + 8] ++ ++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0 ++ push {r4-r6, lr} @ 16 bytes ++.set sp_base, 16 ++ ++@ Build translate registers ++@ As translate values can only be 0-4 we don't care about junk in the rest ++@ of the register ++ mov r12, #2 ++.if \is_chroma ++ ldr r4, [sp, #16] ++.set sp_base, sp_base + 4 ++.endif ++ vld1.8 {d16[2]}, [r3], r12 ++ vld1.8 {d16[0]}, [r3], r12 ++ vld1.8 {d16[1]}, [r3], r12 ++ vld1.8 {d16[3]}, [r3], r12 ++ vld1.8 {d16[4]}, [r3] ++.if \is_chroma ++ vld1.8 {d17[2]}, [r4], r12 ++ vld1.8 {d17[0]}, [r4], r12 ++ vld1.8 {d17[1]}, [r4], r12 ++ vld1.8 {d17[3]}, [r4], r12 ++ vld1.8 {d17[4]}, [r4] ++.else ++ vmov d17, d16 ++.endif ++ ++@ Setup constant registers ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++.else ++ vmov.u8 q15, #128 ++.endif ++ vmov.u8 q14, #2 ++.endif ++ movw r3, EDGE_SRC_STRIDE ++ ++@ If setup_64b we need the xlat table on the stack and q4-q7 saved ++.if \setup_64b ++ sub r5, sp, #16 ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++@ Get jump address ++@ We have a special case for width 4 as the calling code doesn't detect it ++@ If we may 
have w4 then we add a 2nd jump table after the 1st ++.if \check_w4 ++ ldr r12, [sp, #sp_base + 4] @ width ++ cmp r12, #8 ++.endif ++ ldr r12, [sp, #sp_base + 0] @ e0 ++ adr r6, \jump_tab ++.if \check_w4 ++ it lt ++ addlt r6, #16 ++.endif ++ ldr r6, [r6, r12, lsl #2] ++ ++ ldr r12, [sp, #sp_base + 8] @ height ++ ++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes ++.if \do2 ++ push {r0, r1, r6, r12} ++ blx r6 ++ pop {r0, r1, r6, r12} ++ ++ add r0, #64 ++ add r1, #64 ++.endif ++ ++ blx r6 ++ ++@ Tidy up & return ++.if \setup_64b ++ vpop {q4-q8} @ spurious but harmless load of q8 ++.endif ++ pop {r4-r6, pc} ++.endm ++ ++ ++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 ++.endm ++ ++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1 ++.endm ++ ++ ++.macro edge_64b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #8 ++1: vldm r1, {d7-d16} ++ subs r12, #1 ++ add r1, r3 + // load a -+ vld1.8 {q0-q1}, [r1, :128]! -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ sub r1, #32 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+1: subs r12, #1 ++ vext.8 q0, q3, q4, #(16 - \pb) ++ vext.8 q1, q4, q5, #(16 - \pb) ++ vext.8 q2, q5, q6, #(16 - \pb) ++ vext.8 q3, q6, q7, #(16 - \pb) + // load b -+ vld1.8 {q8-q9}, [r1, :128]! -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ sub r1, #32 -+ bl edge_w64_body ++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vext.8 q10, q6, q7, #\pb ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_32bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {q4-q5}, [r1] ++ sub r1, #\pb ++ vld1.8 {q0-q1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q8-q9}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {q6-q7}, [r1] ++ sub r1, #\pb ++ vld1.8 {q2-q3}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q10-q11}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_16b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ sub r3, #\pb * 2 ++ ++1: subs r12, #1 ++ ++ vld1.64 {q0}, [r1] @ load a ++ add r1, #\pb ++ vld1.64 {q1}, [r1, :128] @ load c ++ add r1, #\pb ++ vld1.64 {q2}, [r1], r3 @ load b ++ ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {d2}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d0}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d4}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {d3}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d5}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #4 ++ ++ vld1.32 {d2[0]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[0]}, [r1] ++ add r1, #(\pb * 2) ++ vld1.32 {d4[0]}, [r1], r3 @ R ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[1]}, [r1], r3 @ L ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 @ R ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[1]}, [r1], r3 @ M ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], 
r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++ ++.macro edge_64b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0-q1}, [r1, :128]! ++ vld1.8 {q2-q3}, [r1, :128], r3 ++ sub r1, #32 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q8-q9}, [r1, :128]! ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ sub r1, #32 ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bgt 1b ++ bx r6 ++.endm + -+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 ++.macro edge_32bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 + // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ sub r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #31 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #1 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #33 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vld1.8 {q0-q1}, [r1, :128], r3 ++ vld1.8 {q4-q5}, [r1, :128], r3 + -+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #33 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #1 -+ vld1.8 {q8-q9}, [r1]! 
-+ vld1.8 {q10-q11}, [r1] -+ sub r1, #31 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov q2, q4 ++ vmov q3, q5 ++ vld1.8 {q8-q9}, [r1, :128], r3 ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ vmov q6, q8 ++ vmov q7, q9 + ++ bl \body_fn + -+@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( -+@ uint8_t *_dst, r0 -+@ uint8_t *_src, r1 -+@ ptrdiff_t stride_dst, r2 -+@ ptrdiff_t stride_src, r3 -+@ int height, sp[0] -+@ int16_t *sao_offset_table_u, sp[4] -+@ int16_t *sao_offset_table_v); sp[8] -+@ int eo sp[12] ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 + -+function ff_hevc_sao_edge_c_w64_neon_8, export=1 -+ push {r4-r8,lr} // 6 reg = 24 -+ ldr r5, [sp, #28] // sao_offset_val_table_u -+ ldr r7, [sp, #32] // sao_offset_val_table_v -+ -+ @ Load and rearrange offsets -+ @ Also "convert" from 16bit to 8bit -+ ldrb r4, [r5, #2] -+ ldrb r8, [r5, #4] -+ ldrb r6, [r7, #2] -+ ldrb r12, [r7, #4] -+ orr r4, r4, r8, lsl #8 -+ orr r6, r6, r12, lsl #8 -+ ldrb r8, [r5, #6] -+ ldrb r12, [r7, #6] -+ orr r4, r4, r8, lsl #24 -+ orr r6, r6, r12, lsl #24 -+ ldrb r5, [r5, #8] -+ ldrb r7, [r7, #8] -+ -+ ldr r12, [sp, #36] // e0 -+ adr r8, edge_c_tbl_w64 -+ ldr r8, [r8, r12, lsl #2] -+ -+ ldr r12, [sp, #24] // height -+ vpush {d8-d15} -+ mov pc, r8 -+ -+edge_c_tbl_w64: -+ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 -+ -+ff_hevc_sao_edge_c_eo0_w64_neon_8: -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #14 -+ vext.8 q1, q4, q5, #14 -+ vext.8 q2, q5, q6, #14 -+ vext.8 q3, q6, q7, #14 -+ // load b -+ vext.8 q8, q4, q5, #2 -+ vext.8 q9, q5, q6, #2 -+ vext.8 q10, q6, q7, #2 -+ vext.8 q11, q7, q12, #2 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+ -+ff_hevc_sao_edge_c_eo1_w64_neon_8: -+ sub r1, r3 -+ // load a -+ vldm r1, {q0-q3} -+ add r1, r3 -+ // load c -+ vldm r1, {q4-q7} -+ add r1, r3 -+1: subs r12, #1 -+ // load b -+ vldm r1, {q8-q11} -+ add r1, r3 -+ bl edge_w64_body + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q8 ++ vmov.64 q1, q9 ++ + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vmov.64 q4, q10 ++ vmov.64 q5, q11 ++ bgt 1b ++ bx r6 ++.endm + -+ff_hevc_sao_edge_c_eo2_w64_neon_8: -+1: sub r1, r3 ++.macro edge_16b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1, :128], r3 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q2}, [r1, :128], r3 ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ // copy c to a ++ vmov.64 q0, q1 ++ // copy b to c ++ vmov.64 q1, q2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {d0}, [r1, :64], r3 ++ vld1.8 {d2}, [r1, :64], r3 ++ ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov.64 d1, d2 ++ vld1.8 {d4}, [r1, :64], r3 ++ vld1.8 {d5}, [r1, :64], r3 ++ vmov.64 d3, d4 ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ ++ // copy c to a ++ vmov.64 d0, d4 ++ // copy b to c ++ vmov.64 d2, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e1, body_fn ++ mov r6, lr ++debug_me: ++ sub r1, r3 ++ // load a ++ vld1.32 {d0[0]}, [r1], r3 ++ vld1.32 {d0[1]}, [r1], r3 ++ ++1: subs r12, #4 ++ @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.32 {d4[0]}, [r1], r3 ++ vld1.32 {d4[1]}, [r1], r3 ++ vld1.32 {d5[0]}, [r1], r3 ++ vld1.32 {d5[1]}, [r1], r3 ++ ++ vmov.32 d1, d4 ++ vext.32 d2, d0, d4, #1 ++ vext.32 d3, d4, d5, #1 ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ vmov.32 d0, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_64b_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #32 ++ sub r3, #(32 - \pb) ++ ++1: sub r1, r3 + // load a + // TODO: fix unaligned load + // don't reload a like in eo1 -+ sub r1, #2 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #30 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #34 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ // load b ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(64 + \pb) ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b + -+ff_hevc_sao_edge_c_eo3_w64_neon_8: -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #2 -+ vld1.8 {q0-q1}, [r1]! 
-+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #34 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #30 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc -+ -+ -+.macro init_edge_32 -+ ldr r12, [sp, #4] // sao_offset_val_table -+ vld1.32 {d31}, [r12] -+ ldr r12, [sp] // height ++ add r3, #(32 - \pb) ++ bx r6 +.endm + -+.macro diff out0, tmp0, in0, in1 -+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 ++.macro edge_32bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ vld1.8 {q0-q1}, [r1], r3 ++ vld1.8 {q2-q3}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {q4-q5}, [r1, :128], r3 ++ vld1.8 {q6-q7}, [r1, :128] ++ // load b ++ add r1, #\pb ++ vld1.8 {q8-q9}, [r1], r3 ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0], r2 ++ vst1.8 {q2-q3}, [r0], r2 ++ bgt 1b ++ ++ bx r6 +.endm + -+.macro table32 -+ vmov.s8 q10, #2 -+ vadd.s8 q0, q10 -+ vadd.s8 q1, q10 -+ vmov.s8 q10, #128 -+ vtbl.8 d0, {d31}, d0 -+ vadd.s8 q11, q2, q10 -+ vtbl.8 d1, {d31}, d1 -+ vadd.s8 q12, q3, q10 -+ vtbl.8 d2, {d31}, d2 -+ vqadd.s8 q11, q0 -+ vtbl.8 d3, {d31}, d3 -+ vqadd.s8 q12, q1 -+ vsub.s8 q0, q11, q10 -+ vsub.s8 q1, q12, q10 -+ vst1.8 {q0-q1}, [r0, :128], r2 ++.macro edge_16b_e2, body_fn, pb ++ mov r6, lr ++ add r3, #\pb ++ ++1: sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++ // load b ++ vld1.8 {q2}, [r1] ++ sub r1, #\pb ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 +.endm + -+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ sub r1, #4 -+1: subs r12, #1 -+ vld1.8 {q13-q14}, [r1]! 
-+ vld1.32 d30, [r1], r3 -+ sub r1, #32 -+ // a -+ vext.8 q0, q13, q14, #3 -+ vext.8 q1, q14, q15, #3 -+ vshr.u64 d24, d30, #24 -+ // c -+ vext.8 q2, q13, q14, #4 -+ vext.8 q3, q14, q15, #4 -+ vshr.u64 d16, d30, #32 -+ // diff0 -+ diff32 q13, q14, q4, q5, q0, q1, q2, q3 -+ diff d18, d25, d24, d16 -+ // -diff1 -+ vext.s8 q0, q13, q14, #1 -+ vext.s8 q1, q14, q9, #1 ++.macro edge_8bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb + -+ vsub.s8 q0, q13, q0 //diff0 + diff1 -+ vsub.s8 q1, q14, q1 -+ table32 -+ bne 1b -+ vpop {q4-q7} ++1: sub r1, r3 ++ vld1.8 {d0}, [r1], r3 ++ vld1.8 {d1}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {d2}, [r1, :64], r3 ++ vld1.8 {d3}, [r1, :64] ++ // load b ++ add r1, #\pb ++ vld1.8 {d4}, [r1], r3 ++ vld1.8 {d5}, [r1] ++ sub r1, #(\pb * 2) + -+ bx lr ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ @ line 0 {d0[0], -, - } r1 lo ++ vld1.32 {d0[0]}, [r1], r3 ++ subs r12, #4 ++ @ Line 1 {d0[1], d2[0], - } r1 lo ++ vld1.32 {d0[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d2[0]}, [r1], r3 ++ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb * 2 ++ vld1.32 {d4[0]}, [r1], r3 ++ @ Line 2 {d1[1], d3[0], d4[1]} r1 hi ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb * 2 ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1], r3 ++ @ Line 3 {-, d3[1], d5[0]} r1 mid ++ vld1.32 {d3[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 ++ @ Line 4 {-, -, d5[1]} r1 hi ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_64b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_64b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_32bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_32bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_16b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_16b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_8bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_8bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_4bx4_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_4bx4_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_64b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_64b_e0 \body_fn, \pb ++10: edge_64b_e1 \body_fn ++20: edge_64b_e2 \body_fn, \pb ++30: edge_64b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_32bx2_e0 \body_fn, \pb ++10: edge_32bx2_e1 \body_fn ++20: edge_32bx2_e2 \body_fn, \pb ++30: edge_32bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_16b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_32bx2_e0 \body_fn_64b, \pb ++10: edge_32bx2_e1 \body_fn_64b ++20: edge_32bx2_e2 \body_fn_64b, \pb ++30: edge_32bx2_e3 \body_fn_64b, \pb ++5: edge_16b_e0 \body_fn_16b, \pb 
++15: edge_16b_e1 \body_fn_16b ++25: edge_16b_e2 \body_fn_16b, \pb ++35: edge_16b_e3 \body_fn_16b, \pb ++.endm ++ ++.macro edge_16b_8bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++5: edge_8bx2_e0 \body_fn, \pb ++15: edge_8bx2_e1 \body_fn ++25: edge_8bx2_e2 \body_fn, \pb ++35: edge_8bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_8bx2_4bx4_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_8bx2_e0 \body_fn, \pb ++10: edge_8bx2_e1 \body_fn ++20: edge_8bx2_e2 \body_fn, \pb ++30: edge_8bx2_e3 \body_fn, \pb ++5: edge_4bx4_e0 \body_fn, \pb ++15: edge_4bx4_e1 \body_fn ++25: edge_4bx4_e2 \body_fn, \pb ++35: edge_4bx4_e3 \body_fn, \pb ++.endm ++ ++@ void ff_hevc_sao_edge_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_8_neon_8, export=1 ++ edge_16b_init 8, 0, 1, 99f ++99: ++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ // load a -+ sub r1, r3 -+ vld1.8 {q0-q1}, [r1, :128], r3 -+ // load c -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a ) -+1: subs r12, #1 -+ // load b -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b ) -+ vadd.s8 q0, q4, q12 //diff0 + diff1 -+ vadd.s8 q1, q5, q13 -+ table32 -+ // CMP ( c, a ) -+ vneg.s8 q12, q4 -+ vneg.s8 q13, q5 -+ // c -+ vmov.64 q2, q8 -+ vmov.64 q3, q9 -+ bne 1b -+ vpop {q4-q7} -+ bx lr ++@ void ff_hevc_sao_edge_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_8, export=1 ++ edge_16b_init 8, 0, 0, 99f ++99: ++ edge_16b_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {d8-d15} -+ // load a -+ sub r1, r3 -+ sub r1, #8 -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #7 -+ vext.8 q1, q11, q12, #7 -+ // load c -+ vld1.8 {d9}, [r1, :64]! -+ vld1.8 {q2-q3}, [r1, :64], r3 -+ sub r1, #8 -+ vext.8 q4, q4, q2, #15 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! 
-+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #9 -+ vext.8 q9, q11, q12, #9 -+ vext.8 q6, q10, q11, #8 -+ vext.8 q7, q11, q12, #8 -+ vext.8 q5, q10, q11, #7 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 -+ // inputs for next loop iteration -+ // a -+ vmov.8 q0, q4 -+ vext.8 q1, q2, q3, #15 -+ // c -+ vmov.8 q2, q6 -+ vmov.8 q3, q7 -+ vmov.8 q4, q5 -+ bne 1b -+ vpop {d8-d15} -+ bx lr ++@ void ff_hevc_sao_edge_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1 -+ init_edge_32 -+ sub r1, r3 -+ // load a -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #1 -+ vext.8 q1, q11, q12, #1 -+ // load c -+ vld1.8 {q2-q3}, [r1, :64]! -+ vld1.8 {d30}, [r1, :64], r3 -+ sub r1, #40 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #7 -+ vext.8 q9, q11, q12, #7 -+ vext.8 q14, q12, q10, #7 ++@ void ff_hevc_sao_edge_64_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] + -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++function ff_hevc_sao_edge_64_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 1 ++endfunc + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 ++@ ff_hevc_sao_edge_c_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+ // inputs for next loop iteration -+ // a -+ vext.8 q0, q2, q3, #1 -+ vext.8 q1, q3, q15, #1 -+ // c -+ vext.8 q2, q8, q9, #1 -+ vext.8 q3, q9, q14, #1 -+ vext.8 d30, d28, d2, #1 -+ bne 1b -+ bx lr ++function ff_hevc_sao_edge_c_8_neon_8, export=1 ++ edge_16b_init 8, 1, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, 
#8] ++ ++function ff_hevc_sao_edge_8_neon_10, export=1 ++ edge_16b_init 10, 0, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_64_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++@ We simply split the 32 case into 2 vertical stripes ++@ and call the fns for w32 ++@ ++@ Calling code will always have src != dst so we don't have to worry ++@ about edge effects ++ ++function ff_hevc_sao_edge_64_neon_10, export=1 ++ edge_64b_init 10, 0, 1, 99f ++endfunc ++ ++@ void ff_hevc_sao_edge_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_8_neon_10, export=1 ++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 ++99: ++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_10, export=1 ++ edge_64b_init 10, 1, 1, 99f ++endfunc ++ ++ ++@ ff_hevc_sao_edge_c_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_10, export=1 ++ edge_64b_init 10, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 4 +endfunc + diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index 39713ed..25eb52b 100644 +index 1be52e7a12..bae5df4bc6 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -410,6 +410,8 @@ enum AVCodecID { @@ -3343,15 +6935,7 @@ index 39713ed..25eb52b 100644 /* various PCM "codecs" */ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs AV_CODEC_ID_PCM_S16LE = 0x10000, -@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext { - #define FF_BUG_DC_CLIP 4096 - #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. - #define FF_BUG_TRUNCATED 16384 -+#define FF_BUG_GMC_UNSUPPORTED 32768 - - /** - * strictly follow the standard (MPEG-4, ...). 
-@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext { +@@ -3205,6 +3207,9 @@ typedef struct AVCodecContext { #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) #define FF_PROFILE_H264_CAVLC_444 44 @@ -3361,7 +6945,7 @@ index 39713ed..25eb52b 100644 #define FF_PROFILE_VC1_SIMPLE 0 #define FF_PROFILE_VC1_MAIN 1 -@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext { +@@ -3515,6 +3520,13 @@ typedef struct AVCodecContext { #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 #endif @@ -3371,11 +6955,12 @@ index 39713ed..25eb52b 100644 + * @author jc (08/02/2016) + */ + void * get_buffer_context; ++ } AVCodecContext; AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx); diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h -index 1bf1c62..ccfa991 100644 +index 1bf1c620d6..ccfa991f60 100644 --- a/libavcodec/cabac.h +++ b/libavcodec/cabac.h @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; @@ -3394,140 +6979,11 @@ index 1bf1c62..ccfa991 100644 const uint8_t *bytestream_start; const uint8_t *bytestream; const uint8_t *bytestream_end; -diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 9d94b72..535ebf0 100644 ---- a/libavcodec/codec_desc.c -+++ b/libavcodec/codec_desc.c -@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = { - .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), - .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, - }, -+ { -+ .id = AV_CODEC_ID_H264_MVC, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "h264_mvc", -+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"), -+ .props = AV_CODEC_PROP_LOSSY, -+ }, - - /* various PCM "codecs" */ - { -diff --git a/libavcodec/h264.h b/libavcodec/h264.h -index efe3555..16358aa 100644 ---- a/libavcodec/h264.h -+++ b/libavcodec/h264.h -@@ -126,7 +126,9 @@ enum { - NAL_END_STREAM = 11, - NAL_FILLER_DATA = 12, - NAL_SPS_EXT = 13, -+ NAL_SPS_SUBSET = 15, - NAL_AUXILIARY_SLICE = 19, -+ NAL_SLICE_EXT = 20, - NAL_FF_IGNORE = 0xff0f001, - }; - -diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index ce4bab2..b9b0c78 100644 ---- a/libavcodec/h264_parser.c -+++ b/libavcodec/h264_parser.c -@@ -58,6 +58,8 @@ typedef struct H264ParseContext { - uint8_t parse_history[6]; - int parse_history_count; - int parse_last_mb; -+ int is_mvc; -+ int slice_ext; - } H264ParseContext; - - -@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, - } else if (state <= 5) { - int nalu_type = buf[i] & 0x1F; - if (nalu_type == NAL_SEI || nalu_type == NAL_SPS || -- nalu_type == NAL_PPS || nalu_type == NAL_AUD) { -+ nalu_type == NAL_PPS || nalu_type == NAL_AUD || -+ nalu_type == NAL_SPS_SUBSET) { - if (pc->frame_start_found) { - i++; - goto found; - } - } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA || -- nalu_type == NAL_IDR_SLICE) { -+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) { - state += 8; -+ -+ p->slice_ext = (nalu_type == NAL_SLICE_EXT); - continue; - } - state = 7; - } else { - p->parse_history[p->parse_history_count++] = buf[i]; -- if (p->parse_history_count > 5) { -+ if (p->parse_history_count > 8) { - unsigned int mb, last_mb = p->parse_last_mb; - GetBitContext gb; - -- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count); -+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext); - p->parse_history_count = 0; - mb= get_ue_golomb_long(&gb); - p->parse_last_mb = mb; -@@ -145,7 
+150,7 @@ found: - pc->frame_start_found = 0; - if (p->is_avc) - return next_avc; -- return i - (state & 5) - 5 * (state > 7); -+ return i - (state & 5) - 8 * (state > 7); - } - - static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb, -@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s, - } - } - -- parse_nal_units(s, avctx, buf, buf_size); -+ if (!p->is_mvc) -+ parse_nal_units(s, avctx, buf, buf_size); - - if (avctx->framerate.num) - avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1})); -@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx, - if ((state & 0xFFFFFF00) != 0x100) - break; - nalu_type = state & 0x1F; -- if (nalu_type == NAL_SPS) { -+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) { - has_sps = 1; - } else if (nalu_type == NAL_PPS) - has_pps = 1; -@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = { - .parser_close = h264_close, - .split = h264_split, - }; -+ -+static av_cold int init_mvc(AVCodecParserContext *s) -+{ -+ H264ParseContext *p = s->priv_data; -+ int ret = init(s); -+ if (ret < 0) -+ return ret; -+ -+ p->is_mvc = 1; -+ return 0; -+} -+ -+AVCodecParser ff_h264_mvc_parser = { -+ .codec_ids = { AV_CODEC_ID_H264_MVC }, -+ .priv_data_size = sizeof(H264ParseContext), -+ .parser_init = init_mvc, -+ .parser_parse = h264_parse, -+ .parser_close = h264_close, -+ .split = h264_split, -+}; diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c -index b478065..955e426 100644 +index c1fa67f67b..6f99021339 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c -@@ -41,8 +41,196 @@ +@@ -41,8 +41,346 @@ #include "hevc.h" #include "profiles.h" @@ -3535,33 +6991,19 @@ index b478065..955e426 100644 + #include "rpi_qpu.h" + #include "rpi_shader.h" + #include "rpi_shader_cmd.h" ++ #include "rpi_shader_template.h" + #include "rpi_zc.h" ++ #include "libavutil/rpi_sand_fns.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory + #define RPI_CACHE_UNIF_MVS 1 + -+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) -+ //#define RPI_SIMULATE_QPUS -+ #ifdef RPI_WORKER -+ #include "pthread.h" -+ #endif ++ #include "pthread.h" ++ #include "libavutil/atomic.h" + + static void worker_core(HEVCContext * const s); -+ -+ // We can pred any block height but annoyingly if we we do then the TMU cache -+ // explodes and it goes even slower :-( -+ #if 0 -+ #define Y_P_MAX_H 16 -+ #define Y_B_MAX_H 16 -+ #else -+ #define Y_P_MAX_H 64 -+ #define Y_B_MAX_H 64 -+ #endif +#endif + -+// #define DISABLE_MC -+ -+#define DISABLE_CHROMA 0 +#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards + +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) @@ -3573,8 +7015,6 @@ index b478065..955e426 100644 +} +# define av_mod_uintp2 av_mod_uintp2_c +#endif -+ -+#define Y_B_ONLY 0 + const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; @@ -3584,18 +7024,23 @@ index b478065..955e426 100644 +#define MC_DUMMY_X (-32) +#define MC_DUMMY_Y (-32) + -+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks -+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks -+// For each block of 64*64 the smallest block size is 8x4 -+// We also need an extra command for the setup information ++// UV still has min 4x4 pred ++// Allow for even spread +1 for setup, +1 for rounding ++// As we have load sharing this can (in theory) be exceeded so we have 
to ++// check after each CTU, but it is a good base size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) ++#define QPU_C_CMD_PER_CTU_MAX (4 * 4) ++ ++#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) ++#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) + -+#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 + +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + -+// TODO Chroma only needs 4 taps + +// Actual filter goes -ve, +ve, +ve, -ve using these values +static const uint32_t rpi_filter_coefs[8] = { @@ -3609,29 +7054,135 @@ index b478065..955e426 100644 + ENCODE_COEFFS( 2, 10, 58, 2) +}; + -+#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4))) ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn ++}; ++ ++static const int * const inter_pred_exit_c10_qpu[12] = { ++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn ++}; ++ ++static const int * const inter_pred_exit_y_qpu[12] = { ++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn ++}; ++ ++static const int * const inter_pred_exit_y10_qpu[12] = { ++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn ++}; ++ ++typedef struct ipe_chan_info_s ++{ ++ 
const unsigned int n; ++ const int * const * setup_fns; ++ const int * const * sync_fns; ++ const int * const * exit_fns; ++} ipe_chan_info_t; ++ ++typedef struct ipe_init_info_s ++{ ++ ipe_chan_info_t luma; ++ ipe_chan_info_t chroma; ++} ipe_init_info_t; ++ ++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 ++ { // 8 ++ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, ++ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} ++ }, ++ { // 9 ++ .luma = {0}, ++ .chroma = {0} ++ }, ++ { // 10 ++ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, ++ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} ++ } ++ ++}; ++ ++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) ++{ ++ const unsigned int n = ici->n; ++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ ++ ipe->n = n; ++ ipe->max_fill = q1_size - ipe->min_gap; ++ for(unsigned int i = 0; i < n; i++) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(ici->setup_fns[i]); ++ q->code_sync = qpu_fn(ici->sync_fns[i]); ++ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ } ++} ++ ++static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth) ++{ ++ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8; ++ ++ av_assert0(bit_depth >= 8 && bit_depth <= 16); ++ ++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); ++ ++ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob *const jb = s->jobs + i; ++ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma); ++ set_ipe_from_ici(&jb->luma_ip, &iii->luma); ++ } ++} ++ + +#endif + + -+#ifdef RPI_WORKER -+ -+typedef struct worker_global_env_s -+{ -+ volatile int arm_load; -+ pthread_mutex_t lock; -+ -+ unsigned int arm_y; -+ unsigned int arm_c; -+ unsigned int gpu_y; -+ unsigned int gpu_c; -+} worker_global_env_t; -+ -+static worker_global_env_t worker_global_env = -+{ -+ .lock = PTHREAD_MUTEX_INITIALIZER -+}; -+ ++#ifdef RPI + +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); @@ -3639,108 +7190,154 @@ index b478065..955e426 100644 +#define LOG_ENTER +#define LOG_EXIT + ++#define USE_SEM 1 ++ +// Call this when we have completed pass0 and wish to trigger pass1 for the current job -+static void worker_submit_job(HEVCContext *s) ++static void worker_submit_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_tail++; -+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_post(&s->jb0->sem_in); ++ s->jb0->pending = 1; ++ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb0 = s->jobs + s->pass0_job; ++ LOG_EXIT +} + +// Call this to say we have completed pass1 -+static void worker_complete_job(HEVCContext *s) ++static void worker_complete_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ 
pthread_mutex_lock(&s->worker_mutex); -+ s->worker_head++; -+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_t * const sem = &s->jb1->sem_out; ++ // Must set job no before signalling as otherwise rpi_do_all_passes ++ // may call worker_core from the main thread with a bad job number ++ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb1 = s->jobs + s->pass1_job; ++ sem_post(sem); ++ LOG_EXIT +} + -+// Call this to wait for all jobs to have completed at the end of a frame -+static void worker_wait(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ while( s->worker_head !=s->worker_tail) -+ { -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} + +// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +// available to receive the next job. +static void worker_pass0_ready(HEVCContext *s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ // tail is number of submitted jobs -+ // head is number of completed jobs -+ // tail-head is number of outstanding jobs in the queue -+ // we need to ensure there is at least 1 space left for us to use -+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) -+ { -+ // Wait until another job is completed -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); ++ LOG_ENTER ++ HEVCRpiJob * const jb = s->jb0; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; + } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_EXIT ++} ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++static void worker_wait(HEVCContext * const s) ++{ ++ LOG_ENTER ++ unsigned int i; ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob * const jb = s->jobs + i; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; ++ } ++ } ++ LOG_EXIT +} + +static void *worker_start(void *arg) +{ -+ HEVCContext *s = (HEVCContext *)arg; -+ while(1) { -+ pthread_mutex_lock(&s->worker_mutex); ++ HEVCContext * const s = (HEVCContext *)arg; + -+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) ++ for (;;) + { -+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); ++ HEVCRpiJob * const jb = s->jb1; ++ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR) ++ /* Loop */; ++ if (jb->terminate) ++ break; + -+ if (s->kill_worker) { -+ break; ++ LOG_ENTER ++ worker_core(s); ++ worker_complete_job(s); ++ LOG_EXIT + } -+ LOG_ENTER -+ worker_core(s); -+ -+ worker_complete_job(s); -+ LOG_EXIT -+ } -+ return NULL; ++ return NULL; +} + ++static void worker_pic_free_all(HEVCContext * const s) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if (cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++ } ++} ++ ++static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; 
i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++// av_assert0(cf->s[0].n == 0 && cf->s[0].buf == NULL); ++// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL); ++// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL); ++// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL); ++ ++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) ++ goto fail; ++ cf->s[2].buf = (int16_t *)cf->gptr.arm; ++ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ ++ // Must be 64 byte aligned for our zero apping code so over-allocate & ++ // round ++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) ++ goto fail; ++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); ++ } ++ return 0; ++ ++fail: ++ printf("%s: **** Failed\n", __func__); ++ worker_pic_free_all(s); ++ return -1; ++} ++ ++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++{ ++ unsigned int i; ++ for (i = 0; i != 4; ++i) { ++ cf->s[i].n = 0; ++ } ++} +#endif ++ + /** * NOTE: Each function hls_foo correspond to the function foo in the * specification (HLS stands for High Level Syntax). -@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -55,6 +393,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { +#ifdef RPI -+ int job; -+ for(job=0;jobcoeffs_buf_arm[job][0]) { -+ gpu_free(&s->coeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = 0; -+ } -+ if (s->coeffs_buf_arm[job][2]) { -+ gpu_free(&s->coeffs_buf_accelerated[job]); -+ s->coeffs_buf_arm[job][2] = 0; -+ } -+ } ++ worker_pic_free_all(s); +#endif ++ +#ifdef RPI_DEBLOCK_VPU + { + int i; @@ -3757,7 +7354,7 @@ index b478065..955e426 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -91,6 +446,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; @@ -3766,32 +7363,17 @@ index b478065..955e426 100644 + const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; + const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; + const int coefs_per_row = coefs_per_luma + coefs_per_chroma; -+ int job; + + av_assert0(sps); -+// s->max_ctu_count = sps->ctb_width; -+// printf("CTB with=%d\n", sps->ctb_width); -+// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); -+ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; -+ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; ++ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; ++#if RPI_ROUND_TO_LINES ++ // Round down to an integral quantity of lines ++ if (s->max_ctu_count > sps->ctb_width) ++ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width; ++#endif + -+ for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ 
s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. -+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } -+ } ++ if (worker_pic_alloc_all(s, coefs_per_row) != 0) ++ goto fail; +#endif +#ifdef RPI_DEBLOCK_VPU + { @@ -3847,7 +7429,7 @@ index b478065..955e426 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -137,6 +434,29 @@ fail: +@@ -137,6 +560,29 @@ fail: return AVERROR(ENOMEM); } @@ -3877,16 +7459,18 @@ index b478065..955e426 100644 static void pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, +@@ -337,8 +783,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) { #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) - enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; +- int ret, i; + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; - int ret, i; ++ int ret; pic_arrays_free(s); -@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + s->ps.sps = NULL; +@@ -356,6 +802,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm switch (sps->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: @@ -3899,7 +7483,20 @@ index b478065..955e426 100644 #if CONFIG_HEVC_DXVA2_HWACCEL *fmt++ = AV_PIX_FMT_DXVA2_VLD; #endif -@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -370,6 +822,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + #endif + break; + case AV_PIX_FMT_YUV420P10: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND64_10; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -386,6 +844,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ret = ff_thread_get_format(s->avctx, pix_fmts); if (ret < 0) goto fail; @@ -3907,22 +7504,56 @@ index b478065..955e426 100644 s->avctx->pix_fmt = ret; } else { -@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -395,26 +854,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ff_hevc_pred_init(&s->hpc, sps->bit_depth); + ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth); + ff_videodsp_init (&s->vdsp, sps->bit_depth); ++#ifdef RPI ++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); ++#endif + +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +- } ++ av_freep(&s->sao_pixel_buffer_h[0]); ++ av_freep(&s->sao_pixel_buffer_v[0]); + + if (sps->sao_enabled && !s->avctx->hwaccel) { +- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; +- int c_idx; ++ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 
3 : 1; ++ unsigned int c_idx; ++ size_t vsize[3] = {0}; ++ size_t hsize[3] = {0}; + for(c_idx = 0; c_idx < c_count; c_idx++) { int w = sps->width >> sps->hshift[c_idx]; int h = sps->height >> sps->vshift[c_idx]; -+ // ******** Very very nasty allocation kludge for plaited Chroma - s->sao_pixel_buffer_h[c_idx] = +- s->sao_pixel_buffer_h[c_idx] = - av_malloc((w * 2 * sps->ctb_height) << -+ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << - sps->pixel_shift); - s->sao_pixel_buffer_v[c_idx] = +- sps->pixel_shift); +- s->sao_pixel_buffer_v[c_idx] = - av_malloc((h * 2 * sps->ctb_width) << -+ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << - sps->pixel_shift); +- sps->pixel_shift); ++ // ctb height & width are a min of 8 so this must a multiple of 16 ++ // so no point rounding up! ++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; ++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; } ++ ++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] ++ // when we have plaited chroma ++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); ++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); ++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; ++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; ++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; ++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; } -@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s) + + s->ps.sps = sps; +@@ -680,6 +1149,11 @@ static int hls_slice_header(HEVCContext *s) (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { pred_weight_table(s, gb); } @@ -3934,20 +7565,25 @@ index b478065..955e426 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -@@ -931,6 +1264,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -937,6 +1411,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s) ++{ ++ return s->jb0->intra.cmds + s->jb0->intra.n++; ++} ++ +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ + // U & V done on U call in the case of sliced frames -+ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1) + return; + + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_INTRA; + cmd->size = log2_trafo_size; + cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; @@ -3956,7 +7592,7 @@ index b478065..955e426 100644 + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; + } -+ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) { + s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); + } + else { @@ -3969,7 +7605,7 @@ index b478065..955e426 100644 static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -949,8 +1456,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -3982,7 +7618,7 @@ index b478065..955e426 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1036,7 +1546,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -3994,7 +7630,7 @@ index b478065..955e426 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1065,7 +1579,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -4006,7 +7642,7 @@ index b478065..955e426 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1094,7 +1612,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -4018,7 +7654,7 @@ index b478065..955e426 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1104,7 +1626,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -4030,7 +7666,7 @@ index b478065..955e426 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1116,26 +1642,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -4077,7 +7713,7 @@ index b478065..955e426 100644 } } } -@@ -1275,47 +1675,120 @@ do { +@@ -1281,47 +1827,119 @@ do { return 0; } @@ -4112,13 +7748,13 @@ index b478065..955e426 100644 - s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); - if (s->ps.sps->chroma_format_idc) { - s->hevcdsp.put_pcm(dst1, stride1, -+#ifdef RPI -+ if 
(rpi_sliced_frame(s->frame)) { -+ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { ++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), + s->frame->linesize[0], + cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + -+ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), ++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), + s->frame->linesize[1], cb_size >> s->ps.sps->hshift[1], cb_size >> s->ps.sps->vshift[1], @@ -4157,10 +7793,9 @@ index b478065..955e426 100644 +#ifdef RPI +int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +{ -+ int16_t * const coeffs = (buf_no != 3) ? -+ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : -+ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; -+ s->num_coeffs[s->pass0_job][buf_no] += n; ++ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no; ++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); ++ cfe->n += n; + return coeffs; +} +#endif @@ -4205,7 +7840,7 @@ index b478065..955e426 100644 + + // Add command + { -+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_I_PCM; + cmd->size = log2_cb_size; + cmd->i_pcm.src = coeffs; @@ -4223,99 +7858,7 @@ index b478065..955e426 100644 /** * 8.5.3.2.2.1 Luma sample unidirectional interpolation process * -@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) - * @param luma_offset additive offset applied to the luma prediction value - */ - -+#if RPI_INTER -+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref, const Mv *mv, int x_off, int y_off, -+ int block_w, int block_h, int luma_weight, int luma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_UNI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref->data[0]; -+ cmd->srcstride = ref->linesize[0]; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = luma_weight; -+ cmd->offset = luma_offset; -+} -+ -+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, -+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, -+ const struct MvField * const current_mv) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_BI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[0]; -+ cmd->srcstride = ref0->linesize[0]; -+ cmd->mv = *mv0; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[0]; -+ cmd->srcstride1 = ref1->linesize[0]; -+ cmd->mv1 = *mv1; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, -+ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] 
+ s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_UNI; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = src0; -+ cmd->srcstride = srcstride; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = chroma_weight; -+ cmd->offset = chroma_offset; -+} -+ -+static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[cidx+1]; -+ cmd->srcstride = ref0->linesize[cidx+1]; -+ cmd->mv = current_mv->mv[0]; -+ cmd->mv1 = current_mv->mv[1]; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[cidx+1]; -+ cmd->srcstride1 = ref1->linesize[cidx+1]; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+#endif -+ - static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref, const Mv *mv, int x_off, int y_off, - int block_w, int block_h, int luma_weight, int luma_offset) -@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1353,6 +1971,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -4326,7 +7869,7 @@ index b478065..955e426 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1399,7 +2021,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -4335,7 +7878,7 @@ index b478065..955e426 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1423,6 +2045,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -4346,7 +7889,7 @@ index b478065..955e426 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1508,6 +2134,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -4357,7 +7900,7 @@ index b478065..955e426 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1572,6 +2202,10 @@ static 
void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -4368,7 +7911,125 @@ index b478065..955e426 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1645,13 +2279,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF + _mx1, _my1, block_w); + } + +-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref, +- const Mv *mv, int y0, int height) ++#ifdef RPI ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field) + { +- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { ++ HEVCContext *const fs = ref->tf.owner->priv_data; ++ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field; ++ sem_t * sem = NULL; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ if (((volatile int *)ref->tf.progress->data)[field] < val) { ++ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait; ++ ++ av_assert0(pwait->req == -1 && pwait->next == NULL); + +- if (s->threads_type == FF_THREAD_FRAME ) +- ff_thread_await_progress(&ref->tf, y, 0); ++ pwait->req = val; ++ pwait->next = NULL; ++ if (pstate->first == NULL) ++ pstate->first = pwait; ++ else ++ pstate->last->next = pwait; ++ pstate->last = pwait; ++ sem = &pwait->sem; ++ } ++ pthread_mutex_unlock(&pstate->lock); ++ ++ if (sem != NULL) { ++ while (sem_wait(sem) != 0) ++ av_assert0(errno == EINTR); ++ } ++ } ++} ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field) ++{ ++ HEVCRPiFrameProgressState *const pstate = s->progress_states + field; ++ ++ ((int *)s->ref->tf.progress->data)[field] = val; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ { ++ HEVCRPiFrameProgressWait ** ppwait = &pstate->first; ++ HEVCRPiFrameProgressWait * pwait; ++ ++ while ((pwait = *ppwait) != NULL) { ++ if (pwait->req > val) ++ { ++ ppwait = &pwait->next; ++ pstate->last = pwait; ++ } ++ else ++ { ++ *ppwait = pwait->next; ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_post(&pwait->sem); ++ } ++ } ++ } ++ pthread_mutex_unlock(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate) ++{ ++ pstate->first = NULL; ++ pstate->last = NULL; ++ pthread_mutex_init(&pstate->lock, NULL); ++} ++ ++static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_init(&pwait->sem, 0, 0); ++} ++ ++static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate) ++{ ++ av_assert0(pstate->first == NULL); ++ pthread_mutex_destroy(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ sem_destroy(&pwait->sem); ++} ++#endif ++ ++static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref, ++ const Mv * const mv, const int y0, const int height) ++{ ++ if (s->threads_type == FF_THREAD_FRAME) { ++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ ++#ifdef RPI ++ if (s->enable_rpi) { ++ int16_t *const pr = s->jb0->progress + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++ 
else ++#endif ++ // It is a const ThreadFrame but the prototype isn't ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); ++ } + } + + static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1699,14 +2432,542 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, } } @@ -4378,21 +8039,106 @@ index b478065..955e426 100644 + +#if RPI_INTER + -+static HEVCRpiLumaPred * -+rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val) ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) +{ -+ HEVCRpiLumaPred * yp = s->curr_pred_y; -+ HEVCRpiLumaPred * ypt = yp + 1; -+ for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) { ++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; ++ HEVCRpiInterPredQ * ypt = yp + 1; ++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { + if (ypt->load < yp->load) + yp = ypt; + } + -+// yp->load += load_val; -+ ++yp->load; ++ yp->load += load_val; ++ ipe->used_grp = 1; ++ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd ++ + return yp; +} + ++ ++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) ++{ ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr->data[-1] = q->code_sync; ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); ++ q->load = 0; ++ } ++} ++ ++// Returns 0 on success, -1 if Q is dangerously full ++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) ++{ ++ if (!ipe->used_grp) ++ return 0; ++ ++ if ((ipe->curr += ipe->n_grp) >= ipe->n) ++ { ++ ipe->curr = 0; ++ rpi_inter_pred_sync(ipe); ++ } ++ ipe->used = 1; ++ ipe->used_grp = 0; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; ++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ ipe->curr = 0; ++ ipe->used = 0; ++ ipe->used_grp = 0; ++ for (i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base; ++ q->load = 0; ++ q->last_l0 = NULL; ++ q->last_l1 = NULL; ++ } ++} ++ ++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n_max, const unsigned int n_grp, ++ const unsigned int total_size, const unsigned int min_gap) ++{ ++ memset(ipe, 0, sizeof(*ipe)); ++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); ++ ipe->n_grp = n_grp; ++ ipe->min_gap = min_gap; ++ ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(total_size, &ipe->gptr); ++#else ++ gpu_malloc_uncached(total_size, &ipe->gptr); ++#endif ++} ++ ++ ++#if RPI_QPU_EMU_Y ++#define get_mc_address_y(f) ((f)->data[0]) ++#else ++#define get_mc_address_y(f) get_vc_address_y(f) ++#endif ++#if RPI_QPU_EMU_C ++#define get_mc_address_u(f) ((f)->data[1]) ++#else ++#define get_mc_address_u(f) get_vc_address_u(f) ++#endif ++ ++static inline int offset_depth_adj(const HEVCContext *const s, const int wt) ++{ ++ return s->ps.sps->high_precision_offsets_enabled_flag ? 
wt : ++ wt << (s->ps.sps->bit_depth - 8); ++} ++ +static void +rpi_pred_y(HEVCContext *const s, const int x0, const int y0, + const int nPbW, const int nPbH, @@ -4401,116 +8147,155 @@ index b478065..955e426 100644 + const int weight_offset, + AVFrame *const src_frame) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); -+ -+// rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, -+// mv, x0, y0, nPbW, nPbH, -+// weight_mul, weight_offset); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); ++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + ++ if (my_mx == 0) + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const int x1_m3 = x0 + (mv->x >> 2) - 3; -+ const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); -+ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; -+ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int bh = nPbH; + -+ // Potentially we could change the assembly code to support taller sizes in one go -+ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16) ++ for (int start_x = 0; start_x < nPbW; start_x += 16) + { -+ const uint32_t src_yx_y = y1_m3 + start_y; -+ int start_x = 0; -+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; + -+#if 1 -+ // As Y-pred operates on two independant 8-wide src blocks we can merge -+ // this pred with the previous one if it the previous one is 8 pel wide, -+ // the same height as the current block, immediately to the left of our -+ // current dest block and mono-pred. 
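
[Illustrative note, not part of the patch] rpi_nxt_pred() above spreads inter-prediction commands across the group of QPU command queues by keeping a running cost per queue and picking the cheapest one before appending a command. A stand-alone C sketch of that selection, with invented names (pred_queue, pick_least_loaded) and assuming only that load is a per-queue running cost:

    #include <stddef.h>

    typedef struct pred_queue {
        unsigned int load;   /* accumulated cost of commands already queued */
        /* ... command-list pointers elided ... */
    } pred_queue;

    /* Return the least-loaded queue of the group and charge it for the new
     * work, mirroring the selection loop in rpi_nxt_pred(). */
    static pred_queue *pick_least_loaded(pred_queue *q, size_t n, unsigned int load_val)
    {
        pred_queue *best = &q[0];
        for (size_t i = 1; i < n; i++) {
            if (q[i].load < best->load)
                best = &q[i];
        }
        best->load += load_val;
        return best;
    }

In the patch itself the chosen queue additionally has the next-function link written into the previous command (yp->qpu_mc_curr->data[-1] = fn), which this sketch omits.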
-+ -+ qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p; -+ if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr) -+ { -+ const int bw = FFMIN(nPbW, 8); -+ qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx; -+ -+ last_y8_lx->next_src2_x = x1_m3; -+ last_y8_lx->next_src2_y = src_yx_y; -+ last_y8_lx->next_src2_base = src_vc_address_y; -+ last_y8_p->p.w += bw; -+ last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21); -+ last_y8_p->p.wo2 = wo; -+ -+ s->last_y8_p = NULL; -+ s->last_y8_lx = NULL; -+ start_x = bw; +#if RPI_TSTATS -+ ++s->tstats.y_pred1_y8_merge; -+#endif ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred1_x0y0; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; + } +#endif + -+ for (; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); -+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; -+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ const int x1_m3 = x0 + (mv->x >> 2) - 3; ++ const int y1_m3 = y0 + (mv->y >> 2) - 3; ++ const unsigned int bh = nPbH; ++ int start_x = 0; ++ ++#if 1 ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. 
++ ++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; ++ ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = y1_m3; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; ++ ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; ++ start_x = bw; +#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ if (mx == 0 && my == 0) -+ ++ts->y_pred1_x0y0; -+ else if (mx == 0) -+ ++ts->y_pred1_x0; -+ else if (my == 0) -+ ++ts->y_pred1_y0; -+ else -+ ++ts->y_pred1_xy; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } ++ ++s->tstats.y_pred1_y8_merge; ++#endif ++ } +#endif -+ cmd_y[-1].next_fn = s->qpu_filter; -+ cmd_lx->next_src1_x = x1_m3 + start_x; -+ cmd_lx->next_src1_y = src_yx_y; -+ cmd_lx->next_src1_base = src_vc_address_y; -+ if (bw <= 8) -+ { -+ cmd_lx->next_src2_x = MC_DUMMY_X; -+ cmd_lx->next_src2_y = MC_DUMMY_Y; -+ cmd_lx->next_src2_base = s->qpu_dummy_frame; -+ } -+ else -+ { -+ cmd_lx->next_src2_x = x1_m3 + start_x + 8; -+ cmd_lx->next_src2_y = src_yx_y; -+ cmd_lx->next_src2_base = src_vc_address_y; -+ } -+ cmd_y->p.w = bw; -+ cmd_y->p.h = bh; -+ cmd_y->p.mymx21 = my2_mx2_my_mx; -+ cmd_y->p.wo1 = wo; -+ cmd_y->p.wo2 = wo; -+ cmd_y->p.dst_addr = dst_addr + start_x; -+ yp->last_lx = cmd_y; -+ yp->qpu_mc_curr = cmd_y + 1; + -+ if (bw == 8) { -+ s->last_y8_lx = cmd_lx; -+ s->last_y8_p = cmd_y; -+ } ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = y1_m3; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++#if RPI_QPU_EMU_Y ++ src2->base = s->qpu_dummy_frame_emu; ++#else ++ src2->base = s->qpu_dummy_frame_qpu; ++#endif ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = y1_m3; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ ++ if (bw == 8) { ++ s->last_y8_l1 = src2; ++ s->last_y8_p = cmd_y; + } + } + } @@ -4524,168 +8309,180 @@ index b478065..955e426 100644 + AVFrame *const src_frame, + AVFrame *const src_frame2) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + -+// rpi_luma_mc_bi(s, s->frame->data[0] 
+ y_off, s->frame->linesize[0], src_frame, -+// mv, x0, y0, nPbW, nPbH, -+// src_frame2, mv2, mv_field); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wt_offset = ++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; ++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; ++ ++ if (my2_mx2_my_mx == 0) + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = mv2->x & 3; -+ const unsigned int my2 = mv2->y & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int x2 = x0 + (mv2->x >> 2); ++ const int y2 = y0 + (mv2->y >> 2); ++ const int bh = nPbH; ++ ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ // Filter requires a run-up of 3 + const int x1 = x0 + (mv->x >> 2) - 3; + const int y1 = y0 + (mv->y >> 2) - 3; + const int x2 = x0 + (mv2->x >> 2) - 3; + const int y2 = y0 + (mv2->y >> 2) - 3; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + -+ s->sh.luma_offset_l1[ref_idx1] + 1; -+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ const int bh = nPbH; + -+ uint32_t dst = get_vc_address_y(s->frame) + y_off; -+ const uint32_t src1_base = get_vc_address_y(src_frame); -+ const uint32_t src2_base = get_vc_address_y(src_frame2); -+ -+ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H) -+ { -+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); -+ -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ 
{ // B blocks work 8 at a time -+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); -+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; -+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } +#endif -+ cmd_y[-1].next_fn = s->qpu_filter_b; -+ cmd_lx->next_src1_x = x1 + start_x; -+ cmd_lx->next_src1_y = y1 + start_y; -+ cmd_lx->next_src1_base = src1_base; -+ cmd_lx->next_src2_x = x2 + start_x; -+ cmd_lx->next_src2_y = y2 + start_y; -+ cmd_lx->next_src2_base = src2_base; -+ cmd_y->p.w = FFMIN(nPbW - start_x, 8); -+ cmd_y->p.h = bh; -+ cmd_y->p.mymx21 = my2_mx2_my_mx; -+ cmd_y->p.wo1 = wo1; -+ cmd_y->p.wo2 = wo2; -+ cmd_y->p.dst_addr = dst + start_x; -+ yp->last_lx = cmd_y; -+ yp->qpu_mc_curr = cmd_y + 1; -+ } -+ dst += s->frame->linesize[0] * 16; ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); + } + } +} + -+ -+static HEVCRpiChromaPred * -+rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val) -+{ -+ HEVCRpiChromaPred * cp = s->curr_pred_c; -+ HEVCRpiChromaPred * cpt = cp + 1; -+ for (unsigned int i = 1; i != QPU_N_GRP_UV; ++i, ++cpt) { -+ if (cpt->load < cp->load) -+ cp = cpt; -+ } -+ // Actual use of load_val is noticably better but we haven't sorted Q length problems yet -+ ++cp->load; -+// cp->load += load_val; -+ return cp; -+} -+ ++// h/v shifts fixed at one as that is all the qasm copes with +static void -+rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, ++rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, + const Mv * const mv, + const int16_t * const c_weights, + const int16_t * const c_offsets, + AVFrame * const src_frame) +{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; + -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+#if 0 -+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ 
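The dst_addr + (start_x << xshl) arithmetic used by these commands addresses samples in the Raspberry Pi "sand" layout, where a frame is stored as fixed-width vertical stripes instead of one long row stride. The toy layout below is only meant to give an intuition for that kind of addressing; it is an assumption for illustration, not the real geometry, which comes from av_rpi_zc_frame_geometry(), av_rpi_sand_frame_off_y() and av_rpi_sand_frame_xshl() elsewhere in the patch.

    #include <stdio.h>

    /* Toy column-stripe ("sand"-like) layout, for intuition only.
     * Assumptions (not taken from the patch): stripe width 128 bytes,
     * one byte per sample, stripe height = padded frame height. */
    enum { STRIPE_W = 128, FRAME_H = 1088 };

    static unsigned int off_y(unsigned int x, unsigned int y)
    {
        unsigned int stripe = x / STRIPE_W;      /* which vertical stripe */
        unsigned int x_in   = x % STRIPE_W;      /* offset within the stripe row */
        return stripe * STRIPE_W * FRAME_H       /* whole stripes to the left */
             + y * STRIPE_W                      /* rows down within this stripe */
             + x_in;
    }

    int main(void)
    {
        /* Two horizontally adjacent pixels can be far apart in memory once they
         * straddle a stripe boundary, which is why the code above works in
         * stripe-friendly chunks and leaves the layout to the QPU/VPU. */
        printf("off(127,0)=%u off(128,0)=%u\n", off_y(127, 0), off_y(128, 0));
        return 0;
    }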
const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; + -+ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[0], c_offsets[0]); -+ -+ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[1], c_offsets[1]); -+#endif ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; -+ const uint32_t src_base_u = get_vc_address_u(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) -+ { -+ const int bh = FFMIN(nPbH_c-start_y, 16); -+ -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3); -+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; -+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ u[-1].next_fn = s->qpu_filter_uv; -+ last_l0->next_src_x = x1_c + start_x; -+ last_l0->next_src_y = y1_c + start_y; -+ last_l0->next_src_base_c = src_base_u; -+ u[0].p.h = bh; -+ u[0].p.w = bw; -+ u[0].p.coeffs_x = x_coeffs; -+ u[0].p.coeffs_y = y_coeffs; -+ u[0].p.wo_u = wo_u; -+ u[0].p.wo_v = wo_v; -+ u[0].p.dst_addr_c = dst_base_u + start_x * 2; -+ cp->last_l0 = u; -+ cp->qpu_mc_curr = u + 1; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ } ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); + } -+ return; ++ return; +} + ++// 
h/v shifts fixed at one as that is all the qasm copes with +static void +rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, @@ -4697,89 +8494,72 @@ index b478065..955e426 100644 + AVFrame * const src_frame, + AVFrame * const src_frame2) +{ -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+#if 0 -+ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0); ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; + -+ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); -+#endif ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ ++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); ++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); ++ ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; + -+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 
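A minimal sketch of the chroma filter-phase arithmetic used just above, assuming 4:2:0 (hshift = vshift = 1) as the surrounding code does. The real coefficient table rpi_filter_coefs[] is defined elsewhere in the patch, so a placeholder table stands in for it here.

    #include <stdint.h>
    #include <stdio.h>

    /* av_mod_uintp2(v, n) keeps the low n bits of v; re-implemented here so the
     * sketch is self-contained. */
    static inline unsigned int mod_uintp2(int v, unsigned int n)
    {
        return (unsigned int)v & ((1u << n) - 1);
    }

    static const uint32_t filter_coefs[8] = { 0 };  /* placeholder for rpi_filter_coefs[] */

    int main(void)
    {
        int hshift = 1;                  /* 4:2:0 */
        int mv_x = -13;                  /* chroma MV component in 1/8-pel units */

        unsigned int phase = mod_uintp2(mv_x, 2 + hshift) << (1 - hshift); /* 0..7 */
        int x1_c = 10 + (mv_x >> (2 + hshift)) - 1;  /* integer part, minus filter run-up */

        printf("phase=%u coeffs=%08x x1_c=%d\n",
               phase, (unsigned)filter_coefs[phase], x1_c);
        return 0;
    }

With hshift = 1 the chroma vector has three fractional bits, so the table index is simply the low three bits of the component, and the integer part is shifted down by three with an extra -1 for the filter run-up.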
<< (1 - vshift)]; // Fractional part of motion vector ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; + -+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); + -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ -+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { -+ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); -+ -+ // We are allowed 3/4 powers of two as well as powers of 2 -+ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); -+ -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3); -+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; -+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; -+ qpu_mc_pred_c_t * const last_l1 = cp->last_l1; -+ -+ u[-1].next_fn = s->qpu_filter_uv_b0; -+ last_l0->next_src_x = x1_c + start_x; -+ last_l0->next_src_y = y1_c + start_y; -+ last_l0->next_src_base_c = get_vc_address_u(src_frame); -+ -+ u[0].next_fn = 0; // Ignored - 2 block cmd -+ u[0].next_src_x = x2_c + start_x; -+ u[0].next_src_y = y2_c + start_y; -+ u[0].next_src_base_c = get_vc_address_u(src_frame2); -+ -+ u[0].b0.h = (bh<16 ? bh : 16); -+ u[0].b0.w = (bwnext_src_x = x2_c + start_x; -+ last_l1->next_src_y = y2_c + start_y; -+ last_l1->next_src_base_c = get_vc_address_u(src_frame2); -+ -+ u[1].b1.dummy0 = 0; // w,h inherited from b0 -+ u[1].b1.coeffs_x = coefs1_x; -+ u[1].b1.coeffs_y = coefs1_y; -+ u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); -+ u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); -+ u[1].b1.dst_addr_c = dst_base_u + start_x * 2; -+ -+ cp->last_l0 = u; -+ cp->last_l1 = u + 1; -+ cp->qpu_mc_curr = u + 2; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ } ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } +} ++ ++ +#endif + + @@ -4796,7 +8576,7 @@ index b478065..955e426 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1724,8 +2985,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -4806,7 +8586,7 @@ index b478065..955e426 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1769,12 +3029,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4828,7 +8608,7 @@ index b478065..955e426 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]], 
s->sh.chroma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + return; @@ -4837,7 +8617,7 @@ index b478065..955e426 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1788,12 +3065,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4859,7 +8639,7 @@ index b478065..955e426 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, + s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + return; @@ -4868,7 +8648,7 @@ index b478065..955e426 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); -@@ -1802,11 +2818,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1808,11 +3102,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4901,7 +8681,7 @@ index b478065..955e426 100644 chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -2087,7 +3401,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ret = hls_pcm_sample(s, x0, y0, log2_cb_size); if (s->ps.sps->pcm.loop_filter_disable_flag) @@ -4911,21 +8691,22 @@ index b478065..955e426 100644 if (ret < 0) return ret; -@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2310,6 +3626,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } +#ifdef RPI +static void rpi_execute_dblk_cmds(HEVCContext *s) +{ -+ int n; -+ int job = s->pass1_job; -+ int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ int (*p)[2] = s->dblk_cmds[job]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { -+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ HEVCRpiDeblkEnv *const de = &s->jb1->deblk; ++ unsigned int i; ++ ++ for (i = 0; i != de->n; ++i) ++ { ++ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size); + } -+ s->num_dblk_cmds[job] = 0; ++ de->n = 0; +} + +#if 0 @@ -4958,21 +8739,33 @@ index b478065..955e426 100644 +#endif + + ++#define RPI_OPT_SEP_PRED 0 ++ ++ +// I-pred, transform_and_add for all blocks types done here +// All ARM ++#if RPI_OPT_SEP_PRED ++static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) ++#else +static void rpi_execute_pred_cmds(HEVCContext * const s) ++#endif +{ + int i; -+ 
int job = s->pass1_job; -+ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; -+#ifdef RPI_WORKER ++ HEVCRpiIntraPredEnv * iap = &s->jb1->intra; ++ const HEVCPredCmd *cmd = iap->cmds; ++#ifdef RPI + HEVCLocalContextIntra *lc = &s->HEVClcIntra; +#else + HEVCLocalContext *lc = s->HEVClc; +#endif + -+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { ++ for(i = iap->n; i > 0; i--, cmd++) { +// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++#if RPI_OPT_SEP_PRED ++ if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) { ++ continue; ++ } ++#endif + + switch (cmd->type) + { @@ -4983,7 +8776,7 @@ index b478065..955e426 100644 + lc->na.cand_up_left = (cmd->na >> 2) & 1; + lc->na.cand_up = (cmd->na >> 1) & 1; + lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0) + s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); + else + s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); @@ -4991,16 +8784,25 @@ index b478065..955e426 100644 + + case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+#ifdef RPI_PRECLEAR -+ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache -+#endif + break; ++ case RPI_PRED_ADD_DC: ++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++#if RPI_HEVC_SAND + case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; ++ case RPI_PRED_ADD_RESIDUAL_C: ++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC_U: ++ case RPI_PRED_ADD_DC_V: ++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++#endif + + case RPI_PRED_I_PCM: + pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); @@ -5011,88 +8813,27 @@ index b478065..955e426 100644 + abort(); + } + } -+ s->num_pred_cmds[job] = 0; ++#if RPI_OPT_SEP_PRED ++ if (do_luma) ++#endif ++ { ++ iap->n = 0; ++ } +} + -+// Do any inter-pred that we want to do in software -+// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here -+// All ARM -+static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) -+{ -+ unsigned int cidx; -+ AVFrame myref; -+ AVFrame myref1; -+ struct MvField mymv; -+ -+ for(; n>0 ; n--, cmd++) { -+ av_assert0(0); -+ -+ switch(cmd->cmd) { -+ case RPI_CMD_LUMA_UNI: -+ if (b_only) -+ break; -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_LUMA_BI: -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ myref1.data[0] = cmd->src1; -+ myref1.linesize[0] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] 
= cmd->ref_idx[1]; -+ luma_mc_bi(s, cmd->dst, cmd->dststride, -+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, -+ &myref1, &cmd->mv1, &mymv); -+ break; -+ case RPI_CMD_CHROMA_UNI: -+ if (b_only) -+ break; -+ mymv.mv[0] = cmd->mv; -+ chroma_mc_uni(s, cmd->dst, -+ cmd->dststride, cmd->src, cmd->srcstride, 0, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_CHROMA_BI: -+ case RPI_CMD_CHROMA_BI+1: -+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI; -+ myref.data[cidx+1] = cmd->src; -+ myref.linesize[cidx+1] = cmd->srcstride; -+ myref1.data[cidx+1] = cmd->src1; -+ myref1.linesize[cidx+1] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ mymv.mv[0] = cmd->mv; -+ mymv.mv[1] = cmd->mv1; -+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); -+ break; -+ } -+ } -+} -+ -+static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) -+{ -+ const int job = s->pass1_job; -+ -+ if (!qpu_luma || luma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); -+ s->num_mv_cmds_y[job] = 0; -+ if (!qpu_chroma || chroma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); -+ s->num_mv_cmds_c[job] = 0; -+} + +#endif + +#ifdef RPI ++ +// Set initial uniform job values & zero ctu_count +static void rpi_begin(HEVCContext *s) +{ +#if RPI_INTER -+ int job = s->pass0_job; -+ int i; ++ unsigned int i; ++ HEVCRpiJob * const jb = s->jb0; ++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; ++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; + + const uint16_t pic_width_y = s->ps.sps->width; + const uint16_t pic_height_y = s->ps.sps->height; @@ -5100,73 +8841,60 @@ index b478065..955e426 100644 + const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; + const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; + -+ for(i=0; i < QPU_N_UV;i++) { -+ HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i; -+ qpu_mc_pred_c_t * u = cp->qpu_mc_base; ++ rpi_inter_pred_reset(cipe); ++ for (i = 0; i < cipe->n; i++) { ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; + -+ // Chroma setup is a double block with L0 fetch -+ // and other stuff in the 1st block and L1 fetch -+ // in the 2nd along with a lot of dummy vars -+ // This could be packed a lot tighter but it would make -+ // L0, L1 management a lot harder ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ u->wdenom = s->sh.chroma_log2_weight_denom; ++ cp->last_l0 = &u->next_src1; + + u->next_fn = 0; -+ u->next_src_x = 0; -+ u->next_src_y = 0; -+ u->next_src_base_c = 0; -+ u->s0.pic_cw = pic_width_c; -+ u->s0.pic_ch = pic_height_c; -+ u->s0.stride2 = rpi_sliced_frame_stride2(s->frame); -+ u->s0.stride1 = s->frame->linesize[1]; -+ u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6; -+ u->s0.dummy0 = 0; -+ cp->last_l0 = u; -+ ++u; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; + -+ u->next_fn = 0; -+ u->next_src_x = 0; -+ u->next_src_y = 0; -+ u->next_src_base_c = 0; -+ u->s1.dummy0 = 0; -+ u->s1.dummy1 = 0; -+ 
u->s1.dummy2 = 0; -+ u->s1.dummy3 = 0; -+ u->s1.dummy4 = 0; -+ u->s1.dummy5 = 0; -+ cp->last_l1 = u; -+ ++u; -+ -+ cp->load = 0; -+ cp->qpu_mc_curr = u; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } -+ s->curr_pred_c = NULL; + -+ for(i=0;i < QPU_N_Y;i++) { -+ HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i; -+ qpu_mc_pred_y_t * y = yp->qpu_mc_base; ++ rpi_inter_pred_reset(yipe); ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; + -+ y->next_src1_x = 0; -+ y->next_src1_y = 0; -+ y->next_src1_base = 0; -+ y->next_src2_x = 0; -+ y->next_src2_y = 0; -+ y->next_src2_base = 0; -+ y->s.pic_h = pic_height_y; -+ y->s.pic_w = pic_width_y; -+ y->s.stride2 = rpi_sliced_frame_stride2(s->frame); -+ y->s.stride1 = s->frame->linesize[0]; -+ y->s.wdenom = s->sh.luma_log2_weight_denom + 6; -+ y->s.dummy0 = 0; ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->wdenom = s->sh.luma_log2_weight_denom; + y->next_fn = 0; -+ yp->last_lx = y; -+ ++y; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; + -+ yp->load = 0; -+ yp->qpu_mc_curr = y; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); + } -+ s->curr_pred_y = NULL; ++ + s->last_y8_p = NULL; -+ s->last_y8_lx = NULL; ++ s->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ jb->progress[i] = -1; ++ } ++ +#endif + s->ctu_count = 0; +} @@ -5174,78 +8902,122 @@ index b478065..955e426 100644 + + +#if RPI_INTER -+static unsigned int mc_terminate_y(HEVCContext * const s, const int job) ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(HEVCContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; -+ const uint32_t exit_fn = qpu_fn(mc_exit); -+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12); -+ unsigned int tc = 0; -+ HEVCRpiJob * const jb = s->jobs + job; ++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; + -+ // Add final commands to Q -+ for(i = 0; i != QPU_N_Y; ++i) { -+ HEVCRpiLumaPred * const yp = jb->luma_mvs + i; -+ qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx; -+ -+ // We will always have had L0 if we have L1 so only test L0 -+ if (px != yp->qpu_mc_base) -+ tc = 1; -+ -+ yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ px->next_src1_x = MC_DUMMY_X; -+ px->next_src1_y = MC_DUMMY_Y; -+ px->next_src1_base = s->qpu_dummy_frame; -+ px->next_src2_x = MC_DUMMY_X; -+ px->next_src2_y = MC_DUMMY_Y; -+ px->next_src2_base = s->qpu_dummy_frame; -+ -+ yp->last_lx = NULL; ++ if (!ipe->used) { ++ return 0; + } + -+ return tc; ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; ++ ++ if (block_size > max_block) ++ max_block = block_size; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_qpu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_qpu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++#if RPI_CACHE_UNIF_MVS ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); ++#endif ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); ++ ++ return 1; +} ++#endif + -+#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c -+#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n) -+ -+static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_emu(HEVCContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; -+ const uint32_t exit_fn = qpu_fn(mc_exit_c); -+ const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV)); -+ unsigned int tc = 0; -+ HEVCRpiJob * const jb = s->jobs + job; -+ -+ // Add final commands to Q -+ for(i = 0; i != QPU_N_UV; ++i) { -+ HEVCRpiChromaPred * const cp = jb->chroma_mvs + i; -+ qpu_mc_pred_c_t *const p0 = cp->last_l0; -+ qpu_mc_pred_c_t *const p1 = cp->last_l1; -+ -+ // We will always have had L0 if we have L1 so only test L0 -+ if (p0 != cp->qpu_mc_base) -+ tc = 1; -+ -+ cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->next_src_x = MC_DUMMY_X; -+ p0->next_src_y = MC_DUMMY_Y; -+ p0->next_src_base_c = s->qpu_dummy_frame; -+ p1->next_src_x = MC_DUMMY_X; -+ p1->next_src_y = MC_DUMMY_Y; -+ p1->next_src_base_c = s->qpu_dummy_frame;; -+ -+ cp->last_l0 = NULL; -+ cp->last_l1 = NULL; ++ if (!ipe->used) { ++ return 0; + } + -+ return tc; ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 1; +} +#endif + ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif ++#endif ++ +#ifdef RPI + + @@ -5260,174 +9032,178 @@ index b478065..955e426 100644 +// Core execution tasks +static void worker_core(HEVCContext * const s) +{ -+ worker_global_env_t * const wg = &worker_global_env; -+ int arm_cost = 0; -+// vpu_qpu_wait_h sync_c; ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_wait_h sync_c; ++#endif + vpu_qpu_wait_h sync_y; -+ int qpu_luma = 0; -+ int qpu_chroma = 0; -+ int gpu_load; -+ int arm_load; -+ static const int arm_const_cost = 2; + -+// static int z = 0; -+ -+ const int job = s->pass1_job; -+ unsigned int flush_start = 0; -+ unsigned int flush_count = 0; ++ HEVCRpiJob * const jb = s->jb1; ++ int pred_y, pred_c; + + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + -+ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(), -+ vpu_get_constants(), -+ s->coeffs_buf_vc[job][2], -+ s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], -+ s->num_coeffs[job][3] >> 10, -+ 0); -+ -+ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ } -+ -+ -+#if RPI_INTER -+ pthread_mutex_lock(&wg->lock); -+ -+// ++z; -+ gpu_load = vpu_qpu_current_load(); -+ arm_load = avpriv_atomic_int_get(&wg->arm_load); -+#if 0 // Y_B_ONLY -+ qpu_luma = gpu_load + 2 < arm_load; -+ qpu_chroma = gpu_load < arm_load + 8; -+#elif 0 -+ qpu_luma = gpu_load < arm_load + 2; -+ qpu_chroma = gpu_load < arm_load + 8; -+#else -+ qpu_chroma = 1; -+ qpu_luma = 1; -+#endif -+ -+ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); -+ -+ wg->gpu_c += qpu_chroma; -+ wg->gpu_y += qpu_luma; -+ wg->arm_c += !qpu_chroma; -+ wg->arm_y += !qpu_luma; -+ -+ -+// if ((z & 511) == 0) { -+// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); -+// } -+ -+ + { -+ int (*d)[2] = s->dblk_cmds[job]; -+ unsigned int high=(*d)[1]; -+ int n; ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if (cf->s[3].n + cf->s[2].n != 0) ++ { ++ const 
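For reference, the mailbox built in mc_terminate_add_qpu() above is just one (uniform-stream address, shader entry point) pair per QPU. The sketch below shows that shape with dummy addresses; QPU_MAIL_EL_VALS is assumed to be 2, as its use as mail[i][0]/mail[i][1] suggests, and the addresses are invented rather than taken from ipe->gptr.vc / yp->code_setup.

    #include <stdint.h>
    #include <stdio.h>

    #define QPU_MAIL_EL_VALS 2   /* assumed: one address pair per QPU */

    int main(void)
    {
        enum { N_QPUS = 3 };
        uint32_t mail[N_QPUS][QPU_MAIL_EL_VALS];
        uint32_t uniforms_vc = 0x40001000;   /* dummy VideoCore bus address */
        uint32_t code_setup  = 0x40100000;   /* dummy shader entry point */

        for (unsigned int i = 0; i != N_QPUS; ++i) {
            mail[i][0] = uniforms_vc + i * 0x800;  /* each QPU gets its own uniform stream */
            mail[i][1] = code_setup;               /* all run the same setup shader */
        }

        for (unsigned int i = 0; i != N_QPUS; ++i)
            printf("qpu %u: uniforms=%08x code=%08x\n",
                   i, (unsigned)mail[i][0], (unsigned)mail[i][1]);
        return 0;
    }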
unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ vpu_get_constants(), ++ cf->gptr.vc, ++ cf->s[2].n >> 8, ++ cf->gptr.vc + offset32, ++ cf->s[3].n >> 10, ++ 0); + -+ flush_start = high; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { -+ unsigned int y = (*d)[1]; -+ flush_start = FFMIN(flush_start, y); -+ high=FFMAX(high,y); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); + } -+ // Avoid flushing past end of frame -+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start; + } + -+#if !DISABLE_CHROMA -+ if (qpu_chroma && mc_terminate_uv(s, job) != 0) -+ { -+ HEVCRpiJob * const jb = s->jobs + job; -+ const uint32_t code = qpu_fn(mc_setup_c); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { -+ *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1); -+ } -+#endif ++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); + +// We can take a sync here and try to locally overlap QPU processing with ARM +// but testing showed a slightly negative benefit with noticable extra complexity -+// vpu_qpu_job_add_sync_this(vqj, &sync_c); -+ -+ if (qpu_luma && mc_terminate_y(s, job) != 0) -+ { -+ HEVCRpiJob * const jb = s->jobs + job; -+ const uint32_t code = qpu_fn(mc_setup); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { -+ *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_job_add_sync_this(vqj, &sync_c); +#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0); -+ } + -+ pthread_mutex_unlock(&wg->lock); -+ -+#endif ++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip); + + vpu_qpu_job_add_sync_this(vqj, &sync_y); + ++ ++ // We are expecting a contiguous Z-shaped set of blocks ++ // So generate up to 3 blocks: ++ // 1st line ++ // body ++ // last line ++ // This will work even if we don't have the expected geometry ++ if (pred_y || pred_c) ++ { ++ const HEVCRpiDeblkEnv *const de = &jb->deblk; ++ const HEVCRpiDeblkBlk * db = de->blks + 0; ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ unsigned int x0 = db->x_ctb; ++ unsigned int xx = x0 + ctb_size; ++ unsigned int y0 = db->y_ctb; ++ ++ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}}; ++ unsigned int b = 0; ++ unsigned int i; ++ ++ for (i = 1, ++db; i < de->n; ++i, ++db) ++ { ++ if (db->x_ctb == xx && db->y_ctb 
== y0) { ++ xx += ctb_size; ++ } ++ else ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ if (tlbr[0] > y0) ++ tlbr[0] = y0; ++ if (tlbr[1] > x0) ++ tlbr[1] = x0; ++ if (tlbr[2] < y0 + ctb_size) ++ tlbr[2] = y0 + ctb_size; ++ if (tlbr[3] < xx) ++ tlbr[3] = xx; ++ x0 = db->x_ctb; ++ xx = x0 + ctb_size; ++ y0 = db->y_ctb; ++ b = 1; ++ } ++ } ++ ++ if (blks_tlbr[b][0] != ~0U) ++ ++b; ++ ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ tlbr[0] = y0; ++ tlbr[1] = x0; ++ tlbr[2] = y0 + ctb_size; ++ tlbr[3] = xx; ++ } ++ ++ // ??? Coalesce blocks ??? ++ for (i = 0; i <= b; ++i) { ++ const unsigned int * const tlbr = blks_tlbr[i]; ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, ++ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c); ++ } ++ } ++ ++ + // Having accumulated some commands - do them + rpi_cache_flush_finish(rfe); ++ ++ // Await progress as required ++ { ++ unsigned int i; ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ if (jb->progress[i] >= 0) { ++ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]); ++ } ++ } ++ } ++ + vpu_qpu_job_finish(vqj); + -+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //???? Surely we haven't done the smaller ++ worker_pic_reset(&jb->coeffs); + -+#if Y_B_ONLY -+ if (qpu_luma) -+ vpu_qpu_wait(&sync_y); ++ // If we have emulated VPU ops - do it here ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ if (av_rpi_is_sand8_frame(s->frame)) ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c8(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c8(s, NULL, &jb->chroma_ip); ++#endif ++ else ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c16(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c16(s, NULL, &jb->chroma_ip); ++#endif +#endif -+ // Perform inter prediction -+ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); + ++#if RPI_OPT_SEP_PRED + // Wait for transform completion ++ vpu_qpu_wait(&sync_c); + + // Perform intra prediction and residual reconstruction -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); -+#if Y_B_ONLY -+ if (!qpu_luma) -+ vpu_qpu_wait(&sync_y); -+#else ++ rpi_execute_pred_cmds(s, 0, 1); ++ ++ // Wait for transform completion + vpu_qpu_wait(&sync_y); -+#endif ++ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, 1, 0); ++#else ++ // Wait for transform completion ++ vpu_qpu_wait(&sync_y); ++ ++ // Perform intra prediction and residual reconstruction + rpi_execute_pred_cmds(s); ++#endif + + // Perform deblocking for CTBs in this row + rpi_execute_dblk_cmds(s); -+ -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +} + +static void rpi_do_all_passes(HEVCContext *s) +{ ++ // Called from main thread - must be no pending background jobs ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); ++ + // Do the various passes - common with the worker code + worker_core(s); + // Prepare next batch @@ -5435,99 +9211,90 @@ index b478065..955e426 100644 +} + + -+ +#endif + static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2319,6 +4153,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts 
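The deblock-block walk above reduces a raster-order run of CTBs to at most three flush rectangles (first partial row, body, last partial row). The self-contained sketch below reproduces that idea on made-up coordinates so the bounding-box logic can be followed outside the decoder; it is an illustration of the technique, not code from the patch.

    #include <stdio.h>

    typedef struct { unsigned int t, l, b, r; } rect_t;

    static void add_to(rect_t *rc, unsigned int x0, unsigned int y0,
                       unsigned int x1, unsigned int y1)
    {
        if (rc->t > y0) rc->t = y0;
        if (rc->l > x0) rc->l = x0;
        if (rc->b < y1) rc->b = y1;
        if (rc->r < x1) rc->r = x1;
    }

    int main(void)
    {
        enum { CTB = 64 };
        /* CTB top-left corners in raster order: end of row 0, all of row 1, start of row 2. */
        static const unsigned int xy[][2] = {
            {512, 0}, {576, 0},
            {0, 64}, {64, 64}, {128, 64}, {192, 64}, {256, 64}, {320, 64},
            {384, 64}, {448, 64}, {512, 64}, {576, 64},
            {0, 128}, {64, 128},
        };
        const unsigned int n = sizeof(xy) / sizeof(xy[0]);
        rect_t rc[3] = { {~0u, ~0u, 0, 0}, {~0u, ~0u, 0, 0}, {~0u, ~0u, 0, 0} };
        unsigned int run_x0 = xy[0][0], run_y = xy[0][1], run_x1 = run_x0 + CTB;
        unsigned int b = 0;

        for (unsigned int i = 1; i != n; ++i) {
            if (xy[i][1] == run_y && xy[i][0] == run_x1) {
                run_x1 += CTB;                 /* still the same contiguous row */
            } else {
                add_to(&rc[b], run_x0, run_y, run_x1, run_y + CTB);
                b = 1;                         /* everything after the 1st row goes in the body */
                run_x0 = xy[i][0];
                run_y  = xy[i][1];
                run_x1 = run_x0 + CTB;
            }
        }
        if (rc[b].t != ~0u)
            ++b;                               /* the final run gets its own rectangle */
        add_to(&rc[b], run_x0, run_y, run_x1, run_y + CTB);

        for (unsigned int i = 0; i <= b; ++i)
            printf("rect %u: x=%u..%u y=%u..%u\n", i, rc[i].l, rc[i].r, rc[i].t, rc[i].b);
        return 0;
    }

On the sample coordinates it prints one rectangle for the tail of the first row, one for the full middle row, and one for the start of the last row, matching the "up to 3 blocks" comment above.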
= s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 && -+ s->frame->format == AV_PIX_FMT_SAND128 && -+ !s->ps.pps->cross_component_prediction_enabled_flag; -+ -+ if (!s->enable_rpi) { -+ if (s->ps.pps->cross_component_prediction_enabled_flag) -+ printf("Cross component\n"); -+ } ++ // * We don't support cross_component_prediction_enabled_flag but as that ++ // must be 0 unless we have 4:4:4 there is no point testing for it as we ++ // only deal with sand which is never 4:4:4 ++ // [support wouldn't be hard] ++ s->enable_rpi = ++ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) || ++ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10)); +#endif + //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); + if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2332,8 +4177,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } -+#ifdef RPI_WORKER -+ s->pass0_job = 0; -+ s->pass1_job = 0; -+#endif +#ifdef RPI ++ // Worker must be idle at start ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); + rpi_begin(s); +#endif + while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { - int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; - hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); - -+ - ff_hevc_cabac_init(s, ctb_addr_ts); - - hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; +@@ -2348,6 +4199,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+#if RPI_INTER -+ s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % QPU_N_UV; -+ s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y; -+#endif -+ more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + +#ifdef RPI ++ // Report progress so we can use our MVs in other frames ++ // If we are tiled then this isn't really optimal but given that tiling ++ // can change on a per pic basis (described in PPS) other schemes are ++ // quite a lot harder ++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { ++ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1); ++ } ++ + if (s->enable_rpi) { -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]pass0_jobpass0_job>=0); -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; -+ s->ctu_count++; ++ int q_full = 
(++s->ctu_count >= s->max_ctu_count); + -+ if ( s->ctu_count >= s->max_ctu_count ) { -+#ifdef RPI_WORKER -+ if (s->used_for_ref) -+ { -+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); ++ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0) ++ q_full = 1; ++ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0) ++ q_full = 1; + -+// worker_wait(s); -+ // Split work load onto separate threads so we make as rapid progress as possible with this frame -+ // Pass on this job to worker thread -+ worker_submit_job(s); ++ s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb; ++ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb; + -+ // Make sure we have space to prepare the next job -+ worker_pass0_ready(s); ++ if (q_full) { ++ if (s->used_for_ref) ++ { ++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); + -+ // Prepare the next batch of commands -+ rpi_begin(s); -+ } else { -+ // Non-ref frame so do it all on this thread -+ rpi_do_all_passes(s); ++// worker_wait(s); ++ // Split work load onto separate threads so we make as rapid progress as possible with this frame ++ // Pass on this job to worker thread ++ worker_submit_job(s); ++ ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s); ++ ++ // Prepare the next batch of commands ++ rpi_begin(s); ++ } else { ++ // Non-ref frame so do it all on this thread ++ rpi_do_all_passes(s); ++ } + } -+#else -+ rpi_do_all_passes(s); -+#endif -+ } + + } +#endif @@ -5536,7 +9303,7 @@ index b478065..955e426 100644 if (more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2350,9 +3977,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2356,9 +4253,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -5549,12 +9316,10 @@ index b478065..955e426 100644 +#ifdef RPI + -+#ifdef RPI_WORKER + // Wait for the worker to finish all its jobs + if (s->enable_rpi) { + worker_wait(s); + } -+#endif + + // Finish off any half-completed rows + if (s->enable_rpi && s->ctu_count) { @@ -5579,7 +9344,7 @@ index b478065..955e426 100644 if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2393,6 +4321,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -5591,10 +9356,13 @@ index b478065..955e426 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2773,9 +4706,47 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; +- if (s->max_ra == INT_MAX) { +- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { +- s->max_ra = s->poc; + // The definition of _N unit types is "non-reference for other frames + // with the same temporal_id" so they may/will be ref frames for pics + // with a higher temporal_id. 
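worker_submit_job(), worker_pass0_ready() and worker_wait() are not shown in this hunk; the sketch below is only the generic two-semaphore hand-off that the sem_in/sem_out pair in HEVCRpiJob suggests, with invented names and a trivial payload, not the patch's actual implementation.

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    #define MAX_JOBS 2

    typedef struct {
        sem_t sem_in;    /* posted by the producer when the job is ready */
        sem_t sem_out;   /* posted by the worker when the job slot is free again */
        int payload;
    } job_t;

    static job_t jobs[MAX_JOBS];

    static void *worker(void *arg)
    {
        (void)arg;
        for (unsigned int i = 0;; i = (i + 1) % MAX_JOBS) {
            sem_wait(&jobs[i].sem_in);
            if (jobs[i].payload < 0)            /* terminate marker */
                return NULL;
            printf("worker: job %u payload %d\n", i, jobs[i].payload);
            sem_post(&jobs[i].sem_out);
        }
    }

    int main(void)
    {
        pthread_t th;
        for (unsigned int i = 0; i != MAX_JOBS; ++i) {
            sem_init(&jobs[i].sem_in, 0, 0);
            sem_init(&jobs[i].sem_out, 0, 1);   /* all job slots start free */
        }
        pthread_create(&th, NULL, worker, NULL);

        for (int n = 0; n != 6; ++n) {
            unsigned int i = n % MAX_JOBS;
            sem_wait(&jobs[i].sem_out);         /* "pass0 ready": wait for a free slot */
            jobs[i].payload = n;
            sem_post(&jobs[i].sem_in);          /* "submit job" to the worker */
        }

        for (unsigned int i = 0; i != MAX_JOBS; ++i)
            sem_wait(&jobs[i].sem_out);         /* "worker wait": drain outstanding jobs */
        jobs[0].payload = -1;
        sem_post(&jobs[0].sem_in);              /* wake the worker so it can exit */
        pthread_join(th, NULL);
        return 0;
    }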
@@ -5621,47 +9389,95 @@ index b478065..955e426 100644 + s->is_decoded = 0; + break; + } - if (s->max_ra == INT_MAX) { - if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { - s->max_ra = s->poc; -@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) ++ ++ if (s->sh.first_slice_in_pic_flag) { ++ if (s->max_ra == INT_MAX) { ++ if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { ++ s->max_ra = s->poc; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++ } ++ } ++ ++ if ((s->nal_unit_type == NAL_RASL_R || s->nal_unit_type == NAL_RASL_N) && ++ s->poc <= s->max_ra) { ++ s->is_decoded = 0; ++ break; + } else { + if (IS_IDR(s)) + s->max_ra = INT_MIN; +@@ -2896,10 +4867,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) } } -fail: - if (s->ref && s->threads_type == FF_THREAD_FRAME) -+fail: // Also success path -+ if (s->ref && s->threads_type == FF_THREAD_FRAME) { -+#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); -+#endif - ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } -+#if RPI_INTER -+ else if (s->ref && s->enable_rpi) { -+ // When running single threaded we need to flush the whole frame -+ flush_frame(s,s->frame); -+ } ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { ++#ifdef RPI ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +#endif ++ ff_hevc_progress_signal_all_done(s); ++ } ++#ifdef RPI ++ // * Flush frame will become confused if we pass it something ++ // that doesn't have an expected number of planes (e.g. 400) ++ // So only flush if we are sure we can. ++ else if (s->enable_rpi) { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++#endif ++ } return ret; } -@@ -3064,6 +4764,41 @@ fail: +@@ -3070,6 +5056,83 @@ fail: return AVERROR(ENOMEM); } -+#ifdef RPI_WORKER -+static av_cold void hevc_init_worker(HEVCContext *s) ++#ifdef RPI ++static av_cold void hevc_init_worker(HEVCContext * const s) +{ + int err; -+ pthread_cond_init(&s->worker_cond_head, NULL); -+ pthread_cond_init(&s->worker_cond_tail, NULL); -+ pthread_mutex_init(&s->worker_mutex, NULL); + -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; ++ memset(s->jobs, 0, sizeof(s->jobs)); ++ ++ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) { ++ HEVCRpiJob * const jb = s->jobs + job; ++ ++ sem_init(&jb->sem_in, 0, 0); ++ sem_init(&jb->sem_out, 0, 0); ++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); ++ ++ jb->intra.n = 0; ++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); ++ ++ // ** Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too small) ++ ++ rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), ++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); ++ rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), ++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); ++ ++ jb->deblk.n = 0; ++ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS); ++ } ++ s->pass0_job = 0; ++ s->pass1_job = 0; ++ s->jb0 = s->jobs + 0; ++ s->jb1 = s->jobs + 0; ++ + err = pthread_create(&s->worker_thread, NULL, worker_start, s); + if (err) { + printf("Failed to create worker thread\n"); 
@@ -5669,83 +9485,74 @@ index b478065..955e426 100644 + } +} + ++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) ++{ ++ av_freep(&ipe->q); ++ gpu_free(&ipe->gptr); ++} ++ +static av_cold void hevc_exit_worker(HEVCContext *s) +{ + void *res; -+ s->kill_worker=1; -+ pthread_cond_broadcast(&s->worker_cond_tail); ++ unsigned int i; ++ ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ s->jobs[i].terminate = 1; ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ sem_post(&s->jobs[i].sem_in); + pthread_join(s->worker_thread, &res); + -+ pthread_cond_destroy(&s->worker_cond_head); -+ pthread_cond_destroy(&s->worker_cond_tail); -+ pthread_mutex_destroy(&s->worker_mutex); ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiJob * const jb = s->jobs + i; + -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; ++ sem_destroy(&jb->sem_in); ++ sem_destroy(&jb->sem_out); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ av_freep(&jb->intra.cmds); ++ av_freep(&jb->deblk.blks); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++ } +} ++ +#endif + static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; -@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3081,10 +5144,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +#ifdef RPI + -+#ifdef RPI_WORKER + hevc_exit_worker(s); -+#endif -+ -+ for(i=0;iunif_mv_cmds_y[i]); -+ av_freep(&s->unif_mv_cmds_c[i]); -+ av_freep(&s->univ_pred_cmds[i]); -+ -+#if RPI_INTER -+ gpu_free(&s->jobs[i].chroma_mvs_gptr); -+ gpu_free(&s->jobs[i].luma_mvs_gptr); -+#endif -+ } -+ + vpu_qpu_term(); ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); + } + + av_rpi_zc_uninit(avctx); +#endif + - for (i = 0; i < 3; i++) { - av_freep(&s->sao_pixel_buffer_h[i]); - av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] ++ av_freep(&s->sao_pixel_buffer_v[0]); + av_frame_free(&s->output_frame); + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { +@@ -3122,6 +5194,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } -+#ifdef RPI -+#ifdef RPI_PRECLEAR -+static av_cold void memclear16(int16_t *p, int n) -+{ -+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1); -+ //int i; -+ //for(i=0;ipriv_data; - int i; -+#ifdef RPI -+ unsigned int job; -+#endif - - s->avctx = avctx; - -@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3135,6 +5208,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; @@ -5759,71 +9566,39 @@ index b478065..955e426 100644 + if (vpu_qpu_init() != 0) + goto fail; + -+ for(job = 0; job < RPI_MAX_JOBS; job++) { -+ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); -+ if (!s->unif_mv_cmds_y[job]) -+ goto fail; -+ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); -+ if (!s->unif_mv_cmds_c[job]) -+ goto fail; -+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); -+ if (!s->univ_pred_cmds[job]) -+ goto fail; -+ } -+ +#if RPI_INTER -+ // We divide the image into blocks 256 wide and 64 high -+ // We support up to 2048 widths -+ // We compute the number of chroma 
motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted -+ // Also add space for the startup command for each stream. -+ -+ for (job = 0; job < RPI_MAX_JOBS; job++) { -+ HEVCRpiJob * const jb = s->jobs + job; -+#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); -+ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); -+#else -+ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); -+ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); -+#endif -+ -+ { -+ qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm; -+ for(i = 0; i < QPU_N_UV; i++) { -+ jb->chroma_mvs[i].qpu_mc_base = p; -+ jb->chroma_mvs[i].qpu_mc_curr = p; -+ p += UV_COMMANDS_PER_QPU; -+ } -+ } -+ { -+ qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm; -+ for(i = 0; i < QPU_N_Y; i++) { -+ jb->luma_mvs[i].qpu_mc_base = p; -+ jb->luma_mvs[i].qpu_mc_curr = p; -+ p += Y_COMMANDS_PER_QPU; -+ } -+ } ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ { ++ static const uint32_t dframe[1] = {0x80808080}; ++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; + } -+ s->qpu_filter_uv = qpu_fn(mc_filter_uv); -+ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); -+ s->qpu_dummy_frame = qpu_fn(mc_setup_c); // Use our code as a dummy frame -+ s->qpu_filter = qpu_fn(mc_filter); -+ s->qpu_filter_b = qpu_fn(mc_filter_b); ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame ++#endif +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); + + s->enable_rpi = 0; + -+#ifdef RPI_WORKER ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_init_state(s->progress_states + i); ++ } + hevc_init_worker(s); +#endif -+ -+#endif + s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; -@@ -3343,9 +5187,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) +@@ -3148,6 +5252,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) + if (!s->DPB[i].frame) + goto fail; + s->DPB[i].tf.f = s->DPB[i].frame; ++ s->DPB[i].dpb_no = i; + } + + s->max_ra = INT_MAX; +@@ -3349,9 +5454,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) } if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) @@ -5836,7 +9611,7 @@ index b478065..955e426 100644 return 0; } -@@ -3404,6 +5248,8 @@ AVCodec ff_hevc_decoder = { +@@ -3410,6 +5515,8 @@ AVCodec ff_hevc_decoder = { .update_thread_context = hevc_update_thread_context, .init_thread_copy = hevc_init_thread_copy, .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | @@ -5846,88 +9621,63 @@ index b478065..955e426 100644 .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), }; diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index be91010..dd7d152 100644 +index 162ca0e582..d647232638 100644 --- a/libavcodec/hevc.h +++ b/libavcodec/hevc.h -@@ -23,6 +23,9 @@ +@@ -23,6 +23,7 @@ #ifndef AVCODEC_HEVC_H #define AVCODEC_HEVC_H -+// define RPI to split the CABAC/prediction/transform into separate stages -+#include "config.h" -+ ++#include "rpi_opts.h" #include "libavutil/buffer.h" #include "libavutil/md5.h" -@@ -37,6 +40,45 @@ +@@ -37,6 +38,10 @@ #include "thread.h" #include "videodsp.h" -+// define RPI to split the CABAC/prediction/transform into separate stages -+#ifndef RPI -+ -+ #define RPI_INTER 0 -+ #define RPI_TSTATS 0 -+ #define RPI_HEVC_SAND 0 
-+ -+#else -+ -+ #include "rpi_qpu.h" -+ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU -+ -+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks -+ #define RPI_WORKER -+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames -+ // This has no effect unless RPI_WORKER is defined -+ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as -+ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one -+ // free for the foreground to fill in. -+ #define RPI_MAX_JOBS 2 -+ -+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs -+ // As it stands there is something mildy broken in VPU deblock - looks mostly OK -+ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) -+ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM -+// #define RPI_DEBLOCK_VPU -+ -+ #define RPI_VPU_DEBLOCK_CACHED 1 -+ -+ #if HAVE_NEON -+ #define RPI_HEVC_SAND 1 -+ #else -+ // Sand bust on Pi1 currently - reasons unknown -+ #define RPI_HEVC_SAND 0 -+ #endif -+ -+ #define RPI_TSTATS 0 ++#ifdef RPI ++#include "rpi_qpu.h" +#endif + #define MAX_DPB_SIZE 16 // A.4.1 #define MAX_REFS 16 -@@ -660,17 +702,6 @@ typedef struct CodingUnit { +@@ -463,6 +468,7 @@ typedef struct HEVCSPS { + int implicit_rdpcm_enabled_flag; + int explicit_rdpcm_enabled_flag; + int intra_smoothing_disabled_flag; ++ int high_precision_offsets_enabled_flag; + int persistent_rice_adaptation_enabled_flag; + + ///< coded frame dimension in various units +@@ -660,6 +666,7 @@ typedef struct CodingUnit { uint8_t cu_transquant_bypass_flag; } CodingUnit; --typedef struct Mv { -- int16_t x; ///< horizontal component of motion vector -- int16_t y; ///< vertical component of motion vector --} Mv; -- --typedef struct MvField { -- DECLARE_ALIGNED(4, Mv, mv)[2]; -- int8_t ref_idx[2]; -- int8_t pred_flag; --} MvField; -- ++#if 0 + typedef struct Mv { + int16_t x; ///< horizontal component of motion vector + int16_t y; ///< vertical component of motion vector +@@ -670,6 +677,7 @@ typedef struct MvField { + int8_t ref_idx[2]; + int8_t pred_flag; + } MvField; ++#endif + typedef struct NeighbourAvailable { int cand_bottom_left; - int cand_left; -@@ -747,7 +778,17 @@ typedef struct HEVCFrame { +@@ -745,9 +753,23 @@ typedef struct HEVCFrame { + * A combination of HEVC_FRAME_FLAG_* + */ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; } HEVCFrame; -+#ifdef RPI_WORKER ++#ifdef RPI +typedef struct HEVCLocalContextIntra { + TransformUnit tu; + NeighbourAvailable na; @@ -5935,21 +9685,22 @@ index be91010..dd7d152 100644 +#endif + typedef struct HEVCLocalContext { -+ TransformUnit tu; -+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra ++ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!) 
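/*
 * Illustrative sketch only - not part of the upstream patch. The comment on
 * the line above (and the WARNING comment being removed) rely on the
 * common-initial-sequence idiom: because HEVCLocalContextIntra and
 * HEVCLocalContext begin with the same two members (tu, na), a pointer to the
 * full context can be handed to code that only needs the intra fields. The
 * reduced, hypothetical types below show the idea; it stays valid only while
 * tu and na remain the leading members of both structs.
 */
#include <stdio.h>

typedef struct { int x; } tu_t;   /* stands in for TransformUnit      */
typedef struct { int l; } na_t;   /* stands in for NeighbourAvailable */

typedef struct {                  /* the "Intra" prefix struct        */
    tu_t tu;
    na_t na;
} lc_intra_t;

typedef struct {                  /* the full local context           */
    tu_t tu;                      /* must stay first ...              */
    na_t na;                      /* ... and second, to match above   */
    int other_state[64];
} lc_t;

static void intra_only(const lc_intra_t *ic)
{
    printf("tu=%d na=%d\n", ic->tu.x, ic->na.l);
}

int main(void)
{
    lc_t full = { { 1 }, { 2 }, { 0 } };
    intra_only((const lc_intra_t *)&full);   /* safe only because of the layout */
    return 0;
}
/* end of illustrative sketch */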
++ NeighbourAvailable na; + uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t stat_coeff[4]; -@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext { +@@ -762,8 +784,6 @@ typedef struct HEVCLocalContext { int qPy_pred; - TransformUnit tu; - +- uint8_t ctb_left_flag; uint8_t ctb_up_flag; -@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext { + uint8_t ctb_up_right_flag; +@@ -779,7 +799,6 @@ typedef struct HEVCLocalContext { int ct_depth; CodingUnit cu; PredictionUnit pu; @@ -5957,20 +9708,18 @@ index be91010..dd7d152 100644 #define BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext { +@@ -790,6 +809,207 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; -+ +#ifdef RPI + +// The processing is done in chunks -+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) -+// This is a distance of 1536 pixels across the screen +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +// but allocate more memory and increase the latency before data in the next frame can be processed +#define RPI_NUM_CHUNKS 4 +#define RPI_CHUNK_SIZE 12 ++#define RPI_ROUND_TO_LINES 0 + +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) @@ -5989,9 +9738,6 @@ index be91010..dd7d152 100644 +#define RPI_CMD_CHROMA_BI 3 +#define RPI_CMD_V_BI 4 + -+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed? -+// #define RPI_PRECLEAR -+ +// Command for inter prediction +typedef struct HEVCMvCmd { + uint8_t cmd; @@ -6019,6 +9765,10 @@ index be91010..dd7d152 100644 + RPI_PRED_ADD_RESIDUAL, + RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, + RPI_PRED_INTRA, + RPI_PRED_I_PCM, + RPI_PRED_CMD_MAX @@ -6033,8 +9783,14 @@ index be91010..dd7d152 100644 + struct { // TRANSFORM_ADD + uint8_t * dst; + const int16_t * buf; -+ uint32_t stride; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; + } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; + struct { // INTRA + uint16_t x; + uint16_t y; @@ -6052,32 +9808,87 @@ index be91010..dd7d152 100644 +#endif + +#ifdef RPI ++#include + -+struct qpu_mc_pred_c_s; -+struct qpu_mc_pred_y_s; ++union qpu_mc_pred_cmd_s; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; + -+typedef struct HEVCRpiLumaPred ++typedef struct HEVCRpiInterPredQ +{ -+ struct qpu_mc_pred_y_s *qpu_mc_base; -+ struct qpu_mc_pred_y_s *qpu_mc_curr; -+ struct qpu_mc_pred_y_s *last_lx; ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; + unsigned int load; -+} HEVCRpiLumaPred; ++ uint32_t code_setup; ++ uint32_t code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; + -+typedef struct HEVCRpiChromaPred ++typedef struct HEVCRpiInterPredEnv +{ -+ struct qpu_mc_pred_c_s *qpu_mc_base; -+ struct qpu_mc_pred_c_s *qpu_mc_curr; -+ struct qpu_mc_pred_c_s *last_l0; -+ struct qpu_mc_pred_c_s *last_l1; -+ unsigned int load; -+} HEVCRpiChromaPred; ++ HEVCRpiInterPredQ * q; ++ unsigned int n; // Number of Qs ++ unsigned int n_grp; // Number of Q in a group ++ unsigned int curr; // Current Q number (0..n-1) ++ int used; 
// 0 if nothing in any Q, 1 otherwise ++ int used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ unsigned int min_gap; ++ GPU_MEM_PTR_T gptr; ++} HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCeoffEnv { ++ unsigned int n; ++ uint16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCeoffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiDeblkBlk { ++ uint16_t x_ctb; ++ uint16_t y_ctb; ++} HEVCRpiDeblkBlk; ++ ++typedef struct HEVCRpiDeblkEnv { ++ unsigned int n; ++ HEVCRpiDeblkBlk * blks; ++} HEVCRpiDeblkEnv; ++ ++typedef struct HEVCRPiFrameProgressWait { ++ int req; ++ struct HEVCRPiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRPiFrameProgressWait; ++ ++typedef struct HEVCRPiFrameProgressState { ++ struct HEVCRPiFrameProgressWait * first; ++ struct HEVCRPiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRPiFrameProgressState; + +typedef struct HEVCRpiJob { -+ GPU_MEM_PTR_T chroma_mvs_gptr; -+ GPU_MEM_PTR_T luma_mvs_gptr; -+ HEVCRpiChromaPred chroma_mvs[QPU_N_UV]; -+ HEVCRpiLumaPred luma_mvs[QPU_N_Y]; ++ volatile int terminate; ++ int pending; ++ sem_t sem_in; // set by main ++ sem_t sem_out; // set by worker ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++ int16_t progress[32]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiDeblkEnv deblk; ++ HEVCRPiFrameProgressWait progress_wait; +} HEVCRpiJob; + +#if RPI_TSTATS @@ -6105,78 +9916,42 @@ index be91010..dd7d152 100644 typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -798,13 +978,103 @@ typedef struct HEVCContext { - - HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; - HEVCLocalContext *HEVClc; -- -+#ifdef RPI_WORKER -+ HEVCLocalContextIntra HEVClcIntra; -+#endif - uint8_t threads_type; - uint8_t threads_number; - +@@ -805,6 +1025,69 @@ typedef struct HEVCContext { int width; int height; -+ int used_for_ref; -+ ++ int used_for_ref; // rpi +#ifdef RPI + int enable_rpi; -+ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; -+ HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS]; -+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; -+ int buf_width; -+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; -+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; -+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; -+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; -+ int num_coeffs[RPI_MAX_JOBS][4]; -+ int num_xfm_cmds[RPI_MAX_JOBS]; -+ int num_mv_cmds_y[RPI_MAX_JOBS]; -+ int num_mv_cmds_c[RPI_MAX_JOBS]; -+ int num_pred_cmds[RPI_MAX_JOBS]; -+ int num_dblk_cmds[RPI_MAX_JOBS]; -+ int vpu_id; -+ int pass0_job; // Pass0 does coefficient decode -+ int pass1_job; // Pass1 does pixel processing ++ unsigned int pass0_job; // Pass0 does coefficient decode ++ unsigned int pass1_job; // Pass1 does pixel processing + int ctu_count; // Number of CTUs done in pass0 so far + int max_ctu_count; // Number of CTUs when we trigger a round of processing -+ int ctu_per_y_chan; // Number of CTUs per luma QPU -+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU + ++ HEVCRpiJob * jb0; ++ HEVCRpiJob * jb1; + HEVCRpiJob jobs[RPI_MAX_JOBS]; +#if RPI_TSTATS + HEVCRpiStats tstats; +#endif +#if RPI_INTER -+ HEVCRpiChromaPred * curr_pred_c; -+ HEVCRpiLumaPred * curr_pred_y; -+ struct qpu_mc_pred_y_s * last_y8_p; -+ struct qpu_mc_pred_y_s * last_y8_lx; ++ 
struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; + + // Function pointers -+ uint32_t qpu_filter_uv; -+ uint32_t qpu_filter_uv_b0; -+ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory -+ uint32_t qpu_filter; -+ uint32_t qpu_filter_b; ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory ++#endif ++ HEVCRpiQpu qpu; +#endif + -+#ifdef RPI_WORKER + pthread_t worker_thread; -+ pthread_cond_t worker_cond_head; -+ pthread_cond_t worker_cond_tail; -+ pthread_mutex_t worker_mutex; -+ -+ int worker_tail; // Contains the number of posted jobs -+ int worker_head; // Contains the number of completed jobs -+ int kill_worker; // set to 1 to terminate the worker -+#endif -+ -+#define RPI_DEBLOCK_VPU_Q_COUNT 2 + +#ifdef RPI_DEBLOCK_VPU ++#define RPI_DEBLOCK_VPU_Q_COUNT 2 + int enable_rpi_deblock; + + int uv_setup_width; @@ -6204,34 +9979,25 @@ index be91010..dd7d152 100644 + unsigned int dvq_n; + +#endif -+ ++ HEVCLocalContextIntra HEVClcIntra; ++ HEVCRPiFrameProgressState progress_states[2]; +#endif + uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -922,6 +1192,9 @@ typedef struct HEVCContext { - uint32_t max_mastering_luminance; - uint32_t min_mastering_luminance; - -+#ifdef RPI -+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; -+#endif - } HEVCContext; - - int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, -@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - int log2_trafo_size, enum ScanType scan_idx, - int c_idx); +@@ -1053,6 +1336,10 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); + int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id, + uint8_t *buf, int buf_size); +#if RPI_INTER +extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +#endif + - void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); - -@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; + /** + * Reset SEI values that are stored on the Context. 
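/*
 * Illustrative sketch only - not part of the upstream patch. The progress
 * helpers declared just below (ff_hevc_progress_wait_* / _signal_*) all boil
 * down to the same primitive: a producing thread publishes "rows done so far"
 * for a frame and consumers sleep until the row they need is ready, with
 * INT_MAX meaning "whole frame done". The hypothetical mutex/condvar version
 * here shows only that generic primitive; the RPI path in the patch uses the
 * HEVCRPiFrameProgressState waiter list and per-job semaphores instead.
 */
#include <pthread.h>

typedef struct row_progress {
    int row;                         /* highest row reported done, -1 initially */
    pthread_mutex_t lock;
    pthread_cond_t cond;
} row_progress_t;

static void progress_init(row_progress_t *const p)
{
    p->row = -1;
    pthread_mutex_init(&p->lock, NULL);
    pthread_cond_init(&p->cond, NULL);
}

static void progress_report(row_progress_t *const p, const int row)
{
    pthread_mutex_lock(&p->lock);
    if (row > p->row) {              /* progress only moves forward */
        p->row = row;
        pthread_cond_broadcast(&p->cond);
    }
    pthread_mutex_unlock(&p->lock);
}

static void progress_await(row_progress_t *const p, const int row)
{
    pthread_mutex_lock(&p->lock);
    while (p->row < row)             /* e.g. wait for reference rows before MC */
        pthread_cond_wait(&p->cond, &p->lock);
    pthread_mutex_unlock(&p->lock);
}
/* progress_report(p, INT_MAX) then plays the role of "signal all done".
 * end of illustrative sketch */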
+@@ -1072,4 +1359,89 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; extern const uint8_t ff_hevc_diag_scan8x8_x[64]; extern const uint8_t ff_hevc_diag_scan8x8_y[64]; @@ -6244,11 +10010,85 @@ index be91010..dd7d152 100644 +extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +#endif + ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y) ++{ ++ if (s->enable_rpi && s->used_for_ref) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y) ++{ ++ if (s->used_for_ref) ++ { ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ else ++ ff_thread_report_progress(&s->ref->tf, y, 0); ++ } ++} ++ ++static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s) ++{ ++ if (s->enable_rpi) ++ { ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++ } ++ else ++ ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); ++} ++ ++#else ++ ++// Use #define as that allows us to discard "jb" which won't exist in non-RPI world ++#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_signal_mv(s, y) ++#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0) ++#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0) ++ +#endif ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int *)&ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} + #endif /* AVCODEC_HEVC_H */ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index 05b2821..733efde 100644 +index 05b2821840..c84886817d 100644 --- a/libavcodec/hevc_cabac.c +++ b/libavcodec/hevc_cabac.c @@ -21,14 +21,76 @@ @@ -6260,12 +10100,11 @@ index 05b2821..733efde 100644 #include "libavutil/attributes.h" #include "libavutil/common.h" --#include "cabac_functions.h" + #include "cabac_functions.h" #include "hevc.h" -+#include "cabac_functions.h" -+ + +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +// BY22 is probably faster than simple bypass if the processor has @@ -6287,7 +10126,7 @@ index 05b2821..733efde 100644 +#if ARCH_ARM +#include "arm/hevc_cabac.h" +#endif - ++ #define 
CABAC_MAX_BIN 31 + @@ -6610,7 +10449,7 @@ index 05b2821..733efde 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -966,90 +1227,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -6623,7 +10462,7 @@ index 05b2821..733efde 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -6664,7 +10503,7 @@ index 05b2821..733efde 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -6740,7 +10579,7 @@ index 05b2821..733efde 100644 + rv = (rv << 1) | b; + } + return rv; -+} + } +#endif + + @@ -6881,22 +10720,21 @@ index 05b2821..733efde 100644 + int * const pPrev_sig) +{ + while (--i >= 0) { -+ unsigned int x_cg = scan_x_cg[i]; -+ unsigned int y_cg = scan_y_cg[i]; ++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; ++ const unsigned int x_cg = scan_x_cg[i]; + + // For the flag decode we only care about Z/NZ but -+ // we use the full Right + Down * 2 when calculating -+ // significant coeff flags so we obtain it here -+ //. ++ // we use the full Right * 2 + Down when calculating ++ // significant coeff flags so we obtain it here. ++ // + // The group flag array is one longer than it needs to + // be so we don't need to check for y_cg limits -+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) | -+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1); ++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); + + if (i == 0 || + significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) + { -+ significant_coeff_group_flag[y_cg] |= (1 << x_cg); ++ gf_y[0] |= (1 << x_cg); + *pPrev_sig = prev_sig; + break; + } @@ -6914,35 +10752,128 @@ index 05b2821..733efde 100644 + unsigned int stride = frame->linesize[c_idx]; + unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; + unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; -+ const int is_sliced = rpi_sliced_frame(frame); ++ const int is_sliced = av_rpi_is_sand_frame(frame); + uint8_t * dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(frame, x, y) : -+ rpi_sliced_frame_pos_c(frame, x, y); ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); + -+// if (c_idx != 0) { -+// return; -+// } + if (s->enable_rpi) { -+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; -+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? 
c_idx : 0); -+ cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->c_idx = c_idx; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->jb0->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ cmd->ta.dc = 0; ++ } + } + else if (!is_sliced || c_idx == 0) { + s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); + } ++#if RPI_HEVC_SAND ++ // * These should probably never happen + else if (c_idx == 1) { -+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } + else { -+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } - } ++#endif ++} ++ ++ ++static void rpi_add_dc(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame->linesize[c_idx]; ++ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); ++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); ++ ++ if (s->enable_rpi) { ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); ++ ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->jb0->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? 
coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; ++ } ++ } ++} ++ ++ +#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, @@ -6985,6 +10916,7 @@ index 05b2821..733efde 100644 +#endif +#ifdef RPI + int use_vpu; ++ int use_dc = 0; +#endif + int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero @@ -7006,7 +10938,6 @@ index 05b2821..733efde 100644 + const int c_idx_nz = (c_idx != 0); + + int may_hide_sign; -+ // Derive QP for dequant if (!lc->cu.cu_transquant_bypass_flag) { @@ -7015,7 +10946,7 @@ index 05b2821..733efde 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1065,9 +1706,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -7036,7 +10967,7 @@ index 05b2821..733efde 100644 } if (c_idx == 0) { -@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1100,39 +1751,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -7127,7 +11058,7 @@ index 05b2821..733efde 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1160,119 +1848,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -7184,53 +11115,35 @@ index 05b2821..733efde 100644 - for (i = num_last_subset; i >= 0; i--) { - int n, m; - int x_cg, y_cg, x_c, y_c, pos; -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant -+ -+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; -+ -+ { -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+#ifdef RPI -+ use_vpu = 0; -+ if (s->enable_rpi) { -+ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; -+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); -+#if HAVE_NEON -+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); -+#else -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+#endif -+ } -+ else -+#endif -+ { -+ coeffs = (int16_t*)(c_idx_nz ? 
lc->edge_emu_buffer2 : lc->edge_emu_buffer); -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+ } -+ } -+ -+ i = num_last_subset; -+ do { - int implicit_non_zero_coeff = 0; +- int implicit_non_zero_coeff = 0; - int64_t trans_coeff_level; - int prev_sig = 0; - int offset = i << 4; - int rice_init = 0; -+ int n_end; - - uint8_t significant_coeff_flag_idx[16]; -- uint8_t nb_significant_coeff_flag = 0; - +- uint8_t significant_coeff_flag_idx[16]; +- uint8_t nb_significant_coeff_flag = 0; ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant + - x_cg = scan_x_cg[i]; - y_cg = scan_y_cg[i]; -- ++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; + - if ((i < num_last_subset) && (i > 0)) { - int ctx_cg = 0; - if (x_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; - if (y_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -- ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++#ifdef RPI ++ use_vpu = 0; ++ if (s->enable_rpi) { ++ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processinmg ++ use_dc = (num_coeff == 1) && !special && ++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); + - significant_coeff_group_flag[x_cg][y_cg] = - significant_coeff_group_flag_decode(s, c_idx, ctx_cg); - implicit_non_zero_coeff = 1; @@ -7238,9 +11151,37 @@ index 05b2821..733efde 100644 - significant_coeff_group_flag[x_cg][y_cg] = - ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || - (x_cg == 0 && y_cg == 0)); -- } -- ++ if (use_dc) { ++ // Just need a little empty space ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ // No need to clear ++ } ++ else ++ { ++ use_vpu = !special && log2_trafo_size >= 4; ++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } + } ++ else ++#endif ++ { ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ } ++ } + - last_scan_pos = num_coeff - offset - 1; ++ i = num_last_subset; ++ do { ++ int implicit_non_zero_coeff = 0; ++ int n_end; ++ ++ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; if (i == num_last_subset) { @@ -7272,23 +11213,24 @@ index 05b2821..733efde 100644 + H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 + V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 + }; ++ // N.B. 
prev_sig = Right * 2 + Down + static const uint8_t ctx_idx_maps[3][4][16] = { + { + D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + } }; @@ -7326,7 +11268,7 @@ index 05b2821..733efde 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1286,34 +2002,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -7375,11 +11317,12 @@ index 05b2821..733efde 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1323,141 +2035,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; +- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -7427,9 +11370,6 @@ index 05b2821..733efde 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -7440,6 +11380,9 @@ index 05b2821..733efde 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -7450,18 +11393,13 @@ index 05b2821..733efde 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 
2 : 0; -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -7469,7 +11407,11 @@ index 05b2821..733efde 100644 - sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -- ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -7551,9 +11493,6 @@ index 05b2821..733efde 100644 + { + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); + const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; -+ -+ sum_abs += last_coeff_abs_level_remaining + 1; -+ *level = trans_coeff_level; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -7574,6 +11513,9 @@ index 05b2821..733efde 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ sum_abs += last_coeff_abs_level_remaining + 1; ++ *level = trans_coeff_level; ++ + if (stat_coeff != NULL) + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + stat_coeff = NULL; @@ -7678,7 +11620,7 @@ index 05b2821..733efde 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1467,7 +2223,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -7687,56 +11629,41 @@ index 05b2821..733efde 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - for (i = 0; i < 8; i++) - FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); - } -- - s->hevcdsp.transform_skip(coeffs, log2_trafo_size); - - if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); +@@ -1487,10 +2243,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { -- s->hevcdsp.idct_4x4_luma(coeffs); -+ s->hevcdsp.idct_4x4_luma(coeffs); - } else { + s->hevcdsp.idct_4x4_luma(coeffs); +- } else { ++ } +#ifdef RPI -+ if (!use_vpu) { -+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); -+ if (max_xy == 0) { -+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -+ } else { -+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; -+ if (max_xy < 4) -+ col_limit = FFMIN(4, col_limit); -+ else if (max_xy < 8) -+ col_limit = FFMIN(8, col_limit); -+ else if (max_xy < 12) -+ col_limit = FFMIN(24, col_limit); -+ -+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); -+ } -+ } ++ else if (!use_vpu) +#else ++ else ++#endif ++ { int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) - 
s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - col_limit = FFMIN(24, col_limit); - s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); - } +- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); ++ { ++#ifdef RPI ++ if (use_dc) ++ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ else +#endif - } - } - if (lc->tu.cross_pf) { -@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ } + else { + int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; + if (max_xy < 4) +@@ -1510,7 +2279,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ if (!use_dc) ++ { ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ } +#else s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); +#endif @@ -7744,7 +11671,7 @@ index 05b2821..733efde 100644 void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index 1f33b0c..3143b4f 100644 +index 9fbcd1d8b8..df129e2e46 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ @@ -7760,26 +11687,31 @@ index 1f33b0c..3143b4f 100644 #include "libavutil/common.h" #include "libavutil/internal.h" -@@ -31,6 +37,11 @@ +@@ -31,6 +37,16 @@ #include "bit_depth_template.c" +#ifdef RPI +#include "rpi_qpu.h" ++#endif ++#if RPI_HEVC_SAND +#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++#else ++#define RPI_ZC_SAND_8_IN_10_BUF 0 +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) +@@ -139,6 +155,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; } +static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) +{ -+#ifdef RPI -+ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +#else + return s->ps.sps->pixel_shift; +#endif @@ -7788,7 +11720,75 @@ index 1f33b0c..3143b4f 100644 static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, intptr_t stride_dst, intptr_t stride_src) { -@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, +@@ -161,12 +186,21 @@ int i, j; + } + } + ++// "DSP" these? 
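/*
 * Illustrative sketch only - not part of the upstream patch. The
 * av_rpi_is_sand_frame() / av_rpi_sand_frame_pos_*() calls used by the filter
 * code in this file address pixels in the "sand" (column-striped) layout,
 * where the frame is stored as vertical stripes stride1 bytes wide and (going
 * by the stride2 arithmetic elsewhere in this patch) stride2 rows tall, each
 * stripe laid out contiguously. Under that assumption, a hypothetical helper
 * for the byte offset of a sample at (x, y) would look like this:
 */
#include <stddef.h>

static inline size_t stripe_sample_offset(const unsigned int x,       /* sample column   */
                                          const unsigned int y,       /* row             */
                                          const unsigned int bpp,     /* bytes per sample */
                                          const unsigned int stride1, /* stripe width, bytes */
                                          const unsigned int stride2) /* stripe height, rows */
{
    const size_t xb = (size_t)x * bpp;           /* horizontal byte position  */
    return (xb / stride1) * stride1 * stride2    /* skip whole stripes        */
         + (size_t)y * stride1                   /* rows within this stripe   */
         + xb % stride1;                         /* byte column in the stripe */
}
/* end of illustrative sketch */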
+ static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) + { +- if (pixel_shift) +- *(uint16_t *)dst = *(uint16_t *)src; +- else +- *dst = *src; ++ switch (pixel_shift) ++ { ++ case 2: ++ *(uint32_t *)dst = *(uint32_t *)src; ++ break; ++ case 1: ++ *(uint16_t *)dst = *(uint16_t *)src; ++ break; ++ default: ++ *dst = *src; ++ break; ++ } + } + + static void copy_vert(uint8_t *dst, const uint8_t *src, +@@ -174,18 +208,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src, + int stride_dst, int stride_src) + { + int i; +- if (pixel_shift == 0) { +- for (i = 0; i < height; i++) { +- *dst = *src; +- dst += stride_dst; +- src += stride_src; +- } +- } else { +- for (i = 0; i < height; i++) { +- *(uint16_t *)dst = *(uint16_t *)src; +- dst += stride_dst; +- src += stride_src; +- } ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; + } + } + +@@ -193,7 +238,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, int stride_src, int x, int y, int width, int height, int c_idx, int x_ctb, int y_ctb) { @@ -7797,7 +11797,7 @@ index 1f33b0c..3143b4f 100644 int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; -@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -224,13 +269,14 @@ static void restore_tqb_pixels(HEVCContext *s, int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); @@ -7815,21 +11815,27 @@ index 1f33b0c..3143b4f 100644 for (n = 0; n < (min_pu_size >> vshift); n++) { memcpy(src, dst, len); src += stride_src; -@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -246,7 +292,13 @@ static void restore_tqb_pixels(HEVCContext *s, static void sao_filter_CTB(HEVCContext *s, int x, int y) { - static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++#if SAO_FILTER_N == 5 + static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#elif SAO_FILTER_N == 6 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#else ++#error Confused by size of sao fn array ++#endif HEVCLocalContext *lc = s->HEVClc; int c_idx; int edges[4]; // 0 left 1 top 2 right 3 bottom -@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -267,12 +319,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) uint8_t right_tile_edge = 0; uint8_t up_tile_edge = 0; uint8_t bottom_tile_edge = 0; -+#ifdef RPI -+ const int sliced = rpi_sliced_frame(s->frame); ++#if RPI_HEVC_SAND ++ const int sliced = av_rpi_is_sand_frame(s->frame); + const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +#else + const int plane_count = (s->ps.sps->chroma_format_idc ? 
3 : 1); @@ -7847,7 +11853,7 @@ index 1f33b0c..3143b4f 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -304,7 +366,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) } } @@ -7856,7 +11862,7 @@ index 1f33b0c..3143b4f 100644 int x0 = x >> s->ps.sps->hshift[c_idx]; int y0 = y >> s->ps.sps->vshift[c_idx]; int stride_src = s->frame->linesize[c_idx]; -@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -313,28 +375,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; @@ -7865,24 +11871,24 @@ index 1f33b0c..3143b4f 100644 + ptrdiff_t stride_dst; uint8_t *dst; -+#ifdef RPI -+ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); + const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; + uint8_t * const src = !sliced ? -+ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); + const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : + !sliced ? src - (1 << sh) : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); + const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : + !sliced ? src + (width << sh) : + c_idx == 0 ? 
-+ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + + + if (sliced && c_idx > 1) { @@ -7913,7 +11919,7 @@ index 1f33b0c..3143b4f 100644 + dst = lc->edge_emu_buffer; + stride_dst = 2*MAX_PB_SIZE; + copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, @@ -7934,9 +11940,11 @@ index 1f33b0c..3143b4f 100644 - s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, - sao->offset_val[c_idx], sao->band_position[c_idx], - width, height); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { ++// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src); ++ + s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, + sao->offset_val[1], sao->band_position[1], + sao->offset_val[2], sao->band_position[2], @@ -7952,7 +11960,7 @@ index 1f33b0c..3143b4f 100644 } sao->type_idx[c_idx] = SAO_APPLIED; break; -@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -342,108 +460,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) { int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; @@ -8091,7 +12099,7 @@ index 1f33b0c..3143b4f 100644 - vert_edge, - horiz_edge, - diag_edge); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + // Class always the same for both U & V (which is just as well :-)) @@ -8121,18 +12129,42 @@ index 1f33b0c..3143b4f 100644 + horiz_edge, + diag_edge); + } ++ // ??? Does this actually work for chroma ??? restore_tqb_pixels(s, src, dst, stride_src, stride_dst, x, y, width, height, c_idx); sao->type_idx[c_idx] = SAO_APPLIED; -@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -451,8 +579,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } } ++ ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && ++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) ++ { ++ const unsigned int stride1 = s->frame->linesize[0]; ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); ++ const unsigned int xoff = (x >> 8) * stride2 * stride1; ++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; ++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; ++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; ++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; ++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); ++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; ++ ++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); ++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); ++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); ++ } ++#endif } +// Returns 2 or 0. 
static int get_pcm(HEVCContext *s, int x, int y) { int log2_min_pu_size = s->ps.sps->log2_min_pu_size; -@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -479,7 +629,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) uint8_t *src; int x, y; int chroma, beta; @@ -8141,7 +12173,7 @@ index 1f33b0c..3143b4f 100644 uint8_t no_p[2] = { 0 }; uint8_t no_q[2] = { 0 }; -@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -496,6 +646,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -8157,7 +12189,7 @@ index 1f33b0c..3143b4f 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -529,19 +688,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; @@ -8175,14 +12207,14 @@ index 1f33b0c..3143b4f 100644 - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); + } -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + + // This copes properly with no_p/no_q -+ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + s->frame->linesize[LUMA], + beta, tc, no_p, no_q, -+ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); + } + else +#endif @@ -8217,21 +12249,21 @@ index 1f33b0c..3143b4f 100644 } } -@@ -561,7 +697,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -561,7 +752,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; - src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_y(s->frame, x, y) : +#endif + &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + 4, y - 1); -@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -571,6 +767,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -8251,17 +12283,19 @@ index 1f33b0c..3143b4f 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -579,6 +788,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } if (s->ps.sps->chroma_format_idc) { -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + const int v = 2; + const int h = 2; + + // vertical filtering chroma + for (y = y0; y < y_end; y += 8 * v) { ++// const int demi_y = y + 4 * v >= s->ps.sps->height; ++ const int demi_y = 0; + for (x = x0 ? 
x0 : 8 * h; x < x_end; x += 8 * h) { + const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; + const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; @@ -8269,7 +12303,7 @@ index 1f33b0c..3143b4f 100644 + if ((bs0 == 2) || (bs1 == 2)) { + const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; + const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_y ? 0 : 2 | 8; + + // tc_offset here should be set to cur_tc_offset I think + const uint32_t tc4 = @@ -8289,10 +12323,10 @@ index 1f33b0c..3143b4f 100644 + continue; + } + -+ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, -+ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), + no_f); + } + } @@ -8307,6 +12341,9 @@ index 1f33b0c..3143b4f 100644 + x_end2 = x_end - 8 * h; + + for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++// const int demi_x = x + 4 * v >= s->ps.sps->width; ++ const int demi_x = 0; ++ + const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; + const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; + if ((bs0 == 2) || (bs1 == 2)) { @@ -8315,7 +12352,7 @@ index 1f33b0c..3143b4f 100644 + const uint32_t tc4 = + ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | + ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_x ? 0 : 2 | 8; + + if (tc4 == 0) + continue; @@ -8331,7 +12368,7 @@ index 1f33b0c..3143b4f 100644 + continue; + } + -+ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, no_f); + } @@ -8343,21 +12380,21 @@ index 1f33b0c..3143b4f 100644 for (chroma = 1; chroma <= 2; chroma++) { int h = 1 << s->ps.sps->hshift[chroma]; int v = 1 << s->ps.sps->vshift[chroma]; -@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -595,7 +894,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? 
++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x - 1, y); no_p[1] = get_pcm(s, x - 1, y + (4 * v)); -@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -605,9 +909,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -8381,21 +12418,21 @@ index 1f33b0c..3143b4f 100644 } } -@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -628,7 +946,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + (4 * h), y - 1); -@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -638,6 +961,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -8415,7 +12452,7 @@ index 1f33b0c..3143b4f 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -648,69 +984,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -8485,7 +12522,7 @@ index 1f33b0c..3143b4f 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -721,10 +994,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -8511,7 +12548,7 @@ index 1f33b0c..3143b4f 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -736,34 +1021,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -8588,7 +12625,7 @@ index 1f33b0c..3143b4f 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -774,64 +1081,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -8691,7 +12728,7 @@ index 1f33b0c..3143b4f 100644 } } } -@@ 
-840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -840,11 +1137,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR @@ -8701,8 +12738,8 @@ index 1f33b0c..3143b4f 100644 +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +{ + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma); + rpi_cache_flush_finish(rfe); +} +#endif @@ -8716,10 +12753,11 @@ index 1f33b0c..3143b4f 100644 + const int d0 = ((int *)f->progress->data)[0]; + const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 + -+ if (curr_y < (unsigned int)f->f->height) { ++ if (curr_y < (unsigned int)s->ps.sps->height) { + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y, ++ s->ps.sps->vshift[1], 1, 1); + rpi_cache_flush_finish(rfe); + } + } @@ -8759,7 +12797,7 @@ index 1f33b0c..3143b4f 100644 + // Call VPU + { + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); -+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands ++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands + vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); + vpu_qpu_job_finish(vqj); + } @@ -8796,61 +12834,167 @@ index 1f33b0c..3143b4f 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -853,16 +1244,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +#endif - ff_thread_report_progress(&s->ref->tf, y, 0); ++ ff_hevc_progress_signal_recon(s, y); + } } if (x_end && y_end) { sao_filter_CTB(s, x , y); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size); + } -+ } + } +- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); + } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; + //if (((y + ctb_size)&63)==0) +#ifdef RPI_DEBLOCK_VPU + if (s->enable_rpi_deblock) { -+ // we no longer need to flush the luma 
buffer as it is in GPU memory when using deblocking on the rpi -+ if (done_deblock) { -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); -+ } ++ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi ++ if (done_deblock) { ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } + } else { +#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); - } -- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } +#else +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); -+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); +#endif + } } void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) +diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c +index 4a6dde0f67..8ee37ebfbc 100644 +--- a/libavcodec/hevc_mvs.c ++++ b/libavcodec/hevc_mvs.c +@@ -111,7 +111,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField + return 0; + } + +-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) ++static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb) + { + int tx, scale_factor; + +@@ -125,10 +125,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) + (scale_factor * src->y < 0)) >> 8); + } + +-static int check_mvset(Mv *mvLXCol, Mv *mvCol, +- int colPic, int poc, +- RefPicList *refPicList, int X, int refIdxLx, +- RefPicList *refPicList_col, int listCol, int refidxCol) ++static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol, ++ const int colPic, const int poc, ++ const RefPicList * const refPicList, const int X, const int refIdxLx, ++ const RefPicList * const refPicList_col, const int listCol, const int refidxCol) + { + int cur_lt = refPicList[X].isLongTerm[refIdxLx]; + int col_lt = refPicList_col[listCol].isLongTerm[refidxCol]; +@@ -159,11 +159,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol, + refPicList_col, L ## l, temp_col.ref_idx[l]) + + // derive the motion vectors section 8.5.3.1.8 +-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, +- int refIdxLx, Mv *mvLXCol, int X, +- int colPic, RefPicList *refPicList_col) ++static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col, ++ const int refIdxLx, Mv * const mvLXCol, const int X, ++ const int colPic, const RefPicList * const refPicList_col) + { +- RefPicList *refPicList = s->ref->refPicList; ++ const RefPicList * const refPicList = s->ref->refPicList; + + if (temp_col.pred_flag == PF_INTRA) + return 0; +@@ -214,20 +214,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, + /* + * 8.5.3.1.7 temporal luma motion vector prediction + */ +-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, +- int nPbW, int nPbH, int refIdxLx, +- Mv *mvLXCol, int X) ++static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, const int refIdxLx, ++ Mv * const mvLXCol, const int X) + { + MvField *tab_mvf; + MvField temp_col; + int x, y, x_pu, y_pu; +- int min_pu_width = 
s->ps.sps->min_pu_width; ++ const int min_pu_width = s->ps.sps->min_pu_width; + int availableFlagLXCol = 0; + int colPic; + +- HEVCFrame *ref = s->ref->collocated_ref; ++ HEVCFrame * const ref = s->ref->collocated_ref; + +- if (!ref) { ++ if (ref == NULL || ref->tab_mvf == NULL) { + memset(mvLXCol, 0, sizeof(*mvLXCol)); + return 0; + } +@@ -239,14 +239,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + x = x0 + nPbW; + y = y0 + nPbH; + +- if (tab_mvf && +- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && + y < s->ps.sps->height && + x < s->ps.sps->width) { + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); +@@ -254,13 +253,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + } + + // derive center collocated motion vector +- if (tab_mvf && !availableFlagLXCol) { ++ if (!availableFlagLXCol) { + x = x0 + (nPbW >> 1); + y = y0 + (nPbH >> 1); + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index 83f2ec2..bcf53dc 100644 +index c1b69a0199..455cdaea1c 100644 --- a/libavcodec/hevc_ps.c +++ b/libavcodec/hevc_ps.c -@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) +@@ -785,7 +785,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) switch (sps->bit_depth) { case 8: if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; @@ -8863,17 +13007,112 @@ index 83f2ec2..bcf53dc 100644 if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; break; -@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, - sps->amp_enabled_flag = get_bits1(gb); - sps->sao_enabled = get_bits1(gb); +@@ -797,7 +802,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + break; + case 10: + if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? 
AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10; ++#else + if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10; ++#endif + if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10; + if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10; + break; +@@ -1064,7 +1074,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, + skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); + if (sps_extension_flag[0]) { + int extended_precision_processing_flag; +- int high_precision_offsets_enabled_flag; + int cabac_bypass_alignment_enabled_flag; -+ av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled); -+ - sps->pcm_enabled_flag = get_bits1(gb); - if (sps->pcm_enabled_flag) { - sps->pcm.bit_depth = get_bits(gb, 4) + 1; + sps->transform_skip_rotation_enabled_flag = get_bits1(gb); +@@ -1079,10 +1088,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, + "extended_precision_processing_flag not yet implemented\n"); + + sps->intra_smoothing_disabled_flag = get_bits1(gb); +- high_precision_offsets_enabled_flag = get_bits1(gb); +- if (high_precision_offsets_enabled_flag) ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); ++ if (sps->high_precision_offsets_enabled_flag) + av_log(avctx, AV_LOG_WARNING, +- "high_precision_offsets_enabled_flag not yet implemented\n"); ++ "high_precision_offsets_enabled_flag not fully implemented\n"); + + sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); + +diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c +index df52e401ad..8869a4a602 100644 +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -23,7 +23,7 @@ + + #include "libavutil/avassert.h" + #include "libavutil/pixdesc.h" +- ++#include "libavutil/rpi_sand_fns.h" + #include "internal.h" + #include "thread.h" + #include "hevc.h" +@@ -205,7 +205,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) + HEVCFrame *frame = &s->DPB[min_idx]; + AVFrame *dst = out; + AVFrame *src = frame->frame; +- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format); ++ const int fmt = src->format; ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + int pixel_shift = !!(desc->comp[0].depth > 8); + + ret = av_frame_ref(out, src); +@@ -216,12 +217,29 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) + if (ret < 0) + return ret; + +- for (i = 0; i < 3; i++) { +- int hshift = (i > 0) ? desc->log2_chroma_w : 0; +- int vshift = (i > 0) ? desc->log2_chroma_h : 0; +- int off = ((frame->window.left_offset >> hshift) << pixel_shift) + +- (frame->window.top_offset >> vshift) * dst->linesize[i]; +- dst->data[i] += off; ++ if (av_rpi_is_sand_format(fmt)) ++ { ++ // Sand cannot be windowed by offset so add side data if we have an offset ++ const HEVCWindow * const window = &frame->window; ++ if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0) ++ { ++ AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVPanScan)); ++ AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ si->left_offset = window->left_offset; ++ si->top_offset = window->top_offset; ++ si->pic_width = s->ps.sps->width; ++ si->pic_height = s->ps.sps->height; ++ } ++ } ++ else ++ { ++ for (i = 0; i < 3; i++) { ++ int hshift = (i > 0) ? desc->log2_chroma_w : 0; ++ int vshift = (i > 0) ? 
desc->log2_chroma_h : 0; ++ int off = ((frame->window.left_offset >> hshift) << pixel_shift) + ++ (frame->window.top_offset >> vshift) * dst->linesize[i]; ++ dst->data[i] += off; ++ } + } + av_log(s->avctx, AV_LOG_DEBUG, + "Output frame with POC %d.\n", frame->poc); +@@ -426,8 +444,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc) + frame->sequence = s->seq_decode; + frame->flags = 0; + +- if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_report_progress(&frame->tf, INT_MAX, 0); ++ ff_hevc_progress_set_all_done(frame); + + return frame; + } diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 9d773d9..c4d7250 100644 +index 9d773d960e..c9661c3ab1 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { @@ -8997,28 +13236,16 @@ index 9d773d9..c4d7250 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -193,15 +307,57 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) -+#ifndef RPI ++#if !RPI_HEVC_SAND +#define SLICED_LOOP_FILTERS(depth) ++#define SLICED_ADD_RESIDUAL(depth) ++#define SLICED_SAO(depth) +#else -+#define SLICED_LOOP_FILTERS(depth)\ -+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ -+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#endif -+ -+ - #define HEVC_DSP(depth) \ - hevcdsp->put_pcm = FUNC(put_pcm, depth); \ - hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ -@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) - hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ - hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ - hevcdsp->transform_skip = FUNC(transform_skip, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); \ ++#define SLICED_ADD_RESIDUAL(depth)\ + hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ + hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ + hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ @@ -9027,30 +13254,77 @@ index 9d773d9..c4d7250 100644 + hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ + hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ + hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#define SLICED_SAO(depth)\ ++ for (i = 
0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) ++ ++#endif ++ + #define HEVC_DSP(depth) \ + hevcdsp->put_pcm = FUNC(put_pcm, depth); \ +- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ +- hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \ +- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ +- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ +- hevcdsp->transform_skip = FUNC(transform_skip, depth); \ ++ hevcdsp->transform_add[0] = FUNC(add_residual4x4, depth); \ ++ hevcdsp->transform_add[1] = FUNC(add_residual8x8, depth); \ ++ hevcdsp->transform_add[2] = FUNC(add_residual16x16, depth); \ ++ hevcdsp->transform_add[3] = FUNC(add_residual32x32, depth); \ ++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ ++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ ++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ ++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ ++ SLICED_ADD_RESIDUAL(depth); \ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ - hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ +- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ ++ hevcdsp->transform_skip = FUNC(transform_skip, depth); \ ++ hevcdsp->idct_4x4_luma = FUNC(idct_4x4_luma, depth); \ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ -@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ + hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ +@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ + hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ + \ +- hevcdsp->sao_band_filter[0] = \ +- hevcdsp->sao_band_filter[1] = \ +- hevcdsp->sao_band_filter[2] = \ +- hevcdsp->sao_band_filter[3] = \ +- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \ +- hevcdsp->sao_edge_filter[0] = \ +- hevcdsp->sao_edge_filter[1] = \ +- hevcdsp->sao_edge_filter[2] = \ +- hevcdsp->sao_edge_filter[3] = \ +- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ ++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ ++ } \ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ ++ SLICED_SAO(depth); \ \ -+ hevcdsp->sao_band_filter_c[0] = \ -+ hevcdsp->sao_band_filter_c[1] = \ -+ hevcdsp->sao_band_filter_c[2] = \ -+ hevcdsp->sao_band_filter_c[3] = \ -+ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[0] = \ -+ hevcdsp->sao_edge_filter_c[1] = \ -+ hevcdsp->sao_edge_filter_c[2] = \ -+ hevcdsp->sao_edge_filter_c[3] = \ -+ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ -+ \ QPEL_FUNCS(depth); \ QPEL_UNI_FUNCS(depth); \ - QPEL_BI_FUNCS(depth); \ -@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -232,6 +383,7 @@ void 
ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) EPEL_UNI_FUNCS(depth); \ EPEL_BI_FUNCS(depth); \ \ @@ -9058,7 +13332,7 @@ index 9d773d9..c4d7250 100644 hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ -@@ -257,6 +404,8 @@ int i = 0; +@@ -257,6 +409,8 @@ int i = 0; break; } @@ -9068,10 +13342,18 @@ index 9d773d9..c4d7250 100644 ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h -index 9f1f6dd..639ecf1 100644 +index 9f1f6dd59f..c4a1b0f09d 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h -@@ -42,11 +42,26 @@ typedef struct SAOParams { +@@ -25,6 +25,7 @@ + #ifndef AVCODEC_HEVCDSP_H + #define AVCODEC_HEVCDSP_H + ++#include "rpi_opts.h" + #include "get_bits.h" + + #define MAX_PB_SIZE 64 +@@ -42,11 +43,40 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -9085,45 +13367,69 @@ index 9f1f6dd..639ecf1 100644 + int8_t ref_idx[2]; + int8_t pred_flag; +} MvField; ++ ++#ifdef RPI ++#define SAO_FILTER_N 6 ++#else ++#define SAO_FILTER_N 5 ++#endif ++ + typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int pcm_bit_depth); -+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, -+ struct GetBitContext *gb, int pcm_bit_depth); - void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); ++ // add_residual was transform_add - import 3.3 names + void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); -+ void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); -+ void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc); ++#if RPI_HEVC_SAND ++ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v); ++ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u); ++ ++ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv); ++ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, ++ struct GetBitContext *gb, int pcm_bit_depth); ++#endif void (*transform_skip)(int16_t *coeffs, int16_t log2_size); -@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext { +@@ -58,16 +88,31 @@ typedef struct HEVCDSPContext { - void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + void (*idct_dc[4])(int16_t *coeffs); + +- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++#if RPI_HEVC_SAND ++ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); ++#endif /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + 
AV_INPUT_BUFFER_PADDING_SIZE */ - void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, int16_t *sao_offset_val, int sao_eo_class, int width, int height); -+ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++#if RPI_HEVC_SAND ++ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); ++#endif void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#if RPI_HEVC_SAND + void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#endif void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width); -@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext { +@@ -120,6 +165,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); @@ -9147,23 +13453,24 @@ index 9f1f6dd..639ecf1 100644 void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c -index b840d17..32b9e47 100644 +index 5bca02342d..122fbe8154 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c -@@ -26,6 +26,9 @@ +@@ -26,6 +26,7 @@ #include "bit_depth_template.c" #include "hevcdsp.h" -+#ifdef RPI -+#include "rpi_zc.h" -+#endif ++#include "rpi_shader_template.h" static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) -@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height +@@ -42,8 +43,32 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height } } +-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride, int size) ++#if RPI_HEVC_SAND +static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) +{ @@ -9185,17 +13492,20 @@ index b840d17..32b9e47 100644 + dst += stride; + } +} ++#endif + -+ - static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride, int size) ++static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride, int size) { -@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe + int x, y; + pixel *dst = (pixel *)_dst; +@@ -59,30 +84,255 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe } } -+static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride, int size) +-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, +- 
ptrdiff_t stride) ++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size) +{ + int x, y; + pixel *dst = (pixel *)_dst; @@ -9203,77 +13513,300 @@ index b840d17..32b9e47 100644 + stride /= sizeof(pixel); + + for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ dst[x] = av_clip_pixel(dst[x] + dc); ++ } ++ dst += stride; ++ } ++} ++ ++ ++#if RPI_HEVC_SAND ++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_v, int size) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 4); ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { + for (x = 0; x < size * 2; x += 2) { + dst[x] = av_clip_pixel(dst[x] + *res); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ res++; ++ } ++ dst += stride; ++ } + } + +-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs, ++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_u, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res); + res++; + } + dst += stride; + } +} + - static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, ++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, unsigned int size) ++{ ++ unsigned int x, y; ++ pixel *dst = (pixel *)_dst; ++ const int16_t * ru = res; ++ const int16_t * rv = res + size * size; ++ ++// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1); ++// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0); ++// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0); ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++); ++ } ++ dst += stride; ++ } ++ ++// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1); ++} ++ ++ ++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ const int dc_v = dc >> 16; ++ const int dc_u = (dc << 16) >> 16; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ } ++ dst += stride; ++ } ++} ++ ++ ++#endif ++ ++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, coeffs, stride, 4); ++} ++ ++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride) { -@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, - FUNC(transquant_bypass)(_dst, coeffs, stride, 32); +- FUNC(transquant_bypass)(_dst, coeffs, stride, 8); ++ FUNC(add_residual)(_dst, coeffs, stride, 8); } +-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs, ++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 16); ++ FUNC(add_residual)(_dst, coeffs, stride, 16); + } + +-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t 
*coeffs, ++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 32); ++ FUNC(add_residual)(_dst, coeffs, stride, 32); + } + ++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 32); ++} ++ ++#if RPI_HEVC_SAND +// -- U -- (plaited) + -+static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 4); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); +} + -+static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 8); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); +} + -+static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 16); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); +} + -+static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 32); ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); +} + +// -- V -- (plaited) + -+static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 4); ++ FUNC(add_residual_c)(_dst, res, stride, 4); +} + -+static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 8); ++ FUNC(add_residual_c)(_dst, res, stride, 8); +} + -+static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ 
FUNC(add_residual_uv)(_dst + 1, res, stride, 16); ++ FUNC(add_residual_c)(_dst, res, stride, 16); +} + -+static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 32); ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); +} ++ ++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++#endif + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) { -@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -152,7 +402,7 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size) + assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ + } while (0) + +-static void FUNC(transform_4x4_luma)(int16_t *coeffs) ++static void FUNC(idct_4x4_luma)(int16_t *coeffs) + { + int i; + int shift = 7; +@@ -358,6 +608,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride + } + } + ++ ++#if BIT_DEPTH == 10 ++#if RPI_HEVC_SAND ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++#endif ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 ++#endif ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 + static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, + int *borders, int _width, int _height, +@@ -367,7 +643,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9281,7 +13814,7 @@ index b840d17..32b9e47 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, width = _width, height = _height; -@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -376,33 +651,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9321,7 +13854,7 @@ index b840d17..32b9e47 100644 height--; } } -@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -417,7 +688,6 @@ 
static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9329,7 +13862,7 @@ index b840d17..32b9e47 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, init_y = 0, width = _width, height = _height; -@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -426,34 +696,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9370,24 +13903,22 @@ index b840d17..32b9e47 100644 height--; } } -@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -493,6 +759,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } } - ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif + +// --- Plaited chroma versions + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++#if RPI_HEVC_SAND ++ +static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, @@ -9413,23 +13944,17 @@ index b840d17..32b9e47 100644 + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) + { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); + } + dst += stride_dst; + src += stride_src; + } +} -+#endif -+ -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else + +static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, @@ -9447,9 +13972,12 @@ index b840d17..32b9e47 100644 + int a_stride, b_stride; + int x, y; + ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ + stride_dst /= sizeof(pixel); + width *= 2; + ++ av_assert0(width <= 64); ++ + a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; + b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; + for (y = 0; y < height; y++) { @@ -9467,43 +13995,42 @@ index b840d17..32b9e47 100644 + dst += stride_dst; + } +} -+#endif + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, 
AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++// Do once ++#if BIT_DEPTH == 8 +// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 +#endif + ++#endif // RPI_HEVC_SAND + + #undef CMP - //////////////////////////////////////////////////////////////////////////////// -@@ -1694,3 +1900,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, +@@ -1694,3 +2075,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, #undef TQ1 #undef TQ2 #undef TQ3 + -+#ifdef RPI ++#if RPI_HEVC_SAND + +// line zero +#define P3 pix_l[0 * xstride] @@ -9717,7 +14244,7 @@ index b840d17..32b9e47 100644 +#endif + diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c -index 02c1766..cea16ea 100644 +index 02c1766059..cea16eade4 100644 --- a/libavcodec/hevcpred.c +++ b/libavcodec/hevcpred.c @@ -24,6 +24,7 @@ @@ -9799,7 +14326,7 @@ index 02c1766..cea16ea 100644 case 9: HEVC_PRED(9); diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h -index eb17663..00ba3f9 100644 +index eb17663683..00ba3f94c0 100644 --- a/libavcodec/hevcpred.h +++ b/libavcodec/hevcpred.h @@ -38,6 +38,17 @@ typedef struct HEVCPredContext { @@ -9821,10 +14348,10 @@ index eb17663..00ba3f9 100644 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c -index 6ae87cc..c14dddd 100644 +index 6fe33546b1..2f9f5f2798 100644 --- a/libavcodec/hevcpred_template.c +++ b/libavcodec/hevcpred_template.c -@@ -20,13 +20,55 @@ +@@ -20,13 +20,110 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -9836,34 +14363,90 @@ index 6ae87cc..c14dddd 100644 #include "hevcpred.h" +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +#define DUMP_PRED 0 + #define POS(x, y) src[(x) + stride * (y)] -+#if PRED_C -+ ++// REPEAT_INCLUDE defined at EOF ++#if defined(RPI) && !defined(INCLUDED_ONCE) +typedef uint8_t (* c8_dst_ptr_t)[2]; +typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* 
c16_dst_ptr_t)[2]; ++typedef const uint16_t (* c16_src_ptr_t)[2]; ++ ++// *** On ARM make these NEON registers ++typedef struct pixel4_16 { ++ uint16_t x[4]; ++} pixel4_16; ++typedef struct pixel4_32 { ++ uint32_t x[4]; ++} pixel4_32; ++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) ++{ ++ pixel4_16 t = {{x, x, x, x}}; ++ return t; ++} ++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) ++{ ++ pixel4_32 t = {{x, x, x, x}}; ++ return t; ++} ++#endif ++ ++#if PRED_C ++// For chroma we double pixel size so we copy pairs ++#undef pixel ++#undef pixel2 ++#undef pixel4 ++#undef dctcoef ++#undef INIT_CLIP ++#undef no_rnd_avg_pixel4 ++#undef rnd_avg_pixel4 ++#undef AV_RN2P ++#undef AV_RN4P ++#undef AV_RN4PA ++#undef AV_WN2P ++#undef AV_WN4P ++#undef AV_WN4PA ++#undef CLIP ++#undef FUNC ++#undef FUNCC ++#undef av_clip_pixel ++#undef PIXEL_SPLAT_X4 + +#if BIT_DEPTH == 8 -+#undef BIT_DEPTH -+#define BIT_DEPTH 16 -+#include "bit_depth_template.c" -+#undef FUNC -+#define FUNC(a) FUNC3(a, 8, _c) ++#define pixel uint16_t ++#define pixel4 pixel4_16 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 ++#define cpel uint8_t ++#define c_src_ptr_t c8_src_ptr_t ++#define c_dst_ptr_t c8_dst_ptr_t +#else -+#undef FUNC -+#define FUNC FUNCC ++#define pixel uint32_t ++#define pixel4 pixel4_32 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 ++#define cpel uint16_t ++#define c_src_ptr_t c16_dst_ptr_t ++#define c_dst_ptr_t c16_dst_ptr_t ++#endif ++#define AV_RN4P(p) (*(pixel4*)(p)) ++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) ++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) +#endif + ++ ++// Get PW prior to horrid PRED_C trickery ++#if BIT_DEPTH == 8 ++#define PW 1 ++#else ++#define PW 2 +#endif + -+#if DUMP_PRED -+#ifndef DEBUG_ONCE -+#define DEBUG_ONCE ++ ++#if DUMP_PRED && !defined(INCLUDE_ONCE) +static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +{ + for (unsigned int y = 0; y != size; y++, data += stride * 2) { @@ -9875,17 +14458,16 @@ index 6ae87cc..c14dddd 100644 + printf("\n"); +} +#endif -+#endif + static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, int log2_size, int c_idx) { -@@ -69,8 +111,11 @@ do { \ +@@ -69,8 +166,11 @@ do { \ AV_WN4P(&ptr[i], a); \ else \ a = PIXEL_SPLAT_X4(ptr[i + 3]) - -+#ifdef RPI_WORKER ++#ifdef RPI + HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ; +#else HEVCLocalContext *lc = s->HEVClc; @@ -9893,7 +14475,7 @@ index 6ae87cc..c14dddd 100644 int i; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; -@@ -79,15 +124,23 @@ do { \ +@@ -79,15 +179,23 @@ do { \ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; int size_in_luma_v = size << vshift; int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; @@ -9909,18 +14491,18 @@ index 6ae87cc..c14dddd 100644 - ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); + const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +#if defined(RPI) -+ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? ++ pixel *const src = !av_rpi_is_sand_frame(s->frame) ? + (pixel*)s->frame->data[c_idx] + x + y * stride : + c_idx == 0 ? 
-+ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : -+ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); +#else pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +#endif int min_pu_width = s->ps.sps->min_pu_width; -@@ -95,14 +148,20 @@ do { \ +@@ -95,14 +203,20 @@ do { \ lc->tu.intra_pred_mode; pixel4 a; pixel left_array[2 * MAX_TB_SIZE + 1]; @@ -9941,7 +14523,7 @@ index 6ae87cc..c14dddd 100644 int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); int cand_left = lc->na.cand_left; int cand_up_left = lc->na.cand_up_left; -@@ -114,6 +173,26 @@ do { \ +@@ -114,6 +228,27 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; @@ -9954,10 +14536,11 @@ index 6ae87cc..c14dddd 100644 +#endif + +#if defined(RPI) -+ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ if (av_rpi_is_sand_frame(s->frame)) { ++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs + const AVFrame * const frame = s->frame; + const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; + if ((x & mask) == 0) + src_l -= stripe_adj; + if (((x + size) & mask) == 0) @@ -9968,7 +14551,7 @@ index 6ae87cc..c14dddd 100644 if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); -@@ -163,23 +242,24 @@ do { \ +@@ -163,23 +298,24 @@ do { \ top[-1] = 128; } if (cand_up_left) { @@ -10000,29 +14583,29 @@ index 6ae87cc..c14dddd 100644 size - bottom_left_size); } -@@ -268,7 +348,11 @@ do { \ +@@ -268,7 +404,11 @@ do { \ cand_up_left = 1; cand_left = 1; } else { // No samples available -+#if PRED_C && BIT_DEPTH == 16 -+ left[-1] = 0x8080; ++#if PRED_C ++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); +#else left[-1] = (1 << (BIT_DEPTH - 1)); +#endif EXTEND(top, left[-1], 2 * size); EXTEND(left, left[-1], 2 * size); } -@@ -287,6 +371,9 @@ do { \ +@@ -287,6 +427,9 @@ do { \ top[-1] = left[-1]; // Filtering process -+ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // Sand can only apply to chroma_format_idc == 1 so we don't need to + // worry about chroma smoothing for that case +#if !PRED_C if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { if (mode != INTRA_DC && size != 4){ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; -@@ -342,13 +429,46 @@ do { \ +@@ -342,6 +485,30 @@ do { \ mode); break; } @@ -10052,24 +14635,8 @@ index 6ae87cc..c14dddd 100644 +#endif } -+#if !PRED_C || BIT_DEPTH == 16 #define INTRA_PRED(size) \ - static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ - { \ - FUNC(intra_pred)(s, x0, y0, size, c_idx); \ - } -+#else -+#define INTRA_PRED(size) \ -+static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#endif - - INTRA_PRED(2) - INTRA_PRED(3) -@@ -357,6 +477,7 @@ INTRA_PRED(5) +@@ -357,6 +524,7 @@ INTRA_PRED(5) #undef INTRA_PRED @@ -10077,7 +14644,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const 
uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int trafo_size) -@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to +@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); } @@ -10088,9 +14655,9 @@ index 6ae87cc..c14dddd 100644 +{ + int x, y; + int size = 1 << trafo_size; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + + for (y = 0; y < size; y++, src += stride) + { @@ -10105,26 +14672,9 @@ index 6ae87cc..c14dddd 100644 +} +#endif -+#if !PRED_C || BIT_DEPTH == 16 #define PRED_PLANAR(size)\ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ - const uint8_t *left, ptrdiff_t stride) \ - { \ - FUNC(pred_planar)(src, top, left, stride, size + 2); \ - } -+#else -+#define PRED_PLANAR(size)\ -+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ -+ abort(); \ -+} -+#endif - - PRED_PLANAR(0) - PRED_PLANAR(1) -@@ -386,6 +540,7 @@ PRED_PLANAR(3) +@@ -386,6 +577,7 @@ PRED_PLANAR(3) #undef PRED_PLANAR @@ -10132,7 +14682,7 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int log2_size, int c_idx) -@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, POS(0, y) = (left[y] + 3 * dc + 2) >> 2; } } @@ -10143,9 +14693,9 @@ index 6ae87cc..c14dddd 100644 +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + unsigned int dc0 = size; + unsigned int dc1 = size; + @@ -10186,7 +14736,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, -@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const pixel *top = (const pixel *)_top; const pixel *left = (const pixel *)_left; @@ -10202,7 +14752,7 @@ index 6ae87cc..c14dddd 100644 int angle = intra_pred_angle[mode - 2]; pixel ref_array[3 * MAX_TB_SIZE + 4]; pixel *ref_tmp = ref_array + size; -@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, } } } @@ -10214,26 +14764,26 @@ index 6ae87cc..c14dddd 100644 + int mode, int size) +{ + int x, y; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; + + const int angle = intra_pred_angle[mode - 2]; -+ uint8_t ref_array[3 * 
MAX_TB_SIZE + 4][2]; -+ c8_dst_ptr_t ref_tmp = ref_array + size; -+ c8_src_ptr_t ref; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; + const int last = (size * angle) >> 5; + + if (mode >= 18) { + ref = top - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (y = 0; y < size; y++, src += stride) { @@ -10247,19 +14797,19 @@ index 6ae87cc..c14dddd 100644 + fact * ref[x + idx + 2][1] + 16) >> 5; + } + } else { -+ memcpy(src, ref + idx + 1, size * 2); ++ memcpy(src, ref + idx + 1, size * 2 * PW); + } + } + } else { + ref = left - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (x = 0; x < size; x++, src++) { @@ -10286,124 +14836,135 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, const uint8_t *left, -diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 099a8c5..bdff2d2 100644 ---- a/libavcodec/mmaldec.c -+++ b/libavcodec/mmaldec.c -@@ -24,6 +24,9 @@ - * MMAL Video Decoder - */ +@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); + } -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" - #include - #include - #include -@@ -31,6 +34,7 @@ - #include - #include - #include -+#pragma GCC diagnostic pop - - #include "avcodec.h" - #include "internal.h" -diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c -index 3adf28d..2f9195f 100644 ---- a/libavcodec/mpeg4videodec.c -+++ b/libavcodec/mpeg4videodec.c -@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - - if (ctx->divx_version >= 0) - s->workaround_bugs |= FF_BUG_HPEL_CHROMA; ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t ++ + #undef EXTEND_LEFT_CIP + #undef EXTEND_RIGHT_CIP + #undef EXTEND_UP_CIP +@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + #undef EXTEND + #undef MIN_TB_ADDR_ZS + #undef POS ++#undef PW ++ ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif + -+ if (ctx->num_sprite_warping_points > 1) -+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED; - } - - if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, - ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); - -+ avctx->workaround_bugs = s->workaround_bugs; - if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && - s->codec_id == AV_CODEC_ID_MPEG4 && - avctx->idct_algo == FF_IDCT_AUTO) { diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index bfa2537..1bca89e 100644 +index d36b68bfae..b526dc393d 100644 --- a/libavcodec/raw.c +++ b/libavcodec/raw.c -@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { +@@ -260,6 +260,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + /* RPI */ +#ifdef RPI + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, +#endif + /* special */ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d837056..81256b5 100644 +index d83705645c..8dcdf66158 100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c -@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS +@@ -31,6 +31,8 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -47,6 +49,71 @@ FF_ENABLE_DEPRECATION_WARNINGS return 0; } -+static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off) -+{ -+ for (int y = 0; y != frame->height / 2; ++y) { -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off; -+ const int w = FFMIN(frame->linesize[0], frame->width - x) / 2; -+ for (int i = 0; i < w; ++i) -+ *dst++ = p[i * 2]; -+ } -+ } -+ return dst; -+} -+ -+static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ -+ int size = frame->width * frame->height * 3 / 2; ++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; + uint8_t * dst; + int ret; + ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3 / 2; + if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) + return ret; + + dst = pkt->data; + -+ // Luma is "easy" -+ for (int y = 0; y != frame->height; ++y) { -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ const int w = FFMIN(frame->linesize[0], frame->width - x); -+ memcpy(dst, -+ frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w); -+ dst += w; -+ } -+ } -+ // Chroma is dull -+ dst = cpy_sand_c(dst, frame, 0); -+ dst = cpy_sand_c(dst, frame, 1); -+ ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); + return 0; +} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const AVFrameSideData *const sd = 
av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; ++ uint8_t * dst; ++ int ret; ++ ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3; ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet) { -@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +@@ -56,6 +123,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, if (ret < 0) return ret; -+ if (frame->format == AV_PIX_FMT_SAND128) { -+ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); + *got_packet = (ret == 0); + return ret; + } @@ -10411,13 +14972,4018 @@ index d837056..81256b5 100644 if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) return ret; if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, -diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s new file mode 100644 -index 0000000..4309f1c +index 0000000000..391f761df9 --- /dev/null -+++ b/libavcodec/rpi_hevc_transform.h ++++ b/libavcodec/rpi_hevc_transform.s +@@ -0,0 +1,923 @@ ++# ****************************************************************************** ++# Argon Design Ltd. ++# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
++# ++# Module : HEVC ++# Author : Peter de Rivaz ++# ****************************************************************************** ++ ++# HEVC VPU Transform ++# fe ++# Transform matrix can be thought of as ++# output row vector = input row vector * transMatrix2 ++# ++# The even rows of the matrix are symmetric ++# The odd rows of the matrix are antisymmetric ++# ++# So only need to compute the first half of the results, then can compute the remainder with a butterfly ++# ++# EXAMPLE ++# (a b c d) (1 2 2 1) ++# (3 4 -4 -3) ++# (5 6 6 5) ++# (7 8 -8 -7) ++# ++# x=(a c)(1 2) = 1a+5c 2a+6c ++# (5 6) ++# ++# y=(b d)(3 4) = 3b+7d 4b+8d ++# (7 8) ++# ++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d ++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d ++# ++# Final results are (u , v[::-1]) ++# ++# ++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) ++# Apply the even matrix first and stop before rounding ++# Then apply the odd matrix in a full manner: ++# ++# First step is to compute partial products with the first input (16 cycles) ++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output ++# 2a 4b 6c 8d ++# 2a -4b 6c -8d ++# 1a -3b 5c -7d ++# ++# Second step is to sum partial products into final position (8 cycles) ++# 1a+3b+5c+7d ++# 2a+4b+6c+8d ++# 2a-4b+6c-8d ++# 1a-3b+5c-7d ++# ++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) ++# ++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) ++# ++# For 8x8 we could compute two in parallel. ++# ++# ++ ++# Columns are transformed first ++# ++# Store top left half of transMatrix2 in ++# Store bottom left half of transMatrix2 in HX(32,32) ++# ++# For 16x16 ++# HX(0:15,0) contains input data before transform ++# HY(0:15,0) contains 32bit output data after transform ++# HX(32,0) contains even rows of left half of transMatrix2 ++# HX(32,32) contains odd rows of left half of transMatrix2 ++# HY(48,0) contains partial products ready for summing ++# ++ ++ ++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# coeffs32 ++# num32: number of 32x32 transforms ++# command 0 for transform, 1 for memclear16(int16_t *dst,num16) ++# ++ ++.equ TRANS_SHIFT, 20 - BIT_DEPTH ++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) ++.equ TRANS_ASL2, 16 - TRANS_SHIFT ++ ++ ++hevc_trans_16x16: ++ cmp r5,1 ++ beq memclear16 ++ cmp r5,2 ++ beq hevc_deblock_16x16 ++ cmp r5,3 ++ beq hevc_uv_deblock_16x16 ++ cmp r5,4 ++ beq hevc_uv_deblock_16x16_with_clear ++ cmp r5,5 ++ beq hevc_run_command_list ++ ++ push r6-r15, lr # TODO cut down number of used registers ++ mov r14,r3 # coeffs32 ++ mov r15,r4 # num32 ++ mov r3, 16*2 # Stride of transMatrix2 in bytes ++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix ++ ++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix ++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ # Now use r0 to describe which matrix we are working on. 
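++ # (This value is XORed with r8 = 64*16 to flip between the two halves of the VRF.)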
++ # Allows us to prefetch the next block of coefficients for efficiency. ++ mov r0,0 # This describes the location where we read our coefficients from ++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) ++ mov r7,16*16*2 # Total block size ++ mov r8,64*16 # Value used to swap from current to next VRF location ++ vldh HX(0++,0)+r0,(r1 += r3) REP 16 ++ mov r4,64 # Constant used for rounding first pass ++ mov r5,TRANS_RND2 # Constant used for rounding second pass ++ ++ # At start of block r0,r1 point to the current block (that has already been loaded) ++block_loop: ++ eor r0,r8 ++ add r1,r7 ++ # Prefetch the next block ++ vldh HX(0++,0)+r0,(r1 += r3) REP 16 ++ eor r0,r8 ++ sub r1,r7 ++ ++ # Transform the current block ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? ++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position ++ ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) ++ ++ # Save results - note there has been a transposition during the processing so we save columns ++ vsth VX(0,32++)+r0, (r1 += r3) REP 16 ++ ++ # Move onto next block ++ eor r0,r8 ++ add r1,r7 ++ ++ addcmpbgt r2,-1,0,block_loop ++ ++ # Now go and do any 32x32 transforms ++ b hevc_trans_32x32 ++ ++ pop r6-r15, pc ++ ++# r1,r2,r3 r7,r8 should be preserved ++# HX(0++,0)+r0 is the block to be transformed ++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients ++# Use HY(48,0) for intermediate results ++# r0 can be used, but should be returned to its original value at the end ++col_trans_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++col_trans_odd_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_odd_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_odd_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# ++hevc_trans_32x32: ++ mov r1,r14 # coeffs ++ mov r2,r15 # num ++ ++ # Fetch odd transform matrix ++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of 
coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) ++ # set r8 to 32byte aligned stack pointer ++ add r8,sp,31 ++ lsr r8,5 ++ lsl r8,5 ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++block_loop32: ++ ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++ ++ add sp,sp,32*32*2+32 # Restore stack ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 ,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++memclear16: ++ # r0 is address ++ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) ++ vmov HX(0++,0),0 REP 16 ++ mov r2,32 ++loop: ++ vsth HX(0++,0),(r0+=r2) REP 16 ++ add r0,16*16*2 ++ sub r1,16*16 ++ cmp r1,0 ++ bgt loop ++ b lr ++ ++ ++################################################################################ ++# HEVC VPU Deblock ++# ++# Vertical edges before horizontal ++# Decision 
can change every 4 pixels, but only 8 pixel boundaries are deblocked ++# ++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge. ++# The VPU code works in units of 16x16 blocks. ++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). ++# One final horizontal filter is required at the end. ++# PCM is not allowed in this code. ++# ++# ++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) ++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. ++ ++.set P0,63 ++.set P1,62 ++.set P2,61 ++.set P3,60 ++.set Q0,59 ++.set Q1,58 ++.set Q2,57 ++.set Q3,56 ++ ++.set dp,32 ++.set dq,33 ++.set d,34 ++.set decision,35 ++.set beta,36 ++.set beta2,37 ++.set beta3,38 ++.set ptest,39 ++.set qtest,40 ++.set pqtest,41 ++.set thresh,42 ++.set deltatest, 44 ++.set deltap1, 45 ++.set tc25, 46 ++.set setup,47 ++.set tc,48 ++.set tc25,49 ++.set tc2, 50 ++.set do_filter, 51 ++.set delta, 52 ++.set tc10, 53 ++.set delta0, 54 ++.set delta1, 55 ++.set zeros, 0 ++.set setup_input, 1 ++.set deltaq1, 2 ++ ++ ++ ++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. ++# Row has num16 16x16 blocks across ++# Beta goes from 0 to 64 ++# tc goes from 0 to 24 ++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] ++# has 8 bytes per edge ++# has 16 bytes per direction ++# has 32 bytes per 16x16 block ++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) ++hevc_deblock_16x16: ++ push r6-r15, lr ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++ ++process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl vert_filter ++ sub r3,8 ++ b start_deblock_loop ++deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels 
for the previous block ++skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) ++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt start_again ++ pop r6-r15, pc ++start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++vert_filter: ++ push lr ++ ++ vmov HX(P3,0), V(16,12)+r3 ++ vmov HX(P2,0), V(16,13)+r3 ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ vmov HX(Q2,0), V(16,18)+r3 ++ vmov HX(Q3,0), V(16,19)+r3 ++ ++ bl do_luma_filter ++ ++ vadds V(16,13)+r3, HX(P2,0), 0 ++ vadds V(16,14)+r3, HX(P1,0), 0 ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ vadds V(16,17)+r3, HX(Q1,0), 0 ++ vadds V(16,18)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++horz_filter: ++ push lr ++ ++ vmov HX(P3,0), H(12,0)+r3 ++ vmov HX(P2,0), H(13,0)+r3 ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ vmov HX(Q2,0), H(18,0)+r3 ++ vmov HX(Q3,0), H(19,0)+r3 ++ ++ bl do_luma_filter ++ ++ vadds H(13,0)+r3, HX(P2,0), 0 ++ vadds H(14,0)+r3, HX(P1,0), 0 ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ vadds H(17,0)+r3, HX(Q1,0), 0 ++ vadds H(18,0)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_luma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 ++ valtl HX(beta,0),H(setup,0),H(setup,0) ++ valtu HX(tc,0),H(setup,0),H(setup,0) ++ vmul HX(tc25,0), HX(tc,0), 5 ++ vadd HX(tc25,0),HX(tc25,0), 1 ++ vasr HX(tc25,0), HX(tc25,0), 1 ++ ++ # Compute decision ++ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 ++ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 ++ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 ++ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 ++ ++ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 ++ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 ++ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 ++ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 ++ ++ vadd HX(d,0), HX(dp,0), HX(dq,0) ++ vasr HX(beta2,0),HX(beta,0),2 ++ vasr HX(beta3,0),HX(beta,0),3 ++ ++ # Compute flags that are negative if all conditions pass ++ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC ++ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC ++ vsub 
HX(decision,0), HX(decision,0), HX(beta3,0) SETF ++ ++ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN ++ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF ++ vadd HX(decision,0), HX(d,0), HX(d,0) IFN ++ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF ++ vmov HX(decision,0), 1 IFNN ++ vadd H(decision,0),H(decision,3),0 IFN ++ vadd H(decision,16),H(decision,19),0 IFN ++ vmov -,HX(decision,0) SETF # N marks strong filter ++ vmov HX(decision,0), 1 IFNN # NN marks normal filter ++ ++ vadd HX(do_filter,0), HX(d,3), HX(d,0) ++ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter ++ vmov HX(decision,0),0 IFNN # Z marks no filter ++ ++ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 ++ # First extract out even terms ++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 ++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 ++ # Now expand back ++ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 ++ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 ++ ++ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering ++ ++ # Do a quick check to see if there is anything to do ++ mov r11, 0 # Signal no filtering ++ vmov -,1 IFNZ SUMS r5 ++ cmp r5,0 ++ beq filtering_done ++ mov r11, 1 # Signal some filtering ++ # And whether there is any strong filtering ++ vmov -,1 IFN SUMS r5 ++ cmp r5,0 ++ beq normal_filtering ++ ++ ############################################################################## ++ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) ++ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 ++ ++ # Take a copy of the original pixels for use in decision calculation ++ vmov HX(P0,32),HX(P0,0) ++ vmov HX(Q0,32),HX(Q0,0) ++ vmov HX(P1,32),HX(P1,0) ++ vmov HX(Q1,32),HX(Q1,0) ++ vmov HX(P2,32),HX(P2,0) ++ vmov HX(Q2,32),HX(Q2,0) ++ ++ vadd -,HX(P2,32),4 CLRA SACC ++ vshl -,HX(P1,32),1 SACC ++ vshl -,HX(P0,32),1 SACC ++ vshl -,HX(Q0,32),1 SACC ++ vshl HX(delta,0),HX(Q1,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(P0,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN ++ ++ vadd -,HX(P2,32),2 CLRA SACC ++ vadd -,HX(P1,32),HX(P0,32) SACC ++ vshl HX(delta,0),HX(Q0,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 2 ++ vsub HX(delta,0),HX(delta,0),HX(P1,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN ++ ++ vadd -,HX(Q0,32),4 CLRA SACC ++ vadd -,HX(P1,32),HX(P0,32) SACC ++ vmul -,HX(P2,32),3 SACC ++ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(P2,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN ++ #vmov HX(P2,0),3 IFN ++ ++ # Now reverse all P/Qs ++ ++ vadd -,HX(Q2,32),4 CLRA SACC ++ vshl -,HX(Q1,32),1 SACC ++ vshl -,HX(Q0,32),1 SACC ++ vshl -,HX(P0,32),1 SACC ++ vshl HX(delta,0),HX(P1,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(Q0,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN ++ ++ vadd -,HX(Q2,32),2 CLRA SACC ++ vadd -,HX(Q1,32),HX(Q0,32) SACC ++ vshl HX(delta,0),HX(P0,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 2 ++ vsub HX(delta,0),HX(delta,0),HX(Q1,32) ++ vclamps HX(delta,0), HX(delta,0), 
HX(tc2,0) ++ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN ++ ++ vadd -,HX(P0,32),4 CLRA SACC ++ vadd -,HX(Q1,32),HX(Q0,32) SACC ++ vmul -,HX(Q2,32),3 SACC ++ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(Q2,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN ++ ++ ############################################################################## ++ # Normal filtering ++normal_filtering: ++ # Invert the decision flags ++ # make instruction more complicated as assembler has error and loses SETF ++ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering ++ vmov -, HX(tc10,0) SETF # IFN means normal filtering ++ ++ vmov -,1 IFN SUMS r5 ++ cmp r5,0 ++ beq filtering_done ++ ++ vasr HX(tc2,0), HX(tc,0), 1 ++ vmul HX(tc10,0), HX(tc,0), 10 ++ ++ vasr HX(thresh,0), HX(beta,0), 1 ++ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) ++ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC ++ ++ vadd HX(ptest,0),HX(dp,3),HX(dp,0) ++ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel ++ vadd HX(qtest,0),HX(dq,3),HX(dq,0) ++ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel ++ # Expand ptest and qtest together ++ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q ++ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ ++ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq ++ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) ++ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) ++ ++ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) ++ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) ++ vmov -,8 CLRA SACC ++ vmul -,HX(delta0,0), 9 SACC ++ vmul HX(delta0,0),HX(delta1,0), r6 SACC ++ vasr HX(delta0,0), HX(delta0,0), 4 ++ vdist HX(deltatest,0), HX(delta0,0), 0 ++ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something ++ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later ++ ++ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) ++ ++ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) ++ vadd HX(deltap1,0), HX(deltap1,0), 1 ++ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC ++ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC ++ vasr HX(deltap1,0), HX(deltap1,0), 1 ++ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) ++ ++ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) ++ vadd HX(deltaq1,0), HX(deltaq1,0), 1 ++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC ++ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) ++ vrsub -, HX(delta0,0), 0 SACC ++ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC ++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 ++ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) ++ ++ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN ++ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN ++ ++ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 ++ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN ++ ++ vmov -,HX(deltatest,0) SETF ++ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 ++ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN ++ ++ #vmov HX(P2,0),1 IFN ++ ++filtering_done: ++ b lr ++ ++ ++hevc_uv_deblock_16x16: ++ push r6-r15, lr ++ mov r14,0 ++ b hevc_uv_start ++hevc_uv_deblock_16x16_with_clear: ++ push r6-r15, lr ++ mov r14,1 ++ b hevc_uv_start ++ ++hevc_uv_start: ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is 
location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++# r14 is 1 if we should clear the old contents, or 0 if not ++ ++uv_process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ cmp r14,1 ++ bne uv_skip0 ++ vstb H(zeros,0),(r4) ++uv_skip0: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl uv_vert_filter ++ sub r3,8 ++ b uv_start_deblock_loop ++uv_deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ cmp r14,1 ++ bne uv_skip1 ++ vstb H(zeros,0),(r4) ++uv_skip1: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip3 ++ vstb H(zeros,0),-16(r4) ++uv_skip3: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,uv_skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++uv_start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) 
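++ # r0/r2 advance by one 16-pixel-wide block; r7 counts the blocks still to load in this row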
++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt uv_deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip2 ++ vstb H(zeros,0),-16(r4) ++uv_skip2: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,uv_skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt uv_start_again ++ pop r6-r15, pc ++uv_start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b uv_process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++uv_vert_filter: ++ push lr ++ ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++uv_horz_filter: ++ push lr ++ ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_chroma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 ++ valtl HX(tc,0),H(setup,0),H(setup,0) ++ ++ vsub HX(delta,0),HX(Q0,0),HX(P0,0) ++ vshl HX(delta,0),HX(delta,0),2 CLRA SACC ++ vsub -,HX(P1,0),HX(Q1,0) SACC ++ vmov HX(delta,0),4 SACC ++ vasr HX(delta,0),HX(delta,0),3 ++ vclamps HX(delta,0), HX(delta,0), HX(tc,0) ++ vadd HX(P0,0),HX(P0,0),HX(delta,0) ++ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) ++ b lr ++ ++# r0 = list ++# r1 = number ++hevc_run_command_list: ++ push r6-r7, lr ++ mov r6, r0 ++ mov r7, r1 ++loop_cmds: ++ ld r0,(r6) # How to encode r6++? 
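++ # The remaining five command words are loaded into r1-r5 the same way before calling hevc_trans_16x16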
++ add r6,4 ++ ld r1,(r6) ++ add r6,4 ++ ld r2,(r6) ++ add r6,4 ++ ld r3,(r6) ++ add r6,4 ++ ld r4,(r6) ++ add r6,4 ++ ld r5,(r6) ++ add r6,4 ++ bl hevc_trans_16x16 ++ sub r7,1 ++ cmp r7,0 ++ bgt loop_cmds ++ ++ pop r6-r7, pc +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..b0e9902d82 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h @@ -0,0 +1,3070 @@ -+unsigned char rpi_hevc_transform [] = { ++static const unsigned char rpi_hevc_transform10 [] = { ++21, ++106, ++0, ++144, ++47, ++1, ++37, ++106, ++0, ++144, ++66, ++1, ++53, ++106, ++0, ++144, ++192, ++4, ++69, ++106, ++0, ++144, ++192, ++4, ++85, ++106, ++0, ++144, ++220, ++5, ++169, ++3, ++62, ++64, ++79, ++64, ++3, ++232, ++32, ++0, ++0, ++0, ++12, ++248, ++0, ++136, ++0, ++0, ++192, ++248, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++12, ++248, ++0, ++168, ++0, ++0, ++192, ++248, ++0, ++0, ++0, ++96, ++3, ++232, ++32, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++8, ++232, ++0, ++4, ++0, ++0, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++4, ++232, ++64, ++0, ++0, ++0, ++5, ++232, ++0, ++2, ++0, ++0, ++128, ++69, ++113, ++66, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++128, ++69, ++113, ++70, ++128, ++144, ++40, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++16, ++0, ++76, ++254, ++48, ++192, ++9, ++4, ++32, ++8, ++0, ++0, ++4, ++254, ++0, ++144, ++128, ++2, ++0, ++8, ++2, ++0, ++128, ++144, ++23, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++20, ++0, ++76, ++254, ++48, ++192, ++6, ++4, ++32, ++8, ++0, ++0, ++140, ++248, ++44, ++0, ++0, ++0, ++32, ++48, ++4, ++0, ++128, ++69, ++113, ++66, ++242, ++140, ++211, ++192, ++34, ++31, ++41, ++3, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++96, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++224, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++225, ++64, ++242, ++64, ++3, ++232, ++128, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++57, ++239, ++224, ++247, ++255, ++255, ++72, ++192, ++95, ++207, ++88, ++122, ++88, ++124, ++137, ++64, ++26, ++64, ++4, ++232, ++64, ++0, ++0, ++0, ++149, ++96, ++161, ++64, ++152, ++64, ++128, ++144, ++35, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++27, ++0, ++4, ++232, ++0, ++2, ++0, ++0, ++101, ++96, ++145, ++64, ++168, ++64, ++128, ++144, ++19, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++11, ++0, ++74, ++232, ++0, ++8, ++0, ++0, ++242, ++140, ++221, ++192, ++57, ++239, ++32, ++8, ++0, ++0, ++41, ++3, ++239, ++3, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++248, ++4, ++0, ++12, ++248, ++0, ++132, ++64, ++0, ++192, ++248, ++4, ++0, ++0, ++96, ++255, ++159, ++154, ++255, ++0, ++232, ++0, ++4, ++0, ++0, ++255, ++159, ++165, ++255, ++4, ++255, ++48, ++204, ++16, ++3, ++224, ++251, ++62, ++0, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++128, ++64, ++6, ++232, ++64, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++32, ++247, ++240, ++207, ++16, ++3, ++32, ++247, ++176, ++207, ++17, ++19, ++32, ++247, ++112, ++207, ++18, ++35, ++32, ++247, ++48, ++207, ++19, ++51, ++32, ++247, ++240, ++206, ++20, ++67, ++32, 
++247, ++176, ++206, ++21, ++83, ++32, ++247, ++112, ++206, ++22, ++99, ++32, ++247, ++48, ++206, ++23, ++115, ++32, ++247, ++240, ++205, ++24, ++131, ++32, ++247, ++176, ++205, ++25, ++147, ++32, ++247, ++112, ++205, ++26, ++163, ++32, ++247, ++48, ++205, ++27, ++179, ++32, ++247, ++240, ++204, ++28, ++195, ++32, ++247, ++176, ++204, ++29, ++211, ++32, ++247, ++112, ++204, ++30, ++227, ++32, ++247, ++48, ++204, ++31, ++243, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++0, ++237, ++32, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++111, ++3, ++4, ++254, ++0, ++128, ++0, ++4, ++0, ++248, ++0, ++0, ++2, ++232, ++32, ++0, ++0, ++0, ++140, ++248, ++32, ++0, ++0, ++0, ++224, ++35, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++193, ++232, ++0, ++1, ++0, ++0, ++1, ++106, ++116, ++30, ++90, ++0, ++169, ++3, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++137, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++129, ++0, ++131, ++102, ++0, ++158, ++67, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++108, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++100, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++161, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++182, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++112, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++101, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++103, ++255, ++239, ++3, ++0, ++254, ++0, ++143, ++92, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++93, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++210, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++211, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++107, ++0, ++8, ++255, ++99, ++23, ++0, ++212, ++192, ++51, ++0, ++0, 
++8, ++255, ++163, ++23, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++52, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++52, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++0, ++143, ++12, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++13, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++18, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++19, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++33, ++0, ++8, ++255, ++99, ++3, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++3, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++4, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++4, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++137, ++47, ++240, ++40, ++246, ++2, ++140, ++47, ++240, ++128, ++245, ++99, ++140, ++5, ++4, ++0, ++247, ++99, ++140, ++1, ++20, ++88, ++246, ++99, ++140, ++1, ++20, ++0, ++247, ++35, ++136, ++62, ++226, ++32, ++247, ++35, ++136, ++32, ++210, ++0, ++247, ++34, ++136, ++63, ++2, ++208, ++246, ++34, ++136, ++0, ++4, ++0, ++247, ++99, ++136, ++58, ++162, ++32, ++247, ++99, ++136, ++33, ++146, ++0, ++247, ++98, ++136, ++59, ++18, ++208, ++246, ++98, ++136, ++0, ++20, ++0, ++247, ++162, ++136, ++33, ++2, ++88, ++246, ++98, ++137, ++2, ++68, ++88, ++246, ++162, ++137, ++3, ++68, ++208, ++254, ++227, ++136, ++60, ++242, ++192, ++243, ++188, ++11, ++208, ++254, ++227, ++136, ++56, ++178, ++192, ++243, ++188, ++10, ++32, ++255, ++226, ++136, ++38, ++58, ++192, ++243, ++60, ++0, ++208, ++254, ++227, ++136, ++59, ++242, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++49, ++58, ++192, ++243, ++60, ++128, ++0, ++255, ++226, ++136, ++34, ++34, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++37, ++58, ++192, ++243, ++60, ++128, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++194, ++8, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++255, ++202, ++40, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++254, ++0, ++240, ++35, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++226, ++140, ++34, ++34, ++195, ++243, ++60, ++0, ++32, ++255, ++227, ++140, ++36, ++58, ++192, ++243, ++60, ++0, ++0, ++254, ++192, ++136, ++0, ++4, ++0, ++240, ++0, ++160, ++16, ++246, ++226, ++136, ++35, ++50, ++16, ++246, ++226, ++136, ++35, ++50, ++32, ++246, ++226, ++136, ++35, ++50, ++32, ++254, ++226, ++136, ++35, ++58, ++192, ++243, ++60, ++0, ++11, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++115, ++5, ++106, ++0, ++144, ++173, ++1, ++27, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++227, ++0, ++64, ++246, ++163, ++140, ++1, ++4, ++0, ++246, ++192, ++175, ++63, ++2, ++0, ++246, ++192, ++174, ++59, ++2, ++0, ++246, ++128, ++175, ++62, ++2, ++0, ++246, ++128, ++174, ++58, ++2, ++0, ++246, ++64, ++175, ++61, ++2, ++0, ++246, ++64, ++174, ++57, ++2, ++0, ++255, ++43, ++240, ++4, ++212, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++228, ++192, ++243, ++128, 
++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++191, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++143, ++52, ++242, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++212, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++180, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++190, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++143, ++52, ++226, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++180, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++212, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++196, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++189, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++143, ++52, ++210, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++148, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++164, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++228, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++187, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++142, ++52, ++178, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++148, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++244, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++186, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++142, ++52, ++162, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++244, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++148, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++132, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++185, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++142, ++52, ++146, ++192, ++243, ++60, ++128, ++64, ++255, ++98, ++141, ++0, ++52, ++192, ++243, ++0, ++0, ++0, ++254, ++0, ++240, ++53, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++177, ++0, ++88, ++246, ++163, ++140, ++1, ++4, ++128, ++245, ++99, ++141, ++10, ++4, ++88, ++246, ++162, ++138, ++1, ++68, ++0, ++247, ++162, ++138, ++36, ++162, ++88, ++254, ++162, ++138, ++3, ++164, ++192, ++243, ++128, ++11, ++0, ++255, ++226, ++137, ++32, ++2, ++195, ++243, ++60, ++0, ++32, ++247, ++226, ++137, ++42, ++114, ++0, ++255, ++34, ++138, ++33, ++18, ++195, ++243, ++60, ++0, ++32, ++247, ++34, ++138, ++42, ++130, ++16, ++246, ++98, ++138, ++40, ++114, ++16, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++226, ++137, ++41, ++146, ++40, ++246, ++34, ++138, ++41, ++146, ++32, ++247, ++163, ++141, ++63, ++178, ++32, ++247, ++227, ++141, ++62, ++162, ++0, ++254, ++0, ++240, ++8, ++4, ++0, ++240, ++128, ++11, ++128, ++253, ++35, ++240, ++9, ++100, ++192, ++243, ++128, 
++10, ++128, ++253, ++163, ++141, ++128, ++115, ++192, ++243, ++152, ++10, ++88, ++246, ++163, ++141, ++4, ++100, ++208, ++246, ++35, ++139, ++0, ++100, ++32, ++255, ++34, ++139, ++53, ++202, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++139, ++0, ++4, ++0, ++240, ++0, ++160, ++240, ++246, ++163, ++141, ++48, ++98, ++0, ++247, ++99, ++139, ++63, ++210, ++0, ++247, ++98, ++139, ++1, ++212, ++88, ++254, ++98, ++139, ++1, ++212, ++192, ++243, ++128, ++11, ++32, ++255, ++99, ++139, ++62, ++98, ++192, ++243, ++188, ++10, ++88, ++246, ++98, ++139, ++1, ++212, ++240, ++246, ++98, ++139, ++50, ++210, ++0, ++247, ++163, ++128, ++59, ++146, ++0, ++247, ++160, ++128, ++1, ++36, ++88, ++254, ++160, ++128, ++1, ++36, ++192, ++243, ++128, ++11, ++0, ++247, ++163, ++128, ++58, ++98, ++64, ++255, ++35, ++240, ++0, ++100, ++192, ++243, ++128, ++10, ++64, ++255, ++163, ++128, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++160, ++128, ++1, ++36, ++240, ++246, ++160, ++128, ++50, ++34, ++8, ++255, ++227, ++143, ++54, ++242, ++192, ++243, ++60, ++128, ++40, ++255, ++227, ++142, ++54, ++178, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++39, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++143, ++45, ++226, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++44, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++40, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++142, ++2, ++162, ++192, ++243, ++60, ++128, ++90, ++0, ++169, ++3, ++14, ++96, ++4, ++31, ++169, ++3, ++30, ++96, ++1, ++31, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++143, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++135, ++0, ++131, ++102, ++0, ++158, ++71, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++112, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++104, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++123, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++112, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++178, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++72, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++61, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, 
++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++95, ++255, ++239, ++3, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++47, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++13, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++140, ++47, ++240, ++32, ++247, ++35, ++141, ++63, ++178, ++64, ++254, ++35, ++141, ++2, ++68, ++192, ++243, ++128, ++11, ++32, ++255, ++35, ++240, ++58, ++226, ++192, ++243, ++188, ++10, ++0, ++254, ++0, ++141, ++4, ++4, ++0, ++240, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++240, ++246, ++35, ++141, ++48, ++66, ++0, ++247, ++227, ++143, ++52, ++242, ++32, ++247, ++227, ++142, ++52, ++178, ++90, ++0, ++161, ++3, ++6, ++64, ++23, ++64, ++96, ++8, ++70, ++98, ++97, ++8, ++70, ++98, ++98, ++8, ++70, ++98, ++99, ++8, ++70, ++98, ++100, ++8, ++70, ++98, ++101, ++8, ++70, ++98, ++255, ++159, ++8, ++250, ++23, ++102, ++7, ++106, ++112, ++30, ++33, ++3, ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..2901b6568d +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,3070 @@ ++static const unsigned char rpi_hevc_transform8 [] = { +21, +106, +0, @@ -13487,932 +22053,9 @@ index 0000000..4309f1c +33, +3, +}; -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000..5543093 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s -@@ -0,0 +1,917 @@ -+# ****************************************************************************** -+# Argon Design Ltd. -+# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
-+# -+# Module : HEVC -+# Author : Peter de Rivaz -+# ****************************************************************************** -+ -+# HEVC VPU Transform -+# -+# Transform matrix can be thought of as -+# output row vector = input row vector * transMatrix2 -+# -+# The even rows of the matrix are symmetric -+# The odd rows of the matrix are antisymmetric -+# -+# So only need to compute the first half of the results, then can compute the remainder with a butterfly -+# -+# EXAMPLE -+# (a b c d) (1 2 2 1) -+# (3 4 -4 -3) -+# (5 6 6 5) -+# (7 8 -8 -7) -+# -+# x=(a c)(1 2) = 1a+5c 2a+6c -+# (5 6) -+# -+# y=(b d)(3 4) = 3b+7d 4b+8d -+# (7 8) -+# -+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d -+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d -+# -+# Final results are (u , v[::-1]) -+# -+# -+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) -+# Apply the even matrix first and stop before rounding -+# Then apply the odd matrix in a full manner: -+# -+# First step is to compute partial products with the first input (16 cycles) -+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output -+# 2a 4b 6c 8d -+# 2a -4b 6c -8d -+# 1a -3b 5c -7d -+# -+# Second step is to sum partial products into final position (8 cycles) -+# 1a+3b+5c+7d -+# 2a+4b+6c+8d -+# 2a-4b+6c-8d -+# 1a-3b+5c-7d -+# -+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) -+# -+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) -+# -+# For 8x8 we could compute two in parallel. -+# -+# -+ -+# Columns are transformed first -+# -+# Store top left half of transMatrix2 in -+# Store bottom left half of transMatrix2 in HX(32,32) -+# -+# For 16x16 -+# HX(0:15,0) contains input data before transform -+# HY(0:15,0) contains 32bit output data after transform -+# HX(32,0) contains even rows of left half of transMatrix2 -+# HX(32,32) contains odd rows of left half of transMatrix2 -+# HY(48,0) contains partial products ready for summing -+# -+ -+ -+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# coeffs32 -+# num32: number of 32x32 transforms -+# command 0 for transform, 1 for memclear16(int16_t *dst,num16) -+# -+hevc_trans_16x16: -+ cmp r5,1 -+ beq memclear16 -+ cmp r5,2 -+ beq hevc_deblock_16x16 -+ cmp r5,3 -+ beq hevc_uv_deblock_16x16 -+ cmp r5,4 -+ beq hevc_uv_deblock_16x16_with_clear -+ cmp r5,5 -+ beq hevc_run_command_list -+ -+ push r6-r15, lr # TODO cut down number of used registers -+ mov r14,r3 # coeffs32 -+ mov r15,r4 # num32 -+ mov r3, 16*2 # Stride of transMatrix2 in bytes -+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix -+ -+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix -+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ # Now use r0 to describe which matrix we are working on. -+ # Allows us to prefetch the next block of coefficients for efficiency. 
-+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,1<<11 # Constant used for rounding second pass -+ -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+block_loop: -+ eor r0,r8 -+ add r1,r7 -+ # Prefetch the next block -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ eor r0,r8 -+ sub r1,r7 -+ -+ # Transform the current block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? -+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+ -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# -+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add 
r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) -+ # set r8 to 32byte aligned stack pointer -+ add r8,sp,31 -+ lsr r8,5 -+ lsl r8,5 -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+block_loop32: -+ -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, 1<<11 # Constant used for rounding second pass -+ mov r5, 4 # left shift used for rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+ -+ add sp,sp,32*32*2+32 # Restore stack -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 ,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+memclear16: -+ # r0 is address -+ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) -+ vmov HX(0++,0),0 REP 16 -+ mov r2,32 -+loop: -+ vsth HX(0++,0),(r0+=r2) REP 16 -+ add r0,16*16*2 -+ sub r1,16*16 -+ cmp r1,0 -+ bgt loop -+ b lr -+ -+ -+################################################################################ -+# HEVC VPU Deblock -+# -+# Vertical edges before horizontal -+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked -+# -+# ARM is responsible for 
storing beta and tc for each 4 pixels horiz and vert edge. -+# The VPU code works in units of 16x16 blocks. -+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). -+# One final horizontal filter is required at the end. -+# PCM is not allowed in this code. -+# -+# -+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) -+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. -+ -+.set P0,63 -+.set P1,62 -+.set P2,61 -+.set P3,60 -+.set Q0,59 -+.set Q1,58 -+.set Q2,57 -+.set Q3,56 -+ -+.set dp,32 -+.set dq,33 -+.set d,34 -+.set decision,35 -+.set beta,36 -+.set beta2,37 -+.set beta3,38 -+.set ptest,39 -+.set qtest,40 -+.set pqtest,41 -+.set thresh,42 -+.set deltatest, 44 -+.set deltap1, 45 -+.set tc25, 46 -+.set setup,47 -+.set tc,48 -+.set tc25,49 -+.set tc2, 50 -+.set do_filter, 51 -+.set delta, 52 -+.set tc10, 53 -+.set delta0, 54 -+.set delta1, 55 -+.set zeros, 0 -+.set setup_input, 1 -+.set deltaq1, 2 -+ -+ -+ -+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. -+# Row has num16 16x16 blocks across -+# Beta goes from 0 to 64 -+# tc goes from 0 to 24 -+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] -+# has 8 bytes per edge -+# has 16 bytes per direction -+# has 32 bytes per 16x16 block -+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) -+hevc_deblock_16x16: -+ push r6-r15, lr -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+ -+process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl vert_filter -+ sub r3,8 -+ b start_deblock_loop -+deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 
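For reference, a small C sketch of the setup[] parameter table laid out exactly as the comments above describe (4 bytes per parameter row, 8 per edge, 16 per direction, 32 per 16x16 block). The typedef and helper names are illustrative assumptions, not code from the patch.

#include <stdint.h>
#include <string.h>

/* setup[block_idx][0=vert,1=horz][0=first edge,1=second edge][0=beta,1=tc][0..3=edge number] */
typedef uint8_t deblock_setup_t[2][2][2][4];

/* Hypothetical ARM-side helper: record beta/tc for one 4-pixel segment of one edge */
static void set_edge_params(deblock_setup_t *setup, unsigned block_idx,
                            unsigned horz, unsigned second_edge, unsigned seg,
                            uint8_t beta, uint8_t tc)
{
    setup[block_idx][horz][second_edge][0][seg] = beta;
    setup[block_idx][horz][second_edge][1][seg] = tc;
}

int main(void)
{
    deblock_setup_t setup[2];            /* two 16x16 blocks => 64 bytes */
    memset(setup, 0, sizeof(setup));     /* start from a cleared table */
    set_edge_params(setup, 0, 0, 0, 2, 48, 10);
    return sizeof(deblock_setup_t) == 32 ? 0 : 1;  /* 32 bytes per block, as quoted above */
}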
-+start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) -+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt start_again -+ pop r6-r15, pc -+start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+vert_filter: -+ push lr -+ -+ vmov HX(P3,0), V(16,12)+r3 -+ vmov HX(P2,0), V(16,13)+r3 -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ vmov HX(Q2,0), V(16,18)+r3 -+ vmov HX(Q3,0), V(16,19)+r3 -+ -+ bl do_luma_filter -+ -+ vadds V(16,13)+r3, HX(P2,0), 0 -+ vadds V(16,14)+r3, HX(P1,0), 0 -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ vadds V(16,17)+r3, HX(Q1,0), 0 -+ vadds V(16,18)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+horz_filter: -+ push lr -+ -+ vmov HX(P3,0), H(12,0)+r3 -+ vmov HX(P2,0), H(13,0)+r3 -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ vmov HX(Q2,0), H(18,0)+r3 -+ vmov HX(Q3,0), H(19,0)+r3 -+ -+ bl do_luma_filter -+ -+ vadds H(13,0)+r3, HX(P2,0), 0 -+ vadds H(14,0)+r3, HX(P1,0), 0 -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ vadds H(17,0)+r3, HX(Q1,0), 0 -+ vadds H(18,0)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_luma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 -+ valtl HX(beta,0),H(setup,0),H(setup,0) -+ valtu HX(tc,0),H(setup,0),H(setup,0) -+ vmul HX(tc25,0), HX(tc,0), 5 -+ vadd HX(tc25,0),HX(tc25,0), 1 -+ vasr HX(tc25,0), HX(tc25,0), 1 -+ -+ # Compute decision -+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 -+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 -+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 -+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 -+ -+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 -+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 -+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 -+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 -+ -+ vadd HX(d,0), HX(dp,0), HX(dq,0) -+ vasr HX(beta2,0),HX(beta,0),2 -+ vasr HX(beta3,0),HX(beta,0),3 -+ -+ # Compute flags that are negative if all conditions pass -+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC -+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC -+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF -+ -+ vdist HX(decision,0), HX(P0,0), 
HX(Q0,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF -+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF -+ vmov HX(decision,0), 1 IFNN -+ vadd H(decision,0),H(decision,3),0 IFN -+ vadd H(decision,16),H(decision,19),0 IFN -+ vmov -,HX(decision,0) SETF # N marks strong filter -+ vmov HX(decision,0), 1 IFNN # NN marks normal filter -+ -+ vadd HX(do_filter,0), HX(d,3), HX(d,0) -+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter -+ vmov HX(decision,0),0 IFNN # Z marks no filter -+ -+ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 -+ # First extract out even terms -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 -+ # Now expand back -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 -+ -+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering -+ -+ # Do a quick check to see if there is anything to do -+ mov r11, 0 # Signal no filtering -+ vmov -,1 IFNZ SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ mov r11, 1 # Signal some filtering -+ # And whether there is any strong filtering -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq normal_filtering -+ -+ ############################################################################## -+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) -+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 -+ -+ # Take a copy of the original pixels for use in decision calculation -+ vmov HX(P0,32),HX(P0,0) -+ vmov HX(Q0,32),HX(Q0,0) -+ vmov HX(P1,32),HX(P1,0) -+ vmov HX(Q1,32),HX(Q1,0) -+ vmov HX(P2,32),HX(P2,0) -+ vmov HX(Q2,32),HX(Q2,0) -+ -+ vadd -,HX(P2,32),4 CLRA SACC -+ vshl -,HX(P1,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl HX(delta,0),HX(Q1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN -+ -+ vadd -,HX(P2,32),2 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vshl HX(delta,0),HX(Q0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(P1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q0,32),4 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vmul -,HX(P2,32),3 SACC -+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN -+ #vmov HX(P2,0),3 IFN -+ -+ # Now reverse all P/Qs -+ -+ vadd -,HX(Q2,32),4 CLRA SACC -+ vshl -,HX(Q1,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl HX(delta,0),HX(P1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q2,32),2 CLRA SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vshl HX(delta,0),HX(P0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(Q1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN -+ -+ vadd -,HX(P0,32),4 CLRA 
SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vmul -,HX(Q2,32),3 SACC -+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN -+ -+ ############################################################################## -+ # Normal filtering -+normal_filtering: -+ # Invert the decision flags -+ # make instruction more complicated as assembler has error and loses SETF -+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering -+ vmov -, HX(tc10,0) SETF # IFN means normal filtering -+ -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ -+ vasr HX(tc2,0), HX(tc,0), 1 -+ vmul HX(tc10,0), HX(tc,0), 10 -+ -+ vasr HX(thresh,0), HX(beta,0), 1 -+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) -+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC -+ -+ vadd HX(ptest,0),HX(dp,3),HX(dp,0) -+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel -+ vadd HX(qtest,0),HX(dq,3),HX(dq,0) -+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel -+ # Expand ptest and qtest together -+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q -+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ -+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq -+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) -+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) -+ -+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) -+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) -+ vmov -,8 CLRA SACC -+ vmul -,HX(delta0,0), 9 SACC -+ vmul HX(delta0,0),HX(delta1,0), r6 SACC -+ vasr HX(delta0,0), HX(delta0,0), 4 -+ vdist HX(deltatest,0), HX(delta0,0), 0 -+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something -+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later -+ -+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) -+ -+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) -+ vadd HX(deltap1,0), HX(deltap1,0), 1 -+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC -+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC -+ vasr HX(deltap1,0), HX(deltap1,0), 1 -+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) -+ -+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) -+ vadd HX(deltaq1,0), HX(deltaq1,0), 1 -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC -+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) -+ vrsub -, HX(delta0,0), 0 SACC -+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 -+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) -+ -+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN -+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN -+ -+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 -+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN -+ -+ vmov -,HX(deltatest,0) SETF -+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 -+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN -+ -+ #vmov HX(P2,0),1 IFN -+ -+filtering_done: -+ b lr -+ -+ -+hevc_uv_deblock_16x16: -+ push r6-r15, lr -+ mov r14,0 -+ b hevc_uv_start -+hevc_uv_deblock_16x16_with_clear: -+ push r6-r15, lr -+ mov r14,1 -+ b hevc_uv_start -+ -+hevc_uv_start: -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current 
block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+# r14 is 1 if we should clear the old contents, or 0 if not -+ -+uv_process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ cmp r14,1 -+ bne uv_skip0 -+ vstb H(zeros,0),(r4) -+uv_skip0: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl uv_vert_filter -+ sub r3,8 -+ b uv_start_deblock_loop -+uv_deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ cmp r14,1 -+ bne uv_skip1 -+ vstb H(zeros,0),(r4) -+uv_skip1: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip3 -+ vstb H(zeros,0),-16(r4) -+uv_skip3: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,uv_skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+uv_start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) 
-+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt uv_deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip2 -+ vstb H(zeros,0),-16(r4) -+uv_skip2: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,uv_skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt uv_start_again -+ pop r6-r15, pc -+uv_start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b uv_process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+uv_vert_filter: -+ push lr -+ -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+uv_horz_filter: -+ push lr -+ -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_chroma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 -+ valtl HX(tc,0),H(setup,0),H(setup,0) -+ -+ vsub HX(delta,0),HX(Q0,0),HX(P0,0) -+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC -+ vsub -,HX(P1,0),HX(Q1,0) SACC -+ vmov HX(delta,0),4 SACC -+ vasr HX(delta,0),HX(delta,0),3 -+ vclamps HX(delta,0), HX(delta,0), HX(tc,0) -+ vadd HX(P0,0),HX(P0,0),HX(delta,0) -+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) -+ b lr -+ -+# r0 = list -+# r1 = number -+hevc_run_command_list: -+ push r6-r7, lr -+ mov r6, r0 -+ mov r7, r1 -+loop_cmds: -+ ld r0,(r6) # How to encode r6++? 
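The per-sample arithmetic performed by do_chroma_filter above (a delta built from the four pixels either side of the edge, clamped to +/-tc, then applied with a saturating write-back) corresponds to this scalar C sketch, written out purely for illustration.

#include <stdint.h>

static int clip3(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }

/* p1 p0 | q0 q1 are the pixels across the edge, tc is the clamp for this segment */
static void chroma_filter_sample(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
{
    int delta = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;
    delta = clip3(delta, -tc, tc);
    *p0 = (uint8_t)clip3(*p0 + delta, 0, 255);  /* saturating store, as the vadds ...,0 write-back does */
    *q0 = (uint8_t)clip3(*q0 - delta, 0, 255);
}

int main(void)
{
    uint8_t p0 = 100, q0 = 140;
    chroma_filter_sample(&p0, &q0, 98, 142, 4);   /* delta clamps to +4 */
    return (p0 == 104 && q0 == 136) ? 0 : 1;
}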
-+ add r6,4 -+ ld r1,(r6) -+ add r6,4 -+ ld r2,(r6) -+ add r6,4 -+ ld r3,(r6) -+ add r6,4 -+ ld r4,(r6) -+ add r6,4 -+ ld r5,(r6) -+ add r6,4 -+ bl hevc_trans_16x16 -+ sub r7,1 -+ cmp r7,0 -+ bgt loop_cmds -+ -+ pop r6-r7, pc diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..0255f5d +index 0000000000..0255f5dd44 --- /dev/null +++ b/libavcodec/rpi_mailbox.c @@ -0,0 +1,149 @@ @@ -14567,7 +22210,7 @@ index 0000000..0255f5d + diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..b316878 +index 0000000000..b3168788d2 --- /dev/null +++ b/libavcodec/rpi_mailbox.h @@ -0,0 +1,58 @@ @@ -14629,12 +22272,64 @@ index 0000000..b316878 +int mbox_get_image_params(int fd, VC_IMAGE_T * img); + +#endif +diff --git a/libavcodec/rpi_opts.h b/libavcodec/rpi_opts.h +new file mode 100644 +index 0000000000..e6127749ea +--- /dev/null ++++ b/libavcodec/rpi_opts.h +@@ -0,0 +1,46 @@ ++#ifndef AVCODEC_RPI_OPTS_H ++#define AVCODEC_RPI_OPTS_H ++ ++// define RPI to split the CABAC/prediction/transform into separate stages ++#ifndef RPI ++ ++ #define RPI_INTER 0 ++ #define RPI_TSTATS 0 ++ #define RPI_HEVC_SAND 0 ++ ++#else ++ #include "config.h" ++ ++ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU ++ ++ // By passing jobs to a worker thread we hope to be able to catch up during slow frames ++ // This has no effect unless RPI_WORKER is defined ++ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as ++ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one ++ // free for the foreground to fill in. ++ #define RPI_MAX_JOBS 2 ++ ++ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs ++ // As it stands there is something mildy broken in VPU deblock - looks mostly OK ++ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) ++ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM ++// #define RPI_DEBLOCK_VPU ++ ++ #define RPI_VPU_DEBLOCK_CACHED 1 ++ ++ #if HAVE_NEON ++ #define RPI_HEVC_SAND 1 ++ #else ++ // Sand bust on Pi1 currently - reasons unknown ++ #define RPI_HEVC_SAND 0 ++ #endif ++ ++ ++ #define RPI_QPU_EMU_Y 0 ++ #define RPI_QPU_EMU_C 0 ++ ++ #define RPI_TSTATS 0 ++#endif ++ ++#endif ++ diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..7c0eedd +index 0000000000..e872b855b7 --- /dev/null +++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,902 @@ +@@ -0,0 +1,935 @@ +#ifdef RPI +#include +#include @@ -14653,8 +22348,9 @@ index 0000000..7c0eedd +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" -+#include "rpi_hevc_transform.h" -+#include "rpi_zc.h" ++#include "rpi_hevc_transform8.h" ++#include "rpi_hevc_transform10.h" ++#include "libavutil/rpi_sand_fns.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files @@ -14678,26 +22374,13 @@ index 0000000..7c0eedd +#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling +#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + -+// On Pi2 there is no way to access the VPU L2 cache -+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache) -+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly -+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug. 
-+#define GPU_MEM_FLG 0x4 -+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache) -+#define GPU_MEM_MAP 0x0 -+ +#define vcos_verify_ge0(x) ((x)>=0) + -+/*static const unsigned code[] = -+{ -+ #include "rpi_shader.hex" -+};*/ -+ +// Size in 32bit words -+#define QPU_CODE_SIZE 2048 ++#define QPU_CODE_SIZE 4098 +#define VPU_CODE_SIZE 2048 + -+const short rpi_transMatrix2even[32][16] = { // Even rows first ++static const short rpi_transMatrix2even[32][16] = { // Even rows first +{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, +{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, +{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, @@ -14737,7 +22420,8 @@ index 0000000..7c0eedd +struct GPU +{ + unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code[VPU_CODE_SIZE]; ++ unsigned int vpu_code8[VPU_CODE_SIZE]; ++ unsigned int vpu_code10[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; +}; + @@ -14749,8 +22433,9 @@ index 0000000..7c0eedd +#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) + +struct rpi_cache_flush_env_s { -+ unsigned int n; -+ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++// unsigned int n; ++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++ struct vcsm_user_clean_invalid2_s v; +}; + +#define WAIT_COUNT_MAX 16 @@ -14774,7 +22459,6 @@ index 0000000..7c0eedd +typedef struct vq_wait_s +{ + sem_t sem; -+ unsigned int cost; + struct vq_wait_s * next; +} vq_wait_t; + @@ -14793,7 +22477,7 @@ index 0000000..7c0eedd + int open_count; + int init_count; + int mb; -+ unsigned int current_load; ++ int vpu_i_cache_flushed; + GPU_MEM_PTR_T code_gm_ptr; + vq_wait_pool_t wait_pool; +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -14866,8 +22550,8 @@ index 0000000..7c0eedd + +// GPU_MEM_PTR_T alloc fns +static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); @@ -14878,12 +22562,14 @@ index 0000000..7c0eedd + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); ++ + return 0; +} + +static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { + p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); ++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); + av_assert0(p->vcsm_handle); + p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); + av_assert0(p->vc_handle); @@ -14891,6 +22577,7 @@ index 0000000..7c0eedd + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); + return 0; +} + @@ -14899,6 +22586,7 @@ index 0000000..7c0eedd + vcsm_unlock_ptr(p->arm); + vcsm_free(p->vcsm_handle); + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++// 
printf("***** %s\n", __func__); +} + + @@ -14955,9 +22643,14 @@ index 0000000..7c0eedd + } + // And the VPU code + { -+ int num_bytes = sizeof(rpi_hevc_transform); ++ int num_bytes = sizeof(rpi_hevc_transform8); + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes); ++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); ++ } ++ { ++ int num_bytes = sizeof(rpi_hevc_transform10); ++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); ++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); + } + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); @@ -15048,10 +22741,18 @@ index 0000000..7c0eedd + gpu_unlock_unref(ge); +} + -+unsigned int vpu_get_fn(void) { ++unsigned int vpu_get_fn(const unsigned int bit_depth) { + // Make sure that the gpu is initialized + av_assert0(gpu != NULL); -+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); ++ switch (bit_depth){ ++ case 8: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); ++ case 10: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); ++ default: ++ av_assert0(0); ++ } ++ return 0; +} + +unsigned int vpu_get_constants(void) { @@ -15081,95 +22782,75 @@ index 0000000..7c0eedd +// +// Cache flush functions + ++#define CACHE_EL_MAX 16 + +rpi_cache_flush_env_t * rpi_cache_flush_init() +{ -+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); -+ if (rfe == NULL) -+ return NULL; ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + ++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); ++ if (rfe == NULL) ++ return NULL; + -+ rfe->n = 0; -+ return rfe; ++ rfe->v.op_count = 0; ++ return rfe; +} + +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) +{ -+ if (rfe != NULL) -+ free(rfe); ++ if (rfe != NULL) ++ free(rfe); +} + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) +{ -+ int rc = 0; -+ unsigned int na; -+ unsigned int nr; ++ int rc = 0; + -+ // Clear any reamaining ents in the final block -+ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) -+ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ rc = -1; + -+ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) -+ { -+ if (vcsm_clean_invalid(rfe->a + na) != 0) -+ rc = -1; -+ } ++ free(rfe); + -+ free(rfe); ++ if (rc == 0) ++ return 0; + -+ if (rc == 0) -+ return 0; -+ -+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); -+ return rc; ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); ++ return rc; +} + -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || gm->numbytes == 0) -+ return; ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; + -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); + -+ av_assert0(rfe->n < 
CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm; -+ a->s[n].size = gm->numbytes; -+ ++rfe->n; -+ } ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; +} + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, + const unsigned int offset, const unsigned int size) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || size == 0) -+ return; ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; + -+// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); ++ av_assert0(offset + size <= gm->numbytes); + -+ av_assert0(offset <= gm->numbytes); -+ av_assert0(size <= gm->numbytes); -+ av_assert0(offset + size <= gm->numbytes); -+ -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; -+ -+ av_assert0(rfe->n < CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm + offset; -+ a->s[n].size = size; -+ ++rfe->n; -+ } ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); +} + ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} ++ ++ +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) +{ +#if !RPI_ONE_BUF @@ -15186,21 +22867,27 @@ index 0000000..7c0eedd + } +} + -+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) +{ -+ const unsigned int y_offset = frame->linesize[0] * start_line; -+ const unsigned int y_size = frame->linesize[0] * n; ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; + // Round UV up/down to get everything + const unsigned int uv_rnd = (1U << uv_shift) >> 1; -+ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); -+ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; + ++#if 0 ++ // *** frame->height is cropped height so not good + // As all unsigned they will also reject -ve + // Test individually as well as added to reject overflow -+ av_assert0(start_line <= (unsigned int)frame->height); ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped + av_assert0(n <= (unsigned int)frame->height); + 
av_assert0(start_line + n <= (unsigned int)frame->height); ++#endif + + if (!gpu_is_buf1(frame)) + { @@ -15212,7 +22899,7 @@ index 0000000..7c0eedd + rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); + } + } -+ else if (!rpi_sliced_frame(frame)) ++ else if (!av_rpi_is_sand_frame(frame)) + { + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + if (do_luma) { @@ -15225,16 +22912,30 @@ index 0000000..7c0eedd + } + else + { -+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); -+// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, start_line), y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, -+ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, start_line >> 1), uv_size); -+ } ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); ++ ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; + } + } +} @@ -15275,13 +22976,11 @@ index 0000000..7c0eedd + + +// If sem_init actually takes time then maybe we want a pool... 
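As a usage sketch only (not code from the patch; it assumes rpi_qpu.h has been included and that valid GPU_MEM_PTR_T buffers are passed in), the batched flush API above is driven as init / add / finish, with all accumulated ranges handed to a single vcsm_clean_invalid2 call by rpi_cache_flush_finish.

static int flush_two_buffers(const GPU_MEM_PTR_T *gm0, const GPU_MEM_PTR_T *gm1,
                             const rpi_cache_flush_mode_t mode)
{
    rpi_cache_flush_env_t *const rfe = rpi_cache_flush_init();
    if (rfe == NULL)
        return -1;
    rpi_cache_flush_add_gm_ptr(rfe, gm0, mode);                           /* whole buffer */
    rpi_cache_flush_add_gm_range(rfe, gm1, mode, 0, gm1->numbytes / 2);   /* partial buffer */
    return rpi_cache_flush_finish(rfe);   /* issues the flush, frees rfe, returns 0 on success */
}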
-+static vq_wait_t * vq_wait_new(const unsigned int cost) ++static vq_wait_t * vq_wait_new(void) +{ + gpu_env_t * const ge = gpu_lock_ref(); + vq_wait_t * const wait = ge->wait_pool.head; + ge->wait_pool.head = wait->next; -+ ge->current_load += cost; -+ wait->cost = cost; + wait->next = NULL; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -15337,17 +23036,13 @@ index 0000000..7c0eedd + +static void vq_wait_post(vq_wait_t * const wait) +{ -+#if !RPI_TRACE_TIME_VPU_QPU_WAIT -+ if (wait->cost != 0) -+#endif ++#if RPI_TRACE_TIME_VPU_QPU_WAIT + { + gpu_env_t *const ge = gpu_lock(); -+ ge->current_load -= wait->cost; -+#if RPI_TRACE_TIME_VPU_QPU_WAIT + tto_end(&ge->ttw.active, ns_time()); -+#endif + gpu_unlock(); + } ++#endif + + sem_post(&wait->sem); +} @@ -15363,7 +23058,6 @@ index 0000000..7c0eedd +{ + unsigned int n; + unsigned int mask; -+ unsigned int cost; + struct gpu_job_s j[VPU_QPU_JOB_MAX]; +}; + @@ -15396,23 +23090,26 @@ index 0000000..7c0eedd + vqj->mask |= VPU_QPU_MASK_VPU; + + j->command = EXECUTE_VPU; -+ j->u.v.q[0] = vpu_code; ++ // The bottom two bits of the execute address contain no-flush flags ++ // b0 will flush the VPU I-cache if unset so we nearly always want that set ++ // as we never reload code ++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; + j->u.v.q[1] = r0; + j->u.v.q[2] = r1; + j->u.v.q[3] = r2; + j->u.v.q[4] = r3; + j->u.v.q[5] = r4; + j->u.v.q[6] = r5; ++ gpu->vpu_i_cache_flushed = 1; + } +} + +// flags are QPU_FLAGS_xxx -+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail) ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) +{ + if (n != 0) { + struct gpu_job_s *const j = new_job(vqj); + vqj->mask |= VPU_QPU_MASK_QPU; -+ vqj->cost += cost; + + j->command = EXECUTE_QPU; + j->u.q.jobs = n; @@ -15442,7 +23139,7 @@ index 0000000..7c0eedd + } + + // We are going to want a sync object -+ wait = vq_wait_new(vqj->cost); ++ wait = vq_wait_new(); + + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync + // If we only posted one thing or only QPU jobs @@ -15464,7 +23161,6 @@ index 0000000..7c0eedd + j->callback.cookie = wait; + } + -+ vqj->cost = 0; + vqj->mask = 0; + *wait_h = wait; +} @@ -15483,11 +23179,6 @@ index 0000000..7c0eedd + return rv; +} + -+unsigned int vpu_qpu_current_load(void) -+{ -+ return gpu_ptr()->current_load; -+} -+ +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) +{ + if (wait_h != NULL) @@ -15536,13 +23227,50 @@ index 0000000..7c0eedd + return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); +} + ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) ++{ ++ // Dummy values we can catch with emulation ++ qf->y_pxx = ~1U; ++ qf->y_bxx = ~2U; ++ qf->y_p00 = ~3U; ++ qf->y_b00 = ~4U; ++ qf->c_pxx = ~5U; ++ qf->c_bxx = ~6U; ++ ++ switch (bit_depth) { ++ case 8: ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y_b00); ++ qf->c_pxx = qpu_fn(mc_filter_c_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c_b); ++ break; ++ case 10: ++ qf->c_pxx = qpu_fn(mc_filter_c10_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c10_b); ++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); ++ qf->y_p00 = 
qpu_fn(mc_filter_y10_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y10_b00); ++ break; ++ default: ++ return -1; ++ } ++ return 0; ++} ++ +#endif // RPI diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..a95f7d9 +index 0000000000..485a08f8ba --- /dev/null +++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,200 @@ +@@ -0,0 +1,206 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + @@ -15687,21 +23415,35 @@ index 0000000..a95f7d9 +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); + +// init, add, finish for one gm ptr +void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); + + +// QPU specific functions ++ ++typedef struct HEVCRpiQpu { ++ uint32_t c_pxx; ++ uint32_t c_pxx_l1; ++ uint32_t c_bxx; ++ uint32_t y_pxx; ++ uint32_t y_bxx; ++ uint32_t y_p00; ++ uint32_t y_b00; ++} HEVCRpiQpu; ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); ++ +uint32_t qpu_fn(const int * const mc_fn); + -+#define QPU_N_GRP_UV 4 -+#define QPU_N_UV 8 -+#define QPU_N_GRP_Y 4 // 4 QPUs per TMU -+#define QPU_N_Y 12 ++#define QPU_N_GRP 4 ++#define QPU_N_MAX 12 + +#define QPU_MAIL_EL_VALS 2 + @@ -15717,27 +23459,19 @@ index 0000000..a95f7d9 +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail); ++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); +void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + -+ -+extern unsigned int vpu_get_fn(void); ++extern unsigned int vpu_get_fn(const unsigned int bit_depth); +extern unsigned int vpu_get_constants(void); + +// Waits for previous post_codee to complete and Will null out *wait_h after use +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+unsigned int vpu_qpu_current_load(void); +int vpu_qpu_init(void); +void vpu_qpu_term(void); + -+// Simple test of shader code -+extern int 
rpi_test_shader(void); -+ -+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst); -+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); -+ +extern int gpu_get_mailbox(void); +void gpu_ref(void); +void gpu_unref(void); @@ -15745,10 +23479,10 @@ index 0000000..a95f7d9 +#endif diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c new file mode 100644 -index 0000000..0898ecd +index 0000000000..2c6541a8fb --- /dev/null +++ b/libavcodec/rpi_shader.c -@@ -0,0 +1,670 @@ +@@ -0,0 +1,1570 @@ +#include "rpi_shader.h" + +#ifdef _MSC_VER @@ -15772,648 +23506,1548 @@ index 0000000..0898ecd +__attribute__((aligned(8))) +#endif +unsigned int rpi_shader[] = { -+// ::mc_setup_c -+/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1 ; mov -, unif -+/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif -+/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1 -+/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 -+/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 -+/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 -+/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num ; mov ra14, 0 -+/* [0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0 -+/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b -+/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1 -+/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4 -+/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 -+/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 -+/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch ++// ::mc_setup_c_q0 ++// ::mc_start ++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c_qn ++/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 ++/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00000058] */ 0x001000ff, 0xe00205e7, // 
mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch ++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num ++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_y ++/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 -+/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 -+/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif -+/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5 -+/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x00000160] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif -+/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a -+/* [0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b -+/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0 ; mov -, unif -+/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif -+/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1 ; mov -, unif -+/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4 -+/* [0x000001e0] 
*/ 0x149de1c0, 0xd0020827, // and r0, r0, -2 -+/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 -+/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch -+/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_y2 -+/* [0x00000210] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 -+/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif -+/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0 ; mov -, unif -+/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y ; mov -, unif -+/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0 -+// ::mc_filter_uv -+/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif ; mov vw_setup, rb28 -+/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next -+/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 -+/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a -+/* [0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 -+/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif -+/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1 ; mov ra1, unif -+/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3 ; mov.ifnz ra1, unif -+/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a -+/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0, r0, 15 ; mov rb9, ra3.8b -+/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27 ; mov r1, ra1.16b -+/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13 ; mov rb10, ra3.8c -+/* [0x00000308] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 -+// :uvloop -+/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 -+/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y -+/* [0x00000338] */ 0x936807f6, 
0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13 -+/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 -+/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13 -+/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1 -+/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_filter_uv_b0 -+/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif ; mov vw_setup, rb28 -+/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next -+/* 
[0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 -+/* [0x000004a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a -+/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 -+/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif -+/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1 -+/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3 ; mov rb8, ra3.8a -+/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b -+/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, 15 ; mov rb10, ra3.8c -+/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif -+/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif -+// :uvloop_b0 -+/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 -+/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y -+/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, 
// add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 -+/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 -+/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 -+/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 -+/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 -+/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 -+/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 -+/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin -+/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif -+/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16 ; mov ra_link, unif -+/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 -+/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 -+/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 -+/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin -+/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+// :uv_b0_post12 -+/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 -+/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 -+/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 -+/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+/* [0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 -+// :uv_b0_post_fin -+/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif -+/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4 -+/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0 ; mov ra_y2_next, ra2.16a -+/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif -+/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a -+/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add 
rb_base2_next, r3, r0 -+/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif ; mov rb9, ra3.8b -+/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif ; mov rb10, ra3.8c -+/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop ; mov rb11, ra3.8d -+/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 -+/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+// :uvloop_b -+/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 -+/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next -+/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y2 -+/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 -+/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+/* [0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 -+/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 -+/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 -+/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 -+/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 -+/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 -+/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 -+/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* 
[0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13 -+/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 -+/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3 -+/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_interrupt_exit8c -+/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov -, vw_wait ; nop ; ldtmu0 -+/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_exit -+// ::mc_exit_c -+/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0) -+/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop ; nop -+/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_interrupt_exit12 -+/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_exit1 -+/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_setup -+/* 
[0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif -+/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif -+/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif -+/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 -+/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or rb24, r1, rb_pitch -+/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num -+/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 -+/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b -+/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1 -+/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1 -+/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 -+/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 -+/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b -+/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 -+/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1 -+/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 -+/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, 
qpu_num -+/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000be0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000bf0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 -+/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0 -+/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1 -+/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch -+/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base -+/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 -+/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 -+/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2 -+// :per_block_setup -+/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif ; mov r3, elem_num -+/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next -+/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next -+/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 -+/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0 -+/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b -+/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 -+/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0 -+/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b -+/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif -+/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov 
vw_setup, rb28 -+/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width -+/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5 ; mov r0, ra_height -+/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7 -+/* [0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 -+/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width -+/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif -+/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif -+/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400 -+/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a -+/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d -+/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c -+/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d -+/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c -+/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d -+/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c -+/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c -+/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d -+/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c -+/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif -+/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b -+/* [0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c -+/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 -+/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif -+/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 -+/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 -+/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d -+// ::mc_filter -+/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 -+// :yloop -+/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // 
mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001010] */ 
0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 -+/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0 -+/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 -+/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup -+/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest -+/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 -+/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 -+/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 -+/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 -+/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch -+/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 -+/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop -+/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_filter_b -+// :yloopb -+/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 -+/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00001188] */ 0x40027030, 0x180049e3, // 
nop ; mul24 r3, ra0.8a, r0 -+/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000011a8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 -+/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 -+/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 -+/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, 
rb13 -+/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 -+/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0 -+/* [0x000012f0] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 -+/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup -+/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest -+/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 -+/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 -+/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 -+/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 -+/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch -+/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 -+/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb -+/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif ++/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00000160] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 ++/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++// :1 ++/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001d8] */ 
0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c_p ++/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 ++/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* 
[0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000003f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b ++/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c_p_l1 ++/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov 
vrx_xshift, vrx_xshift_next ++/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 ++/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add 
r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b ++/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c_b ++/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif ++/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif ++/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif ++/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift ++/* [0x000006f0] */ 
0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a ++/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b ++/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c ++/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif ++/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d ++/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 ++/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++// :1 ++/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 ++/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000828] */ 
0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 ++/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 ++/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_sync_q0 ++/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q1 ++/* [0x00000980] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif ++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q2 ++/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q3 ++/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q4 ++/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q5 ++/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q6 ++/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q7 ++/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q8 ++/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) 
++// ::mc_sync_q9 ++/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q10 ++/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q11 ++/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_qn ++// ::mc_exit_y_qn ++/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_q0 ++// ::mc_exit_y_q0 ++/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y_q0 ++/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y_qn ++/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000cc0] */ 
0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch ++/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth ++/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* 
[0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_8 ++/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift ++/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift ++/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 ++/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00000fb0] */ 0x90227383, 0x1c424044, 
// ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 ++/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 ++/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ++/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter_y_pxx ++/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :1 ++/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 
@ "mul_used", 0 ++/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_bxx ++/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* 
[0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, 
r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_p00 ++/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif ++/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x000014b0] 
*/ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif ++/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif ++/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :1 ++/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b ++/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_b00 ++/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x000015f8] */ 0x8e5509bf, 
0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b ++/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_setup_c10_q0 ++/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c10_qn ++/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1 ++/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch ++/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, 
elem_num ++/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num ++/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 ++/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif ++/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 ++/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++// :1 ++/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x000018e0] */ 0x00000000, 
0xf0f7c9e7, // bra -, ra_link ++/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c10_p ++/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 
@ "mul_used", 0 ++/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 ++/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b ++/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_p_l1 ++/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, 
r1, ra_blk_height ++/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 ++/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait 
++/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b ++/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_b ++/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif ++/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif ++/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif ++/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift ++/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a ++/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b ++/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c ++/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif ++/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov 
ra9, rb_max_y ; mov rb11, ra3.8d ++/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 ++/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++// :1 ++/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 ++/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8) ++/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 ++/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 
++/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_sync10_q0 ++/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q1 ++/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q2 ++/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q3 ++/* [0x00002090] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif ++/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q4 ++/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q5 ++/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q6 ++/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q7 ++/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q8 ++/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q9 ++/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q10 ++/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov 
dst, srel(i) ++// ::mc_sync10_q11 ++/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_q0 ++// ::mc_exit_y10_q0 ++/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_qn ++// ::mc_exit_y10_qn ++/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y10_q0 ++/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y10_qn ++/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 ++/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch ++/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000023c8] */ 0xf49dc1d2, 
0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth ++/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_10 ++/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002550] */ 
0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002570] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift ++/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift ++/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 ++/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 ++/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 ++/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026b8] */ 
0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026d8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ++/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter_y10_pxx ++/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :1 ++/* [0x00002720] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, 
ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_p00 ++/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* 
[0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif ++/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif ++/* [0x00002998] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :1 ++/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_bxx ++/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf 
-, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002ba8] */ 
0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002bc8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_b00 ++/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002cf8] */ 0x8c613cbf, 
0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002d50] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b ++/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ @@ -16421,35 +25055,79 @@ index 0000000..0898ecd +#endif diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..d17b9fd +index 0000000000..82bf380eb4 --- /dev/null +++ b/libavcodec/rpi_shader.h -@@ -0,0 +1,19 @@ +@@ -0,0 +1,63 @@ +#ifndef rpi_shader_H +#define rpi_shader_H + +extern unsigned int rpi_shader[]; + -+#define mc_setup_c (rpi_shader + 0) -+#define mc_filter_uv (rpi_shader + 152) -+#define mc_filter_uv_b0 (rpi_shader + 280) -+#define mc_interrupt_exit8c (rpi_shader + 554) -+#define mc_exit (rpi_shader + 582) -+#define mc_exit_c (rpi_shader + 582) -+#define mc_interrupt_exit12 (rpi_shader + 598) -+#define mc_exit1 (rpi_shader + 634) -+#define mc_setup (rpi_shader + 650) -+#define mc_filter (rpi_shader + 942) -+#define mc_filter_b (rpi_shader + 1094) -+#define mc_end (rpi_shader + 1246) ++#define mc_setup_c_q0 (rpi_shader + 0) ++#define mc_start (rpi_shader + 0) ++#define mc_setup_c_qn (rpi_shader + 2) ++#define mc_filter_c_p (rpi_shader + 142) ++#define mc_filter_c_p_l1 (rpi_shader + 272) ++#define mc_filter_c_b (rpi_shader + 402) ++#define mc_sync_q0 (rpi_shader + 590) ++#define mc_sync_q1 (rpi_shader + 608) ++#define mc_sync_q2 (rpi_shader + 620) ++#define mc_sync_q3 (rpi_shader + 632) ++#define mc_sync_q4 (rpi_shader + 644) ++#define mc_sync_q5 (rpi_shader + 662) ++#define mc_sync_q6 (rpi_shader + 674) ++#define mc_sync_q7 (rpi_shader + 686) ++#define mc_sync_q8 (rpi_shader + 698) ++#define mc_sync_q9 (rpi_shader + 716) ++#define mc_sync_q10 (rpi_shader + 728) ++#define mc_sync_q11 (rpi_shader + 740) ++#define 
mc_exit_c_qn (rpi_shader + 752) ++#define mc_exit_y_qn (rpi_shader + 752) ++#define mc_exit_c_q0 (rpi_shader + 770) ++#define mc_exit_y_q0 (rpi_shader + 770) ++#define mc_setup_y_q0 (rpi_shader + 790) ++#define mc_setup_y_qn (rpi_shader + 792) ++#define mc_filter_y_pxx (rpi_shader + 1032) ++#define mc_filter_y_bxx (rpi_shader + 1162) ++#define mc_filter_y_p00 (rpi_shader + 1292) ++#define mc_filter_y_b00 (rpi_shader + 1382) ++#define mc_setup_c10_q0 (rpi_shader + 1462) ++#define mc_setup_c10_qn (rpi_shader + 1464) ++#define mc_filter_c10_p (rpi_shader + 1600) ++#define mc_filter_c10_p_l1 (rpi_shader + 1728) ++#define mc_filter_c10_b (rpi_shader + 1856) ++#define mc_sync10_q0 (rpi_shader + 2042) ++#define mc_sync10_q1 (rpi_shader + 2060) ++#define mc_sync10_q2 (rpi_shader + 2072) ++#define mc_sync10_q3 (rpi_shader + 2084) ++#define mc_sync10_q4 (rpi_shader + 2096) ++#define mc_sync10_q5 (rpi_shader + 2114) ++#define mc_sync10_q6 (rpi_shader + 2126) ++#define mc_sync10_q7 (rpi_shader + 2138) ++#define mc_sync10_q8 (rpi_shader + 2150) ++#define mc_sync10_q9 (rpi_shader + 2168) ++#define mc_sync10_q10 (rpi_shader + 2180) ++#define mc_sync10_q11 (rpi_shader + 2192) ++#define mc_exit_c10_q0 (rpi_shader + 2204) ++#define mc_exit_y10_q0 (rpi_shader + 2204) ++#define mc_exit_c10_qn (rpi_shader + 2224) ++#define mc_exit_y10_qn (rpi_shader + 2224) ++#define mc_setup_y10_q0 (rpi_shader + 2242) ++#define mc_setup_y10_qn (rpi_shader + 2244) ++#define mc_filter_y10_pxx (rpi_shader + 2494) ++#define mc_filter_y10_p00 (rpi_shader + 2624) ++#define mc_filter_y10_bxx (rpi_shader + 2716) ++#define mc_filter_y10_b00 (rpi_shader + 2846) ++#define mc_end (rpi_shader + 2926) + +#endif diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm new file mode 100644 -index 0000000..aa3fe47 +index 0000000000..ba6cc13a95 --- /dev/null +++ b/libavcodec/rpi_shader.qasm -@@ -0,0 +1,1259 @@ +@@ -0,0 +1,1741 @@ + +# The @ "mul_used", 0 annotations that occur by various mul blocks suppress +# the warning that we are using rotation & ra/rb registers. r0..3 can be @@ -16457,102 +25135,197 @@ index 0000000..aa3fe47 +# local 4. As it happens this is what is wanted here as we do not want the +# constants from the other half of the calc. + ++# PREREAD is the number of requests that we have sitting in the TMU request ++# queue. ++# ++# There are 8 slots availible in the TMU request Q for tm0s requests, but ++# only 4 output FIFO entries and overflow is bad (corruption or crash) ++# (If threaded then only 2 out FIFO entries, but we aren't.) ++# In s/w we are effectively limited to the min vertical read which is >= 4 ++# so output FIFO is the limit. ++# ++# However in the current world there seems to be no benefit (and a small ++# overhead) in setting this bigger than 2. 
++ ++.set PREREAD, 4 ++ ++# Block heights - 8 & 16 are the only numbers we currently support ++ ++.set C_BLK_HEIGHT_8, 16 ++.set C_BLK_HEIGHT_16, 8 ++.set Y_BLK_HEIGHT_8, 16 ++.set Y_BLK_HEIGHT_16, 8 ++ ++# QPU counts - depend on block size ++# If we have a 2-byte format & block_size > 8 then can only afford ++# 8 QPUs ++# These numbers must match the numbers in rpi_shader_cmd.h ++ ++.set N_QPU_8, 12 ++.set N_QPU_16, 12 ++ +# register allocation +# -+# ra0...ra7 eight horizontal filter coefficients -+# -+# rb0 rx_shift2 -+# rb1 rb_y2_next -+# -+# rb4...rb7 -+# -+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent) -+# -+# (ra15 isn't clamped to zero - this happens during the -+# copy to ra14, and during its use in the vertical filter) -+# -+# rb8...rb11 eight vertical filter coefficients + -+# ra4 y: Fiter, UV: part -of b0 -> b stash ++# ra0-3 ++# Used as temp and may be loop filter coeffs (split into .8s) ++# or temp in loop. Check usage on an individual basis. + -+# rb12 offset to add before shift (round + weighting offsets) -+# rb13 shift: denom + 6 + 9 -+# rb14 L0 weight (U on left, V on right) -+# rb15 -- free -- -+# -+# ra16 width:height -+# ra17 ra_y:ra_xshift -+# ra18 L1 weight (Y) -+# ra19 ra_y_next:ra_xshift_next -+# -+# rb16 pitch -+# rb17 height + 1 -+# rb18 max(height,16) + 3 -+# rb19 frame_base2_next -+# -+# ra20 1 -+# ra21 ra_y2_next:ra_y2 (luma); free (chroma) -+# ra22 ra_k256 256 -+# ra23 0 -+# -+# rb20 -- free -- -+# rb21 -- free -- -+# rb22 rb_k255 255 -+# rb23 dest (Y) -+# -+# rb24 vdw_setup_1(dst_pitch) -+# rb25 frame width-1 -+# rb26 height<<23 + width<<16 + vdw_setup_0 -+# rb27 vdw_setup_0 (depends on QPU number) -+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM -+# rb29 vdw_setup_1(dst_pitch-width) -+# rb30 frame height-1 -+# rb31 used as temp to count loop iterations -+# -+# ra24 src frame base -+# ra25 src frame base 2 -+# ra26 next ra24 -+# ra27 next ra25 -+# ra28 -- free -- -+# ra29 -- free -- -+# -+# Use an even numbered register as a link register to avoid corrupting flags -+# ra30 next kernel address -+# ra31 chroma-B height+3; free otherwise ++# ra4-7 ++# C: L0 H filter out FIFO ++# otherwise -- free -- + -+.set rb_max_x, rb25 -+.set rb_max_y, rb30 -+.set rb_pitch, rb16 ++# ra8-11 ++# temp in some places - check usage ++# Y: (with rb8-11) horiz out FIFO ++ ++# ra12-15 ++# -- free -- ++ ++# uniform: width:height +.set ra_width_height, ra16 +.set ra_width, ra16.16b +.set ra_height, ra16.16a -+.set ra_y2, ra21.16a -+.set ra_y2_next, ra21.16b + -+.set rb_base2_next, rb19 ++# y:y2 same layout as y_y2_next so we can update both together ++.set ra_y_y2, ra17 ++.set ra_y2, ra17.16a ++.set ra_y, ra17.16b + -+.set rb_dest, rb23 ++# uniform: L1 weight (U on left, V on right) ++# Only used in Y B ++.set ra_wt_off_mul_l1, ra18 ++.set ra_wt_off_l1, ra18.16b ++.set ra_wt_mul_l1, ra18.16a ++ ++# y_next:y2_next same layout as y_y2 so we can update both together ++.set ra_y_y2_next, ra19 ++.set ra_y_next, ra19.16b ++.set ra_y2_next, ra19.16a ++ ++# Setup: consts - subdivide a single register ++.set ra_kff100100, ra20 ++.set ra_k256, ra20.16a ++.set ra_k0, ra20.8a ++.set ra_k1, ra20.8b ++.set ra_k16, ra20.8c ++.set ra_k255, ra20.8d ++ ++# Loop: xshifts ++.set ra_xshift, ra21.16a ++.set ra_xshift_next, ra21.16b ++ ++# Loop var: L0 weight (U on left, V on right) ++# _off_ is not used in loop as we want to modify it before use ++.set ra_wt_off_mul_l0, ra22 ++.set ra_wt_mul_l0, ra22.16a ++.set ra_wt_off_l0, ra22.16b ++ ++# Max 
pel value (for 8 bit we can get away with sat ops but not 9+) ++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the ++# 2nd byte but as the source should never be > 3 there 0x3ff should do ++.set ra_blk_height_pmax, ra23 ++.set ra_pmax, ra23.16a ++.set ra_blk_height, ra23.8c ++# -- free -- ra23.8d ++ ++# Loop: src frame base (L0) +.set ra_base, ra24 -+.set ra_base_next, ra26 -+.set ra_xshift, ra17.16a + ++# Loop: src frame base (L1) +.set ra_base2, ra25 + -+# Note ra_xy & ra_xy_next should have same structure! -+.set ra_xshift_next, ra19.16a ++# Loop: next src frame base (L0) ++.set ra_base_next, ra26 ++ ++# -- free -- ra27 ++# -- free -- ra28 ++# -- free -- ra29 ++ ++# Use an even numbered register as a link register to avoid corrupting flags ++.set ra_link, ra30 ++ ++# -- free -- ra31 ++ +.set rb_xshift2, rb0 +.set rb_xshift2_next, rb1 + -+.set ra_y_next, ra19.16b -+.set ra_y, ra17.16b ++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2 ++.set rb_elem_x, rb2 + -+.set ra_k1, ra20 ++# El Flags ++# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n ++.set rb_ef, rb3 ++ ++# rb4-7 ++# C-B: L1 H filter out FIFO ++# Y: (with ra2.8x) Y vertical filter coeffs ++ ++# rb8-11 ++# C: Vertical filter coeffs ++# Y: (with ra8-11) horiz out FIFO ++ ++# Loop var: offset to add before shift (round + weighting offsets) ++# Exact value varies by loop ++.set rb_wt_off, rb12 ++ ++# Setup: denom + 6 + 9 ++.set rb_wt_den_p15, rb13 ++ ++# -- free -- rb14 ++# -- free -- rb15 ++ ++# Line pitch (128 for sand128) ++.set rb_pitch, rb16 ++ ++# Loop count - 2 (set up TMU for next xfer) ++.set rb_i_tmu, rb17 ++ ++# Loop count for min(height, 16) ++# Y will reset & loop again if height > 16 ++.set rb_lcount, rb18 ++ ++# frame_base2_next ++.set rb_base2_next, rb19 ++ ++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give ++# offset to the slice +.set rb_xpitch, rb20 -+.set rb_k255, rb22 -+.set ra_k256, ra22 -+.set ra_k0, ra23 + -+.set ra_link, ra30 ++# -- free -- rb21 ++ ++# Setup: 0xff (8-bit) / 0xffff (9+ bit) ++.set rb_pmask, rb22 ++ ++# Loop: destination address ++.set rb_dest, rb23 ++ ++# vdw_setup_1(dst_pitch) ++.set rb_dma1_base, rb24 ++ ++# Setup: pic width - 1 ++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. ++.set rb_max_x, rb25 ++ ++# Loop: height<<23 + width<<16 + vdw_setup_0 ++.set rb_dma0, rb26 ++ ++# vdw_setup_0 (depends on QPU number) ++.set rb_dma0_base, rb27 ++ ++# Setup: vw_setup value to reset VPM write pointer ++.set rb_vpm_init, rb28 ++ ++# Loop: vdw_setup_1(dst_pitch-width) = stride ++.set rb_dma1, rb29 ++ ++# Setup: pic_height - 1 ++.set rb_max_y, rb30 ++ ++# -- free -- rb31 ++ ++ ++ + +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
+.set i_shift16, -16 @@ -16564,8 +25337,10 @@ index 0000000..aa3fe47 +# Macros that express this - obviously these can't be overlapped +# so are probably unsuitable for loop code + -+.macro m_calc_dma_regs, r_vpm, r_dma ++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma + mov r2, qpu_num ++.if v_bit_depth <= 8 ++ # 8 bit version + asr r1, r2, 2 + shl r1, r1, 6 + and r0, r2, 3 @@ -16576,811 +25351,983 @@ index 0000000..aa3fe47 + + mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later + shl r0, r0, 5 -+ add r_dma, r0, r1 # DMA out -+.endm + -+# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16 -+.macro m_calc_dma_regs_c, r_vpm, r_dma -+ mov r2, qpu_num ++.else ++ # 16 bit version ++ # Limited to 8 QPUs if blk height > 8 + asr r1, r2, 1 ++.if v_blk_height <= 8 ++ shl r1, r1, 4 ++.else + shl r1, r1, 5 ++.endif + and r0, r2, 1 + or r0, r0, r1 + -+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add r_vpm, r0, r1 # VPM 8bit storage ++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR ++ add r_vpm, r0, r1 + + # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into + # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) -+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later ++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later + shl r0, r0, 6 ++.endif + add r_dma, r0, r1 # DMA out +.endm + + ++.macro m_setup_q0 ++ srel -, 12 ++.endm ++ ++# Code start label ++::mc_start ++ +################################################################################ +# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) -+::mc_setup_c -+ mov tmurs, 1 ; mov -, unif # No swap TMUs ; Next fn (ignored) ++ ++.macro m_setup_c, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_pmask, 0xff ++.set v_blk_height, C_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 2 ++.set v_pmask, 0xffff ++.set v_blk_height, C_BLK_HEIGHT_16 ++.endif ++ ++ mov tmurs, 1 # No swap TMUs + +# Load first request location -+ mov ra0, unif # next_x_y ++ mov ra0, unif # next_x_y ++ ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 + + mov ra_base, unif # Store frame c base + +# Read image dimensions -+ sub rb_max_x, unif, 1 # pic c width -+ sub rb_max_y, unif, 1 # pic c height ++ sub r0, unif, 1 # pic c width ++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes ++ sub rb_max_y, unif, 1 # pic c height + +# load constants -+ mov ra_k1, 1 -+ mov ra_k256, 256 -+ mov rb_k255, 255 -+ mov ra_k0, 0 ++ mov ra_kff100100, 0xff100100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + -+# touch registers to keep simulator happy -+ -+ # ra/b4..7: B0 -> B stash registers -+ mov ra4, 0 ; mov rb4, 0 -+ mov ra5, 0 ; mov rb5, 0 -+ mov ra6, 0 ; mov rb6, 0 -+ mov ra7, 0 ; mov rb7, 0 -+ -+ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base -+ -+# ; ra12..15: vertical scroll registers +# get source pitch -+ mov rb_xpitch, unif ; mov ra12, 0 # stride2 -+ mov rb_pitch, unif ; mov ra13, 0 # stride1 -+ mov r0, elem_num ; mov ra14, 0 -+# get destination vdw setup -+ add rb24, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1 ++ mov rb_xpitch, unif # stride2 ++ mov rb_pitch, unif # stride1 ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ 
add rb_dma1_base, r1, rb_pitch # vdw_setup_1 ++ ++ and r0, 1, elem_num ++ nop ; mul24 r0, r0, 5 ++.if v_bit_depth <= 8 ++ add rb_elem_x, r0, elem_num ++.else ++ add r0, r0, elem_num ++ add rb_elem_x, r0, r0 ++.endif + +# Compute base address for first and second access +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ add r0, r0, ra0.16b # Add elem no to x to get X for this slice ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice + max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + min r0, r0, rb_max_x + +# Get shift -+ and r1, r0, 1 -+ shl ra_xshift_next, r1, 4 ++# Shift will always calculate as 0 for 9+ bit ++# Ideally we can optimize the shift out of the code in these cases but for now ++# it is tidier to leave it in ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.else ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++.endif + -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to + -+ and r0, r0, -2 -+ add r0, r0, r0 ; v8subs r1, r1, r1 -+ sub r1, r1, rb_pitch ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_y ++ add r0, r0, r1 + add ra_base, ra_base, r0 + -+ max r0, r1, 0 -+ min r0, r0, rb_max_y -+ -+# submit texture requests for first line -+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t0s, ra_base, r0 -+ -+# submit texture requests for 2nd line -+ -+ max r0, r1, 0 -+ min r0, r0, rb_max_y -+ -+ add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t0s, ra_base, r0 -+ -+ add rb13, 9, unif # denominator -+ mov -, unif # Unused ++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator + +# Compute part of VPM to use for DMA output -+ m_calc_dma_regs_c rb28, rb27 ++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? 
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + -+# ----------------- +# And again for L1, but only worrying about frame2 stuff + -+ mov ra_link, unif # Next fn -+ +# Load first request location -+ mov ra0, unif # next_x_y ++ mov ra0, unif # next_x_y + -+ mov ra_base2, unif # Store frame c base ++ mov ra_base2, unif # [ra0 delay] Store frame c base + +# Compute base address for first and second access +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ mov ra_y2, ra0.16a # Store y -+ mov r0, ra0.16b # Load x -+ add r0, r0, elem_num # Add QPU slice -+ max r0, r0, 0 ; mov -, unif # Unused 0 -+ min r0, r0, rb_max_x ; mov -, unif # Unused 1 ++ shl r0, ra0.16b, v_x_shift ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ++ min r0, r0, rb_max_x + -+# Get shift -+ and r1, r0, 1 ; mov -, unif # Unused 2 -+ shl rb_xshift2_next, r1, 4 ++# Get shift (already zero if 9+ bit so ignore) ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif + +# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs + -+ and r0, r0, -2 -+ add r0, r0, r0 ; v8subs r1, r1, r1 -+ sub r1, r1, rb_pitch ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_y2 ++ add r0, r0, r1 ; mov r2, ra_y2 + add ra_base2, ra_base2, r0 + -+ max r0, r1, 0 -+ min r0, r0, rb_max_y ++# Do preloads ++# r0 = ra_y, r2 = ra_y2 ++ mov r3, PREREAD ; mov r0, ra_y + -+# submit texture requests for first line -+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t1s, ra_base2, r0 ; mov -, unif # Unused 3 ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 + -+# submit texture requests for 2nd line -+ -+ max r0, r1, 0 ; mov -, unif # Unused 4 ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b + ++ mov ra_link, unif # link ++# touch registers to keep simulator happy ++ # ra/b4..7: B0 -> B stash registers ++ mov ra4, 0 ; mov rb4, 0 + bra -, ra_link -+ -+ min r0, r0, rb_max_y ; mov -, unif # Unused 5 -+ add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t1s, ra_base2, r0 -+ ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 +# >>> ra_link -+ -+ -+.macro setf_nz_if_v -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +.endm + ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ m_setup_c 8 + +################################################################################ + -+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) ++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv -+ mov ra_link, unif ; mov vw_setup, rb28 # ; x_y ++ ++.macro m_filter_c_p, v_tmu, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_x_mul, 4 ++.set 
v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_tmu == 0 ++.set vrx_xshift, rb_xshift2 # b side more convienient ++.set vrx_xshift_next, ra_xshift_next ++.set vra_y_next, ra_y_next ++.set vrx_base_next, ra_base_next ++.set vra_y, ra_y ++.set vra_base, ra_base ++.set vr_txs, t0s ++.else ++.set vrx_xshift, ra_xshift # a side more convienient ++.set vrx_xshift_next, rb_xshift2_next ++.set vra_y_next, ra_y2_next ++.set vrx_base_next, rb_base2_next ++.set vra_y, ra_y2 ++.set vra_base, ra_base2 ++.set vr_txs, t1s ++.endif + +# per-channel shifts were calculated on the *previous* invocation -+ +# get base addresses and per-channel shifts for *next* invocation -+ mov ra2, unif ; mov r0, elem_num ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ setf_nz_if_v # Also acts as delay slot for ra2 ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base + -+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 ++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs ++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + -+ shl ra_xshift_next, r0, 4 -+ -+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs -+ add r0, r0, r0 ; mov ra_y_next, ra2.16a -+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++.if v_bit_depth <= 8 ++ shl vrx_xshift_next, r0, 3 ++ and r0, r0, -4 ++.endif ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
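++# (Assuming rb_pitch is a power of two - e.g. the 128 byte sand stripe width -
++#  r1 = r0 & -rb_pitch is the whole-stripe part of the byte offset and the xor
++#  below leaves the in-stripe part in r0; scaling the stripe part by rb_xpitch
++#  (stride2) places horizontally adjacent stripes rb_pitch * rb_xpitch bytes
++#  apart in memory.)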
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ add vrx_base_next, r3, r0 ; mov r1, ra_height + +# set up VPM write -+ -+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs -+ add rb17, r1, 1 ; mov ra1, unif # ; U offset/weight -+ add rb18, r1, 3 ; mov.ifnz ra1, unif # ; V offset/weight ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + +# ; unpack filter coefficients + -+ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area -+ shl r0, r0, 15 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ; mov r1, ra1.16b # ; r1=weight ++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + -+ shl r1, r1, rb13 ; mov rb10, ra3.8c -+ mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + -+ asr rb12, r1, 1 -+ shl rb14, ra1.16a, 1 # b14 = weight*2 ++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d + -+# rb14 - weight L0 * 2 -+# rb13 = weight denom + 6 + 9 -+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link ++ sub ra3, rb_wt_den_p15, ra_k1 + ++# r5 = 0 (loop counter) ++# ra9 = alias for rb_max_y ++# ra_wt_mul_l0 = weight L0 ++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] ++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) ++ ++# We want (r0r1) ++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... ++# We fetch (after shift) ++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
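++# (Illustrative: rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
++#              = (offset << ra3) + (1 << (ra3 - 1)),
++#  so the single "asr ..., ra3" at the end of the loop below applies both
++#  round-to-nearest and the weighting offset in one add + shift.)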
++ ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# r3 = 0 -+:uvloop -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++.if v_tmu == 0 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++.else ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment ++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++.endif + -+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment -+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+ shr r1, r0, 8 ; mov.ifnz r3, ra_y ++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++ min r3, r3, ra9 ; mov.ifnc r0, r2 + -+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+ min r2, r2, rb_max_y -+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ setf_nz_if_v ++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + +# apply horizontal filter +# The filter coeffs for the two halves of this are the same (unlike in the +# Y case) so it doesn't matter which ra0 we get them from ++# Also as the two halves are locked together we don't need to separate the 1st ++# r0 mul or the last r1 mul as they are vaild for all QPUs + -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 -+ mov ra14, ra15 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 + -+# apply vertical filter and write to VPM ++# V filter =- ra4 * rb8-+ ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) ++# Have to dup block as we need to move the brr - code is more common than it ++# looks at first glance ++.if v_bit_depth <= 8 ++ brr.anyn -, r:1b ++ add r2, r2, r3 ; mov ra5, ra6 ++ mov ra6, ra7 ; mul24 r1, ra7, rb10 ++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++.else ++ add r2, r2, r3 ; mov ra5, ra6 ++ brr.anyn -, r:1b ++ mov ra6, ra7 ; mul24 r1, ra7, rb10 ++ sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++ asr ra7, r2, v_bit_depth - 8 ++.endif ++# >>> .anyn 1b + -+ sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+ add r1, r1, r0 ; mul24 r0, ra15, rb11 ++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay] ++ add r1, 
r1, r0 ; mul24 r0, ra7, rb11 + sub r1, r1, r0 -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + asr r1, r1, 14 -+ nop ; mul24 r1, r1, rb14 -+ shl r1, r1, 8 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, ra3 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> .anyn 1b + -+ add r1, r1, rb12 -+ asr ra1.8as, r1, rb13 -+ nop ; mov r1, r1 << 8 -+ brr.anyn -, r:uvloop -+ asr ra1.8bs, r1, rb13 -+ mov -, vw_wait -+ mov vpm, ra1 ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+# >>> ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height + -+# DMA out for U & stash for V -+ bra -, ra_link -+ mov vw_setup, rb26 -+ mov vw_setup, rb29 -+ mov vw_addr, unif # u_dst_addr -+# >>> ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++# At 10 bits ++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits) ++# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230 ++# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits) ++# (P) ++# * weight (255) = 5987400 = 0x5b5c48 (23 bits) ++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) ++# ... should be OK ++# ++# (B) ++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) ++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) ++# So signed overflow if we sign extend here :-( ++# ++# In practice this doesn't happen (we need a maximal offset and a very unlucky ++# filter). ++# ++# This could be fixed by offsetting the filters s.t. 
they are unsigned until ++# weight mul and then removing the offset with the weighting offset (I think ++# this should work) or splitting the rounding & offsetting ++ ++::mc_filter_c_p ++ m_filter_c_p 0, 8 ++ ++::mc_filter_c_p_l1 ++ m_filter_c_p 1, 8 + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_c_b + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv_b0 -+ mov -, unif ; mov vw_setup, rb28 # next_fn ignored - always uv_b ++ ++.macro m_filter_c_b, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++.set v_x_mul, (1 << v_x_shift) + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+ mov ra2, unif ; mov r0, elem_num ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ setf_nz_if_v # Also acts as delay slot for ra2 ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base + -+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs + -+ shl ra_xshift_next, r0, 4 ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.endif + -+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs -+ add r0, r0, r0 ; mov ra_y_next, ra2.16a -+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + +# set up VPM write + -+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs -+ add rb17, r1, 1 -+ add ra31, r1, 3 ; mov rb8, ra3.8a # Combine width and height of destination area ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight + -+# ; unpack filter coefficients ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, 
rb_dma0_base ; mov ra1, unif # ; H filter coeffs + -+ add r0, r0, r2 ; mov rb9, ra3.8b -+ shl r0, r0, 15 ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ++# L1 - uniform layout could possibly be optimized + -+ mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ shl r0, ra3.16b, v_x_shift # r0=x*2 ++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs ++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight ++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs ++ min r0, r0, rb_max_x ; mov rb9, ra3.8b + -+ mov rb14, unif # U weight -+ mov.ifnz rb14, unif # V weight ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif + -+# rb14 unused in b0 but will hang around till the second pass -+ -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# r3 = 0 -+:uvloop_b0 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment -+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+ shr r1, r0, 8 ; mov.ifnz r3, ra_y -+ -+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+ min r2, r2, rb_max_y -+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 # Need to wait 1 cycle for rotated r1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop_b0 -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 -+ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop_b0 -+ -+# apply vertical filter and write to B-FIFO -+ -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. 
ra15 write gap -+ sub r1, r1, r0 ; mov ra7, rb6 -+ -+# FIFO goes: -+# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b -+# This arrangement optimizes the inner loop FIFOs at the expense of making the -+# bulk shift between loops quite a bit nastier -+# a8 used as temp -+ -+ sub.setf -, r3, ra31 -+ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad -+ brr.anyn -, r:uvloop_b0 -+ mov ra5, rb4 ; mov rb4, ra4 -+ mov ra4, rb5 ; mov rb5, ra6 -+ mov ra6, rb7 ; mov rb7, ra8 -+# >>> -+ -+# 1st half done all results now in the a/b4..7 fifo -+ -+# Need to bulk rotate FIFO for heights other than 16 -+# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with -+# we are allowed 3/4 cb_size w/h :-( -+ -+# Destination uniforms discarded -+# At the end drop through to _b - we will always do b after b0 -+ -+ sub.setf -, 15, r3 # 12 + 3 of preroll -+ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) -+ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr -+ mov r0, i_shift16 ; mov ra_link, unif -+ mov r1, 0x10000 -+# >>> -+ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially -+# If h != 16 && h != 12 then h <= 8 so -+# shift 8 with discard (.16b = .16a on all regs) -+ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+# >>> -+ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+ -+ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N -+# Shift 4 -+ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+ # If we shifted by 4 here then the max length remaining is 4 -+ # so that is it -+ -+ brr -, r:uv_b0_post_fin -+# Shift 2 -+ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+ # 6 / 2 so need 6 outputs -+# >>> -+ -+:uv_b0_post12 -+# this one is annoying as we need to swap halves of things that don't -+# really want to be swapped -+ -+# b7a, a6a, b5a, a4a -+# b4a, a5a, b6a, a7a -+# b7b, a6b, b5b, a4b -+# b4b, a5b, b6b, a7b -+ -+ mov r2, ra6 ; mov r3, rb7 -+ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+ mov ra5, r2 ; mov rb4, r3 -+ -+ mov r2, ra4 ; mov r3, rb5 -+ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+ mov ra7, r2 ; mov rb6, r3 -+ -+:uv_b0_post_fin -+ -+##### L1 B processing -+ -+# per-channel shifts were calculated on the *previous* invocation -+ -+# get base addresses and per-channel shifts for *next* invocation -+ mov ra2, unif ; mov r0, elem_num -+ -+ setf_nz_if_v # Also acts as delay slot for ra2 -+ -+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov -, unif # ; width_height -+ -+ shl rb_xshift2_next, r0, 4 -+ -+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs -+ add r0, r0, r0 ; mov ra_y2_next, ra2.16a -+ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight ++ and r1, r0, r1 ; mov rb10, ra3.8c + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr + add rb_base2_next, r3, r0 + -+ mov ra1, unif ; mov rb9, ra3.8b # U offset/weight -+ mov.ifnz ra1, unif ; mov rb10, ra3.8c # V offset/weight ++ mov ra9, rb_max_y ; mov rb11, ra3.8d ++ shl r1, 
ra_wt_off_l1, rb_wt_den_p15 ++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link + -+ nop ; mov rb11, ra3.8d -+ shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 # ; r3 (loop counter) = 0 -+ asr rb12, r1, 1 -+ -+# ra1.16a used directly in the loop ++# r5 loop counter ++# ra0 H coeffs L0 ++# ra1 H coeffs L1 ++# ra2 V coeffs L0 ++# ra3 temp ++# ra4-7 L0 H FIFO ++# rb4-7 L1 H FIFO ++# rb8-rb11 V coeffs L1 ++# ra9 rb_max_y alias + ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++ add ra_y, 1, ra_y ; mov r3, ra_y + -+# r3 = 0 ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 + -+:uvloop_b -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 # loop counter increment -+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next -+ shr r1, r0, 8 ; mov.ifnz r3, ra_y2 ++# L0 H-filter ++# H FIFO scrolls are spread all over this loop ++ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves + -+ max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next -+ min r2, r2, rb_max_y -+ add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++.if v_bit_depth <= 8 ++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++.else ++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++ asr ra3, r2, (v_bit_depth - 8) ++.endif + -+# generate seven shifted versions -+# interleave with scroll of vertical context ++ shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++ shr r1, r2, v_v_shift ; mov r3, ra_y2 ++ add ra_y2, r3, ra_k1 ; mov rb6, rb7 + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 + -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop_b -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 -+ mov ra14, ra15 ; mul24 r2, ra15, rb10 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop_b ++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# apply vertical filter and write to VPM ++# L1 H-filter + -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 -+ sub r1, r1, r0 ; mul24 r0, 
ra7.16b, rb14 -+ mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 ++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++# V filters - start in branch delay slots of H ++# Final asr not needed for 8-bit but we can#t (currently) save a whole instruction ++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ brr.anyn -, r:1b ++ mov ra6, ra7 ; mul24 r3, ra7, rb10 ++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 ++# >>> .anyn 1b + -+ mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+ add r1, r1, r0 ; mov rb4, ra4 ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay] ++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++ sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++ add r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 + -+ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend -+ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) ++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 + -+ sub.setf -, r3, ra31 ; mov ra6, rb7 -+ asr ra3.8as, r1, rb13 -+ nop ; mov r1, r1 << 8 -+ brr.anyn -, r:uvloop_b -+ asr ra3.8bs, r1, rb13 -+ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg (annoyingly) ; Final FIFO mov -+ mov vpm, ra3 -+# >>> ++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) ++ add r1, r1, r2 ; mov r3, ra_blk_height ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + -+ bra -, ra_link -+ mov vw_setup, rb26 -+ mov vw_setup, rb29 -+ mov vw_addr, unif # c_dst_addr ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + ++::mc_filter_c_b ++ m_filter_c_b 8 + +################################################################################ ++# Exit code used by both Luma & Chroma so place between them to avoid I-cache ++# conflicts ++ ++.macro m_exit_drain ++.if PREREAD == 2 ++# Special case 2 as loop is wasteful ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ nop ; nop ; ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 ++.else ++ mov.setf r3, PREREAD - 1 ++:1 ++ brr.anynz -, r:1b ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ sub.setf r3, r3, 1 ++ 
# >>> ++ mov -, vw_wait ++.endif ++.endm ++ ++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) ++# All qpus start at the beginning and after that (group - 1) must have finished ++# before (group) can start ++# ++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain ++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - ++# lockup otherwise) ++# ++# There is some, currently ill defined, potential lockup if we have the VDM active ++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? ++# ++# The code stalled when I had many waiters on a single sem so we have a ++# "ripple" of srels to restart. Unsure why, may have been bug, but this works ++# and we currently have both the memory & sems to support it. ++.macro m_sync_q, n_qpu, n_quads ++# Do not generate code for qpu >= quads * 4 - fns should never be called ++.if n_qpu < n_quads * 4 ++ mov ra_link, unif # Can only branch to an a reg (not r0) ++ mov -, vw_wait # [ra_link delay] ++ ++.set n_sem_sync, n_qpu - (n_qpu % 4) ++.set n_sem_in, n_qpu ++.set n_sem_out, n_qpu + 1 ++ ++.if n_qpu % 4 == 0 ++ ++.set n_sem_quad_in, 12 + n_qpu / 4 ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) ++ ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ bra -, ra_link ++ sacq -, n_sem_quad_in ++ srel -, n_sem_out ++ srel -, n_sem_quad_out ++ ++.else ++ bra -, ra_link ++ srel -, n_sem_sync ++ sacq -, n_sem_in ++.if n_sem_out % 4 != 0 ++ srel -, n_sem_out ++.else ++ nop ++.endif ++.endif ++.endif ++.endm ++ ++.set v_quads8, N_QPU_8 / 4 ++ ++::mc_sync_q0 ++ m_sync_q 0, v_quads8 ++::mc_sync_q1 ++ m_sync_q 1, v_quads8 ++::mc_sync_q2 ++ m_sync_q 2, v_quads8 ++::mc_sync_q3 ++ m_sync_q 3, v_quads8 ++::mc_sync_q4 ++ m_sync_q 4, v_quads8 ++::mc_sync_q5 ++ m_sync_q 5, v_quads8 ++::mc_sync_q6 ++ m_sync_q 6, v_quads8 ++::mc_sync_q7 ++ m_sync_q 7, v_quads8 ++::mc_sync_q8 ++ m_sync_q 8, v_quads8 ++::mc_sync_q9 ++ m_sync_q 9, v_quads8 ++::mc_sync_q10 ++ m_sync_q 10, v_quads8 ++::mc_sync_q11 ++ m_sync_q 11, v_quads8 + +# mc_exit() -+ -+::mc_interrupt_exit8c -+ ldtmu0 -+ ldtmu1 -+ ldtmu1 -+ mov -, vw_wait ; nop ; ldtmu0 # wait on the VDW -+ -+ mov -,sacq(0) # 1 -+ mov -,sacq(0) # 2 -+ mov -,sacq(0) # 3 -+ mov -,sacq(0) # 4 -+ mov -,sacq(0) # 5 -+ mov -,sacq(0) # 6 -+ mov -,sacq(0) # 7 -+# mov -,sacq(0) # 8 -+# mov -,sacq(0) # 9 -+# mov -,sacq(0) # 10 -+# mov -,sacq(0) # 11 -+ -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ +# Chroma & Luma the same now -+::mc_exit_c -+::mc_exit -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+ mov -,srel(0) ++.macro m_exit_qn ++ m_exit_drain ++ nop ; nop ; thrend ++ nop ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_qn ++::mc_exit_y_qn ++ m_exit_qn + -+ nop ; nop ; thrend -+ nop ; nop # delay slot 1 -+ nop ; nop # delay slot 2 + + +# mc_interrupt_exit12() -+::mc_interrupt_exit12 -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+ mov -,sacq(0) # 1 -+ mov -,sacq(0) # 2 -+ mov -,sacq(0) # 3 -+ mov -,sacq(0) # 4 -+ mov -,sacq(0) # 5 -+ mov -,sacq(0) # 6 -+ mov -,sacq(0) # 7 -+ mov -,sacq(0) # 8 -+ mov -,sacq(0) # 9 -+ mov -,sacq(0) # 10 -+ mov -,sacq(0) # 11 ++.macro m_exit_q0 ++ m_exit_drain ++ sacq -, 12 ++ nop ; nop ; thrend ++ mov interrupt, 1 ++ nop ++# >>> thrend <<< ++.endm + -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+ -+::mc_exit1 -+ mov 
-, vw_wait # wait on the VDW -+ -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ ldtmu1 -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 ++::mc_exit_c_q0 ++::mc_exit_y_q0 ++ m_exit_q0 + +# LUMA CODE + +# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. +# For P frames we make the second x,y coordinates offset by +8 + ++ +################################################################################ -+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) -+::mc_setup ++# mc_setup ++# ++# typedef struct qpu_mc_pred_y_s_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t pic_h; ++# uint16_t pic_w; ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_s_t; ++ ++.macro m_setup_y, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_pmask, 0xff ++.set v_blk_height, Y_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 1 ++.set v_pmask, 0xffff ++.set v_blk_height, Y_BLK_HEIGHT_16 ++.endif ++ ++ + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x -+ mov ra9, unif # ref_y_base -+ mov ra10, unif # y2_x2 -+ mov ra11, unif # ref_y2_base ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ mov ra11, unif # ref_y2_base ++ ++# load constants ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ++ ++ ++ mov ra_kff100100, 0xff100100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ ++# Compute part of VPM to use + +# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_xpitch, unif # stride2 ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++.if v_x_shift == 0 + sub rb_max_x, ra3.16b, 1 ++.else ++ sub r0, ra3.16b, 1 ++ shl rb_max_x, r0, v_x_shift ++.endif + sub rb_max_y, ra3.16a, 1 -+ mov rb_pitch, unif # stride1 ++ mov rb_pitch, unif # stride1 + +# get destination pitch + mov r1, vdw_setup_1(0) -+ or rb24, r1, rb_pitch ++ or rb_dma1_base, r1, rb_pitch + +# Compute base address for first and second access + mov r3, elem_num -+ add r0, ra8.16a, r3 # Load x + elem_num ++ add r0, ra0.16b, r3 # Load x + elem_num ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x + shl ra_xshift_next, r0, 3 # Compute shifts + -+ -+# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++# X is byte offset - we can only load words - mask + + and r0, r0, -4 ; v8subs r2, r2, r2 + sub r2, r2, rb_pitch + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base, ra9, r0 + -+ mov r1, ra8.16b # Load y -+ add ra_y, r1, 1 # Set for next -+ max r1, r1, 0 -+ min r1, r1, rb_max_y -+ -+# submit texture requests for first line -+ nop ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 -+ -+ + # r3 still contains elem_num -+ add r0, ra10.16a, r3 # Load x ++ add r0, ra1.16b, r3 # Load x ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts ++ shl rb_xshift2_next, r0, 3 # Compute shifts + + # r2 still contains mask + and r0, r0, -4 + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, 
rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base2, ra11, r0 + -+ mov r1, ra10.16b # Load y -+ add ra_y2, r1, 1 # Set for next -+ max r1, r1, 0 ++# Do preloads ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 + min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 + -+# submit texture requests for first line -+ nop ; mul24 r1, r1, rb_pitch -+ add t1s, ra_base2, r1 ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b + -+# load constants ++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom + -+ mov ra_k1, 1 -+ mov ra_k256, 256 -+ mov rb_k255, 255 -+ mov ra_k0, 0 ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++ mov ra_link, unif # Next fn + +# touch vertical context to keep simulator happy -+ + mov ra8, 0 ; mov rb8, 0 ++ bra -, ra_link + mov ra9, 0 ; mov rb9, 0 + mov ra10, 0 ; mov rb10, 0 + mov ra11, 0 ; mov rb11, 0 ++# >>> ra_link ++.endm + -+# Compute part of VPM to use -+ m_calc_dma_regs rb28, rb27 -+ -+# Weighted prediction denom -+ add rb13, unif, 9 # unif = weight denom + 6 -+ -+# submit texture requests for second line -+ max r1, ra_y, 0 -+ min r1, r1, rb_max_y -+ add ra_y, ra_y, 1 -+ mov -, unif ; mul24 r1, r1, rb_pitch # unused ; -+ add t0s, r1, ra_base -+ -+ max r1, ra_y2, 0 -+ min r1, r1, rb_max_y -+ add ra_y2, ra_y2, 1 -+ nop ; mul24 r1, r1, rb_pitch -+ add t1s, r1, ra_base2 -+ -+# FALL THROUGHT TO PER-BLOCK SETUP ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn ++ m_setup_y 8 + ++################################################################################ ++# +# Start of per-block setup code +# P and B blocks share the same setup code to save on Icache space -+:per_block_setup -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ mov ra_link, unif -+#### We do all the setup even if we are about to exit - reading junk from unif.... + -+ mov ra1, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ -+# per-channel shifts were calculated on the *previous* invocation -+ mov ra_xshift, ra_xshift_next -+ mov rb_xshift2, rb_xshift2_next ++# luma_setup_delay3 done in delay slots of branch that got us here + +# get base addresses and per-channel shifts for *next* invocation ++# per-channel shifts were calculated on the *previous* invocation + -+ add r0, ra1.16a, r3 # Load x -+ max r0, r0, 0 ++# 1st 3 instructions of per_block-setup in branch delay ++# ++# typedef struct qpu_mc_pred_y_p_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t h; ++# uint16_t w; ++# uint32_t mymx21; ++# uint32_t wo1; ++# uint32_t wo2; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p_t; ++# ++ ++.macro m_luma_setup, v_bit_depth ++# Hack - QASM may well have have label pasting but I have no idea how... ++.if v_bit_depth == 8 ++ brr ra_link, r:per_block_setup_8 ++.elif v_bit_depth == 10 ++ brr ra_link, r:per_block_setup_10 ++.endif ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? 
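++# (The unif reads in this macro and in per_block_setup appear to consume the
++#  qpu_mc_pred_y_p_t fields above in declaration order: next_src1 x_y/base,
++#  next_src2 x_y/base, h/w, mymx21, wo1, wo2, dst_addr, next_fn.)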
++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++.endm ++ ++.macro m_per_block_setup, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + min r0, r0, rb_max_x + + shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base_next, unif, r0 # Base1 -+ mov ra_y_next, ra1.16b # Load y -+ mov ra1, unif # x2_y2 -+ nop # ra1 delay -+ -+ add r0, ra1.16a, r3 # Load x2 -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ -+ shl rb_xshift2_next, r0, 3 # Compute shifts + and r0, r0, -4 -+ and r1, r0, r2 ++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add rb_base2_next, unif, r0 # Base1 -+ mov ra_y2_next, ra1.16b # Load y -+ mov ra_width_height, unif # width_height ++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y ++ add ra_base_next, ra_base_next, r0 # [ra1 delay] + -+# set up VPM write -+ mov vw_setup, rb28 # [ra1 delay] ++ add r0, ra1.16b, r3 # Load x2 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height ++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes ++ add rb_base2_next, rb_base2_next, r0 + -+# get width,height of block (unif load above) -+ sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width) -+ add rb17, ra_height, 5 ; mov r0, ra_height -+ mov r1, 16 -+ min r0, r0, r1 -+ add rb18, r0, 7 -+ shl r0, r0, 7 -+ add r0, r0, ra_width # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets ++# get width,height of block (unif load above), r1 = width * pel_size ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++ add rb_lcount, r0, 7 ++ shl r0, r0, v_dma_h_shift ++ add r0, r0, r1 # Combine width and height of destination area ++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight -+ mov r2, 0x01040400 # [ra5 delay] -+ shl ra8, r0, 3 ; mov rb14, ra5.16a ++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 ; mov r3, ra_k255 + +# Pack the 1st 4 filter coefs for H & V tightly ++# Coeffs are all abs 
values here as that means mul24 works (no sign extend from .8) + -+ mov r1,0x00010100 # -ve ++ mov r1,0x00010100 # -ve [ra8 delay] + ror ra2.8a, r1, ra8.8d + ror ra0.8a, r1, ra8.8c + -+ ror ra2.8b, r2, ra8.8d -+ ror ra0.8b, r2, ra8.8c ++ mov r1, 0x01040400 ++ ror ra2.8b, r1, ra8.8d ++ ror ra0.8b, r1, ra8.8c + + mov r1,0x050b0a00 # -ve + ror ra2.8c, r1, ra8.8d @@ -17390,49 +26337,44 @@ index 0000000..aa3fe47 + ror ra2.8d, r1, ra8.8d + ror ra0.8d, r1, ra8.8c + -+# In the 2nd vertical half we use b registers due to -+# using a-side fifo regs. The easiest way to achieve this to pack it -+# and then unpack! ++# In the 2nd vertical half we use b registers due to using a-side fifo regs + + mov r1,0x3a281100 -+ ror ra3.8a, r1, ra8.8d -+ ror ra1.8a, r1, ra8.8c ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 + + mov r1,0x0a0b0500 # -ve -+ ror ra3.8b, r1, ra8.8d -+ ror ra1.8b, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 + + mov r1,0x04040100 -+ ror ra3.8c, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++ ++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + mov r1,0x01010000 # -ve -+ ror ra3.8d, r1, ra8.8d -+ ror ra1.8d, r1, ra8.8c -+ -+# Extract weighted prediction information in parallel -+# We are annoyingly A src limited here -+ -+ mov rb4, ra3.8a ; mov ra18, unif -+ mov rb5, ra3.8b -+ mov rb6, ra3.8c -+ mov.ifnz ra5, ra18 -+ -+ mov rb_dest, unif # Destination address ++ ror r0, r1, ra8.8d + + bra -, ra_link ++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 + -+ shl r0, ra5.16b, rb13 # Offset calc -+ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use -+ mov r3, 0 ; mov rb7, ra3.8d ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +# >>> branch ra_link -+# -+# r3 = 0 -+# ra18.16a = weight L1 -+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) -+# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) -+# rb13 = weight denom + 6 + 9 -+# rb14 = weight L0 ++ ++# r5 = 0 ++# ra_wt_mul_l1 = weight L1 ++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) ++# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) ++# rb_wt_den_p15 = weight denom + 6 + 9 ++# rb_wt_mul_l0 = weight L0 ++.endm ++ ++:per_block_setup_8 ++ m_per_block_setup 8 ++ + + +################################################################################ @@ -17440,137 +26382,118 @@ index 0000000..aa3fe47 +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + -+::mc_filter -+# ra5.16a = weight << 16; We want weight * 2 in rb14 ++.macro m_filter_y_pxx, v_bit_depth ++ m_luma_setup v_bit_depth + -+ shl rb14, ra5.16a, 1 ++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + -+# r3 = 0 ++# r5 = 0 (loop count) + -+:yloop ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? -+ +# N.B. 
Whilst y == y2 as far as this loop is concerned we will start +# the grab for the next block before we finish with this block and that +# might be B where y != y2 so we must do full processing on both y and y2 + -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ; mov ra7, ra8 ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ 
"mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloop -+ mov ra9, ra10 ; mov rb9, rb10 -+ mov ra10, ra11 ; mov rb10, rb11 -+ mov ra11, r0 ; mov rb11, r1 -+ # >>> .anyn yloop ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++ mov ra10, ra11 ; mov rb10, rb11 ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov -, vw_wait ++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb4 ++ add r1, r1, r0 ; mul24 r0, ra9, rb5 ++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++ add r1, r1, r0 ; mul24 r0, ra11, rb7 ++ sub r1, r1, r0 +# At this point r1 is a 22-bit signed quantity: 8 (original sample), +# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) +# The top 8 bits have rubbish in them as mul24 is unsigned +# The low 6 bits need discard before weighting -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + asr r1, r1, 14 -+ nop ; mul24 r1, r1, rb14 -+ add r1, r1, rb12 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop ++ ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch + -+ shl r1, r1, 8 -+ brr.anyn -, r:yloop -+ asr r1, r1, rb13 -+# We have a saturating pack unit - I can't help feeling it should be useful here -+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255 -+ max vpm, r1, 0 # Delay 3 +# >>> branch.anyn yloop + -+# If looping again the we consumed 16 height last loop -+ # rb29 (stride) remains constant -+ # rb17 remains const (based on total height) -+ # recalc rb26, rb18 based on new segment height -+ # N.B. 
r3 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ mov r1, 16 -+ sub r0, ra_height, r1 -+ mov ra_height, r0 -+ max.setf r0, r0, 0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out -+ brr.anyz -, r:per_block_setup -+ mov vw_setup, rb26 # VDW setup 0 Delay 1 -+ mov vw_setup, rb29 # Stride Delay 2 -+ mov vw_addr, rb_dest # start the VDW Delay 3 -+# >>> .anyz per_block_setup -+ -+ min r0, r0, r1 -+ add rb18, rb18, r0 -+ sub r0, r0, r1 -+ shl r0, r0, i_shift23 -+ add rb26, rb26, r0 -+ -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 -+ -+ mov vw_setup, rb28 # Reset our VDM write pointer -+ -+ brr -, r:yloop -+ nop -+ nop -+ nop -+# >>> -+ ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + ++::mc_filter_y_pxx ++ m_filter_y_pxx 8 + + +################################################################################ @@ -17578,243 +26501,1106 @@ index 0000000..aa3fe47 +# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, only the first half of coefficients contain used information. +# At this point we have already issued two pairs of texture requests for the current block -+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?) -+# Can fill in the coefficients so only -+# Can also assume default weighted prediction for B frames. +# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +# Or possibly by taking advantage of symmetry? -+# From 19->7 32bits per command. + -+::mc_filter_b -+ # r0 = weightL0 << 16, we want it in rb14 -+# asr rb14, r0, i_shift16 ++.macro m_filter_y_bxx, v_bit_depth ++ m_luma_setup v_bit_depth + -+:yloopb -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? 
-+ -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ; mov ra7, ra8 ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloopb -+ mov ra9, ra10 ; mov rb9, rb10 -+ mov ra10, ra11 ; mov rb10, rb11 -+ mov ra11, r0 ; mov rb11, r1 -+ # 
>>> .anyn yloopb ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++ mov ra10, ra11 ; mov rb10, rb11 ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov r2, rb12 ++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb4 ++ add r1, r1, r0 ; mul24 r0, ra9, rb5 ++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++ add r1, r1, r0 ; mul24 r0, ra11, rb7 ++ sub r1, r1, r0 ; mov r2, rb_wt_off +# As with P-pred r1 is a 22-bit signed quantity in 32-bits +# Top 8 bits are bad - low 6 bits should be discarded -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + + asr r1, r1, 14 -+ nop ; mul24 r0, r1, rb14 -+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 ++ nop ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 + -+ add r1, r1, r0 ; mov -, vw_wait -+ shl r1, r1, 8 ++ add r1, r1, r0 ; mov r3, ra_blk_height ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+ brr.anyn -, r:yloopb -+ asr r1, r1, rb13 # Delay 1 -+ min r1, r1, rb_k255 # Delay 2 -+ max vpm, r1, 0 # Delay 3 ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height + -+# If looping again the we consumed 16 height last loop -+ # rb29 (stride) remains constant -+ # rb17 remains const (based on total height) -+ # recalc rb26, rb18 based on new segment height -+ # N.B. 
r3 is loop counter still -+ -+ mov r1, 16 -+ sub r0, ra_height, r1 -+ mov ra_height, r0 -+ max.setf r0, r0, 0 # Done if Z now ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out -+ brr.anyz -, r:per_block_setup -+ mov vw_setup, rb26 # VDW setup 0 Delay 1 -+ mov vw_setup, rb29 # Stride Delay 2 -+ mov vw_addr, rb_dest # start the VDW Delay 3 -+# >>> .anyz per_block_setup ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + -+ min r0, r0, r1 -+ add rb18, rb18, r0 -+ sub r0, r0, r1 -+ shl r0, r0, i_shift23 -+ add rb26, rb26, r0 ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 -+ -+ mov vw_setup, rb28 # Reset our VDM write pointer -+ -+ brr -, r:yloopb -+ nop -+ nop -+ nop ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 + +################################################################################ ++# ++# typedef struct qpu_mc_pred_y_p00_s { ++# qpu_mc_src_t next_src1; ++# uint16_t h; ++# uint16_t w; ++# uint32_t wo1; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p00_t; ++ ++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r3, elem_num # y_x ++ mov ra_xshift, ra_xshift_next # [ra0 delay] ++ add r0, ra0.16b, r3 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write ++ ++# get width,height of block (unif load above) ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset ++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr ++ add rb_dma0, r0, rb_dma0_base ++ ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 ++ ++################################################################################ ++ ++.macro m_filter_y_b00, v_bit_depth ++# luma setup does a fair bit more than we need calculating filter coeffs ++# that we will never use but it saves I-cache to use it (also simple!) ++ m_luma_setup v_bit_depth ++ ++# Fix up vals that were expecting a filter (somewhat icky) ++ mov r0, 7 ++ sub rb_i_tmu, rb_i_tmu, r0 ++ sub rb_lcount, rb_lcount, r0 ++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++ shl rb_wt_off, rb_wt_off, r0 ++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++ ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++ add r1, r0, r1 ++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ 
brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 ++ ++################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct map) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 ++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ + +::mc_end +# Do not add code here because mc_end must appear after all other code. diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..27cbb59 +index 0000000000..9f8983da52 --- /dev/null +++ b/libavcodec/rpi_shader_cmd.h -@@ -0,0 +1,88 @@ +@@ -0,0 +1,128 @@ +#ifndef RPI_SHADER_CMD_H +#define RPI_SHADER_CMD_H + +#pragma pack(push, 4) + -+typedef struct qpu_mc_pred_c_s { ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
++typedef const uint8_t * qpu_mc_src_addr_t; ++typedef uint8_t * qpu_mc_dst_addr_t; ++#else ++typedef uint32_t qpu_mc_src_addr_t; ++typedef uint32_t qpu_mc_dst_addr_t; ++#endif ++ ++typedef struct qpu_mc_src_s ++{ ++ int16_t y; ++ int16_t x; ++ qpu_mc_src_addr_t base; ++} qpu_mc_src_t; ++ ++ ++typedef struct qpu_mc_pred_c_p_s { ++ qpu_mc_src_t next_src; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; -+ int16_t next_src_y; -+ int16_t next_src_x; -+ uint32_t next_src_base_c; ++} qpu_mc_pred_c_p_t; ++ ++typedef struct qpu_mc_pred_c_b_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x1; ++ uint32_t coeffs_y1; ++ uint32_t weight_u1; ++ uint32_t weight_v1; ++ qpu_mc_src_t next_src2; ++ uint32_t coeffs_x2; ++ uint32_t coeffs_y2; ++ uint32_t wo_u2; ++ uint32_t wo_v2; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_b_t; ++ ++typedef struct qpu_mc_pred_c_s_s { ++ qpu_mc_src_t next_src1; ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ qpu_mc_src_t next_src2; ++ uint32_t next_fn; ++} qpu_mc_pred_c_s_t; ++ ++typedef struct qpu_mc_pred_c_s { + union { -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_c; -+ } p; -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t weight_u; -+ uint32_t weight_v; -+ uint32_t dummy0; -+ } b0; -+ struct { -+ uint32_t dummy0; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_c; -+ } b1; -+ struct { -+ uint32_t pic_cw; // C Width (== Y width / 2) -+ uint32_t pic_ch; // C Height (== Y Height / 2) -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+ uint32_t dummy0; -+ } s0; -+ struct { -+ uint32_t dummy0; -+ uint32_t dummy1; -+ uint32_t dummy2; -+ uint32_t dummy3; -+ uint32_t dummy4; -+ uint32_t dummy5; -+ } s1; ++ qpu_mc_pred_c_p_t p; ++ qpu_mc_pred_c_b_t b; ++ qpu_mc_pred_c_s_t s; + }; +} qpu_mc_pred_c_t; + -+typedef struct qpu_mc_pred_y_s { -+ int16_t next_src1_x; -+ int16_t next_src1_y; -+ uint32_t next_src1_base; -+ int16_t next_src2_x; -+ int16_t next_src2_y; -+ uint32_t next_src2_base; -+ union { -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t mymx21; -+ uint32_t wo1; -+ uint32_t wo2; -+ uint32_t dst_addr; -+ } p; -+ struct { -+ uint16_t pic_h; -+ uint16_t pic_w; -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+ uint32_t dummy0; -+ } s; -+ }; ++ ++typedef struct qpu_mc_pred_y_p_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; ++} qpu_mc_pred_y_p_t; ++ ++typedef struct qpu_mc_pred_y_p00_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t wo1; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p00_t; ++ ++typedef struct qpu_mc_pred_y_s_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t next_fn; ++} qpu_mc_pred_y_s_t; ++ ++// Only a useful structure in that it allows us to return something other than a void * ++typedef struct qpu_mc_pred_y_s { ++ union { ++ qpu_mc_pred_y_p_t p; 
++ qpu_mc_pred_y_p00_t p00; ++ qpu_mc_pred_y_s_t s; ++ }; +} qpu_mc_pred_y_t; + ++typedef union qpu_mc_pred_cmd_u { ++ qpu_mc_pred_y_t y; ++ qpu_mc_pred_c_t c; ++ uint32_t data[1]; ++} qpu_mc_pred_cmd_t; ++ ++#define QPU_MC_PRED_N_Y8 12 ++#define QPU_MC_PRED_N_C8 12 ++ ++#define QPU_MC_PRED_N_Y10 12 ++#define QPU_MC_PRED_N_C10 12 ++ +#pragma pack(pop) + +#endif + +diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c +new file mode 100644 +index 0000000000..1925ab7a79 +--- /dev/null ++++ b/libavcodec/rpi_shader_template.c +@@ -0,0 +1,65 @@ ++#ifdef RPI ++ ++#include "hevc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "rpi_shader_cmd.h" ++#include "rpi_shader_template.h" ++ ++typedef struct shader_track_s ++{ ++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ const struct qpu_mc_src_s *last_l0; ++ const struct qpu_mc_src_s *last_l1; ++ uint32_t width; // pic_width * PW ++ uint32_t height; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++} shader_track_t; ++ ++static int wtoidx(const unsigned int w) ++{ ++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ return pel_weight[w]; ++} ++ ++static const int fctom(uint32_t x) ++{ ++ int rv; ++ // As it happens we can take the 2nd filter term & divide it by 8 ++ // (dropping fractions) to get the fractional move ++ rv = 8 - ((x >> 11) & 0xf); ++ av_assert2(rv >= 0 && rv <= 7); ++ return rv; ++} ++ ++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++{ ++ return (x << shl) >> shr; ++} ++ ++static inline int woff_p(HEVCContext *const s, int32_t x) ++{ ++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int woff_b(HEVCContext *const s, int32_t x) ++{ ++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int wweight(int32_t x) ++{ ++ return ext(x, 16, 16); ++} ++ ++ ++#define PW 1 ++#include "rpi_shader_template_fn.h" ++ ++#undef PW ++#define PW 2 ++#include "rpi_shader_template_fn.h" ++ ++#endif ++ +diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h +new file mode 100644 +index 0000000000..ecf5b8185a +--- /dev/null ++++ b/libavcodec/rpi_shader_template.h +@@ -0,0 +1,24 @@ ++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++ ++#ifdef RPI ++struct HEVCContext; ++struct HEVCRpiInterPredEnv; ++ ++void rpi_shader_c8(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_shader_c16(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_sand_dump8(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++void rpi_sand_dump16(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++#endif ++#endif ++ +diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h +new file mode 100644 +index 0000000000..b5ac2ceed6 +--- /dev/null ++++ b/libavcodec/rpi_shader_template_fn.h +@@ -0,0 +1,477 @@ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++#define 
PATCH_STRIDE (16 * PW) ++ ++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { ++ const pixel s = *(const pixel *)src; ++ pixel * d = (pixel *)dst; ++ for (unsigned int j = 0; j < w; j += PW) { ++ *d++ = s; ++ } ++ } ++} ++ ++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride) { ++ memcpy(dst, src, w); ++ } ++} ++ ++static void FUNC(get_patch_y)(const shader_track_t * const st, ++ uint8_t * dst, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > st->width) { ++ if (x >= st->width) ++ x = st->width - PW; ++ dr = (x + w) - st->width; ++ w = st->width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > st->height) { ++ if (y >= st->height) ++ y = st->height - 1; ++ db = (y + h) - st->height; ++ h = st->height - y; ++ } ++ ++ dst += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); ++ if (dr != 0) ++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); ++ w += dl + dr; ++ dst -= dl; ++ ++ if (dt != 0) ++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); ++ if (db != 0) ++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); ++} ++ ++ ++ ++static void FUNC(get_patch_c)(const shader_track_t * const st, ++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ const int width = st->width; ++ const int height = st->height; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > width) { ++ if (x >= width) ++ x = width - PW; ++ dr = (x + w) - width; ++ w = width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > height) { ++ if (y >= height) ++ y = height - 1; ++ db = (y + h) - height; ++ h = height - y; ++ } ++ ++ dst_u += dl + dt * dst_stride; ++ dst_v += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ { ++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); ++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); ++ } ++ if (dr != 0) ++ { ++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); ++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); ++ } ++ w += dl + dr; ++ dst_u -= dl; ++ dst_v -= dl; ++ ++ if (dt != 0) ++ { ++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); ++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); ++ } ++ if (db != 0) ++ { ++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); ++ 
FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); ++ } ++} ++ ++// w, y, w, h in pixels ++// stride1, stride2 in bytes ++void FUNC(rpi_sand_dump)(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) ++{ ++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; ++ ++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); ++ ++ if (is_c) { ++ x *= 2; ++ w *= 2; ++ } ++ ++ for (int i = y; i != y + h; ++i) { ++ for (int j = x; j != x + w; ++j) { ++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; ++ char sep = is_c && (j & 1) == 0 ? ':' : ' '; ++#if PW == 1 ++ if (j < 0 || i < 0) ++ printf("..%c", sep); ++ else ++ printf("%02x%c", *(const pixel*)p, sep); ++#else ++ if (j < 0 || i < 0) ++ printf("...%c", sep); ++ else ++ printf("%03x%c", *(const pixel*)p, sep); ++#endif ++ } ++ printf("\n"); ++ } ++} ++ ++ ++void FUNC(rpi_shader_c)(HEVCContext *const s, ++ const HEVCRpiInterPredEnv *const ipe_y, ++ const HEVCRpiInterPredEnv *const ipe_c) ++{ ++ for (int c_idx = 0; c_idx < 2; ++c_idx) ++ { ++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; ++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; ++ unsigned int exit_n = 0; ++ ++ if (ipe == NULL || !ipe->used) { ++ continue; ++ } ++ ++ do { ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ const HEVCRpiInterPredQ * const q = ipe->q + i; ++ shader_track_t * const st = tracka + i; ++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; ++ ++ for (;;) { ++ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1]; ++ ++ if (link == q->code_setup) { ++ if (c_idx == 0) { ++ // Luma ++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++ ++ st->height = c->pic_h; ++ st->width = c->pic_w * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else { ++ // Chroma ++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++ ++ st->height = c->pic_ch; ++ st->width = c->pic_cw * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ } ++ else if (link == s->qpu.y_pxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ const int w1 = FFMIN(c->w, 8); ++ const int w2 = c->w - w1; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ if (w2 > 0) { ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ } ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ if (w2 > 0) { ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 
0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ } ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_bxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_p00) { ++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ ++ st->last_l0 = &c->next_src1; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_b00) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ av_assert0(c->w <= 16 && c->h <= 64); ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( ++ patch_y3, patch_y1, PATCH_STRIDE, ++ c->h, 0, 0, c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), 0, 0, c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ 
s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx_l1) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l1 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_bxx) { ++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; ++ const int mx1 = fctom(c->coeffs_x1); ++ const int my1 = fctom(c->coeffs_y1); ++ const int mx2 = fctom(c->coeffs_x2); ++ const int my2 = fctom(c->coeffs_y2); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; ++ uint8_t patch_v1[PATCH_STRIDE * 72]; ++ uint8_t patch_u2[PATCH_STRIDE * 72]; ++ uint8_t patch_v2[PATCH_STRIDE * 72]; ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, ++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), ++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, ++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), ++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == q->code_sync) { ++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); ++ break; ++ } ++ else if (link == q->code_exit) { ++ // We expect exit to occur without other 
sync ++ av_assert0(i == exit_n); ++ ++exit_n; ++ break; ++ } ++ else { ++ av_assert0(0); ++ } ++ } ++ ++ st->qpu_mc_curr = cmd; ++ } ++ } while (exit_n == 0); ++ } ++} ++ ++#undef FUNC ++#undef pixel ++ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 -index 0000000..b061fe0 +index 0000000000..b502de0a2c --- /dev/null +++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,581 @@ +@@ -0,0 +1,745 @@ +#include "config.h" +#ifdef RPI ++#include "libavcodec/avcodec.h" +#include "rpi_qpu.h" +#include "rpi_mailbox.h" +#include "rpi_zc.h" +#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" +#include + +#include "libavutil/buffer_internal.h" @@ -17841,21 +27627,11 @@ index 0000000..b061fe0 + struct ZcPool * pool; +} ZcPoolEnt; + -+#if 1 -+//#define ALLOC_PAD 0x1000 -+#define ALLOC_PAD 0 -+#define ALLOC_ROUND 0x1000 -+//#define ALLOC_N_OFFSET 0x100 -+#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 0x80 -+#define STRIDE_OR 0x80 -+#else +#define ALLOC_PAD 0 +#define ALLOC_ROUND 0x1000 +#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 32 ++#define STRIDE_ROUND 64 +#define STRIDE_OR 0 -+#endif + +#define DEBUG_ZAP0_BUFFERS 0 + @@ -18032,13 +27808,22 @@ index 0000000..b061fe0 + { + case AV_PIX_FMT_YUV420P: + geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ // geo.stride_y = ((video_width + 32 + 31) & ~31); + geo.stride_c = geo.stride_y / 2; -+ // geo.height_y = (video_height + 15) & ~15; + geo.height_y = (video_height + 32 + 31) & ~31; + geo.height_c = geo.height_y / 2; + geo.planes_c = 2; + geo.stripes = 1; ++ geo.bytes_per_pel = 1; ++ break; ++ ++ case AV_PIX_FMT_YUV420P10: ++ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 2; + break; + + case AV_PIX_FMT_SAND128: @@ -18073,6 +27858,7 @@ index 0000000..b061fe0 + geo.height_c = img.pitch / stripe_w - geo.height_y; + geo.planes_c = 1; + geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; + + pthread_mutex_unlock(&sand_lock); + @@ -18081,6 +27867,45 @@ index 0000000..b061fe0 + break; + } + ++ case AV_PIX_FMT_SAND64_16: ++ case AV_PIX_FMT_SAND64_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV_16, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 2; ++ ++ pthread_mutex_unlock(&sand_lock); ++ break; ++ } ++ + default: + memset(&geo, 0, sizeof(geo)); + break; @@ -18153,8 +27978,12 @@ index 0000000..b061fe0 + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + 
frame->linesize[2] = geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). ++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply + if (geo.stripes > 1) -+ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ frame->linesize[3] = geo.height_y + geo.height_c; + + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; @@ -18164,6 +27993,11 @@ index 0000000..b061fe0 + frame->extended_data = frame->data; + // Leave extended buf alone + ++#if RPI_ZC_SAND_8_IN_10_BUF != 0 ++ // *** If we intend to use this for real we will want a 2nd buffer pool ++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge ++#endif ++ + return 0; +} + @@ -18182,7 +28016,7 @@ index 0000000..b061fe0 + rv = avcodec_default_get_buffer2(s, frame, flags); + } + else if (frame->format == AV_PIX_FMT_YUV420P || -+ frame->format == AV_PIX_FMT_SAND128) ++ av_rpi_is_sand_frame(frame)) + { + rv = rpi_get_display_buffer(s->get_buffer_context, frame); + } @@ -18212,6 +28046,7 @@ index 0000000..b061fe0 + unsigned int i; + uint8_t * psrc, * pdest; + ++ dest->format = src->format; + dest->width = src->width; + dest->height = src->height; + @@ -18243,29 +28078,142 @@ index 0000000..b061fe0 +} + + ++static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src) ++{ ++ AVFrame dest_frame; ++ AVFrame * const dest = &dest_frame; ++ unsigned int i; ++ uint8_t * psrc, * psrc2, * pdest; ++ ++ memset(dest, 0, sizeof(*dest)); ++ dest->format = AV_PIX_FMT_SAND128; ++ dest->width = src->width; ++ dest->height = src->height; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; ++ i != dest->height; ++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) ++ { ++ uint16_t * s = (uint16_t*)psrc; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k); ++ for (unsigned int j = 0; j != n; ++j) ++ *d++ = (uint8_t)(*s++ >> 2); ++ d += (dest->linesize[3] - 1) * dest->linesize[0]; ++ } ++ } ++ ++ // C ++ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1]) ++ { ++ const uint16_t * su = (uint16_t*)psrc; ++ const uint16_t * sv = (uint16_t*)psrc2; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2; ++ for (unsigned int j = 0; j != n; ++j) ++ { ++ *d++ = (uint8_t)(*su++ >> 2); ++ *d++ = (uint8_t)(*sv++ >> 2); ++ } ++ d += (dest->linesize[3] - 1) * dest->linesize[1]; ++ } ++ } ++ ++ return dest->buf[0]; ++} ++ ++ ++static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src, const unsigned int src_bits) ++{ ++ AVFrame dest_frame = { ++ .format = AV_PIX_FMT_SAND128, ++ .width = src->width, ++ .height = src->height ++ }; ++ AVFrame * const dest = &dest_frame; ++ const unsigned int shr = src_bits - 8; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ av_rpi_sand16_to_sand8(dest->data[0], 
dest->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height, shr); ++ // C ++ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height / 2, shr); ++ ++ return dest->buf[0]; ++} ++ ++ ++ +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy) ++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) +{ + assert(s != NULL); + + if (frame->format != AV_PIX_FMT_YUV420P && -+ frame->format != AV_PIX_FMT_SAND128) ++ frame->format != AV_PIX_FMT_YUV420P10 && ++ !av_rpi_is_sand_frame(frame)) + { + av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + -+ if (frame->buf[1] != NULL) ++ if (frame->buf[1] != NULL || frame->format != expected_format) + { -+ av_assert0(frame->format == AV_PIX_FMT_YUV420P); ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) ++ { ++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); ++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); ++ } ++#endif ++ + if (maycopy) + { -+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); -+ return zc_copy(s, frame); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); ++ ++ switch (frame->format) ++ { ++ case AV_PIX_FMT_YUV420P10: ++ return zc_420p10_to_sand128(s, frame); ++ ++ case AV_PIX_FMT_SAND64_10: ++ return zc_sand64_16_to_sand128(s, frame, 10); ++ ++ default: ++ return zc_copy(s, frame); ++ } + } + else + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); + return NULL; + } + } @@ -18392,10 +28340,10 @@ index 0000000..b061fe0 + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..f4aeb78 +index 0000000000..26fb3be999 --- /dev/null +++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,137 @@ +@@ -0,0 +1,105 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + @@ -18406,23 +28354,33 @@ index 0000000..f4aeb78 +// bit of memory for the frame when can then be reference counted until +// display has finished with it. 
+ -+#include "libavutil/frame.h" -+#include "libavcodec/avcodec.h" ++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame ++// 0 disables ++// *** This option still in development ++// Only works if SAO active ++// Allocates buffers that are twice the required size ++#define RPI_ZC_SAND_8_IN_10_BUF 0 ++ ++struct AVBufferRef; ++struct AVFrame; ++struct AVCodecContext; ++enum AVPixelFormat; + +// "Opaque" pointer to whatever we are using as a buffer reference -+typedef AVBufferRef * AVRpiZcRefPtr; ++typedef struct AVBufferRef * AVRpiZcRefPtr; + +struct AVZcEnv; +typedef struct AVZcEnv * AVZcEnvPtr; + +typedef struct AVRpiZcFrameGeometry +{ -+ unsigned int stride_y; -+ unsigned int height_y; -+ unsigned int stride_c; -+ unsigned int height_c; -+ unsigned int planes_c; -+ unsigned int stripes; ++ unsigned int stride_y; // Luma stride (bytes) ++ unsigned int height_y; // Luma height (lines) ++ unsigned int stride_c; // Chroma stride (bytes) ++ unsigned int height_c; // Chroma stride (lines) ++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) ++ unsigned int stripes; // Number of stripes (sand) ++ unsigned int bytes_per_pel; +} AVRpiZcFrameGeometry; + + @@ -18448,7 +28406,7 @@ index 0000000..f4aeb78 +// the data, then allocate a new buffer and copy the data into it +// Otherwise return NULL +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy); ++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); + +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid @@ -18489,52 +28447,10 @@ index 0000000..f4aeb78 + + + -+static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x = x_c * 2; -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); -+} -+ -+static inline int rpi_sliced_frame(const AVFrame * const frame) -+{ -+ return frame->format == AV_PIX_FMT_SAND128; -+} -+ -+ +#endif + diff --git a/libavcodec/utils.c b/libavcodec/utils.c -index f7adb52..3b398a3 100644 +index c4af9cbb17..c1b806e51b 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -26,6 +26,12 @@ @@ -18550,7 +28466,15 @@ index f7adb52..3b398a3 100644 #include "libavutil/atomic.h" #include "libavutil/attributes.h" #include "libavutil/avassert.h" -@@ -64,6 +70,10 @@ +@@ -39,6 +45,7 @@ + #include 
"libavutil/mathematics.h" + #include "libavutil/mem_internal.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" + #include "libavutil/imgutils.h" + #include "libavutil/samplefmt.h" + #include "libavutil/dict.h" +@@ -64,6 +71,10 @@ #include "libavutil/ffversion.h" const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION; @@ -18561,7 +28485,7 @@ index f7adb52..3b398a3 100644 #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS static int default_lockmgr_cb(void **arg, enum AVLockOp op) { -@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, +@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, return ret; } @@ -18609,7 +28533,7 @@ index f7adb52..3b398a3 100644 static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) { FramePool *pool = avctx->internal->pool; -@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) +@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) av_buffer_pool_uninit(&pool->pools[i]); pool->linesize[i] = linesize[i]; if (size[i]) { @@ -18624,20 +28548,20 @@ index f7adb52..3b398a3 100644 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? NULL : -@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags +@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags { int ret; +#ifdef RPI + // This is going to end badly if we let it continue -+ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++ av_assert0(!av_rpi_is_sand_frame(frame)); +#endif + if ((ret = update_frame_pool(avctx, frame)) < 0) return ret; diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c -index 21f8d9e..71ce7b9 100644 +index 21f8d9e00d..71ce7b9186 100644 --- a/libavfilter/avfilter.c +++ b/libavfilter/avfilter.c @@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) @@ -18649,7 +28573,7 @@ index 21f8d9e..71ce7b9 100644 #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR if ( !strcmp(filter->filter->name, "format") || diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c -index b31d233..2767306 100644 +index 6767b65ec8..f270190d57 100644 --- a/libavformat/mpegts.c +++ b/libavformat/mpegts.c @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { @@ -18662,10 +28586,10 @@ index b31d233..2767306 100644 { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, diff --git a/libavformat/utils.c b/libavformat/utils.c -index 6f343f2..83f26d5 100644 +index 5a35953d24..d36fdc3199 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c -@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in +@@ -694,7 +694,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in int default_stream_index = av_find_default_stream_index(s); if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { for (i = 0; i < s->nb_streams; i++) { @@ -18674,8 +28598,84 @@ index 6f343f2..83f26d5 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 1e061763a2..cbc9bc145b 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -59,6 +59,8 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ 
++ rpi_sand_fn_pw.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -136,6 +138,7 @@ OBJS = adler32.o \ + reverse.o \ + rc4.o \ + ripemd.o \ ++ rpi_sand_fns.o \ + samplefmt.o \ + sha.o \ + sha512.o \ +diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 +--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ + + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +new file mode 100644 +index 0000000000..dbffdaefa4 +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,40 @@ ++#include "libavutil/arm/asm.S" ++ ++@ void rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, [r0] ++@ const uint8_t * src1, [r1] ++@ const uint8_t * src2, [r2] ++@ unsigned int lines); [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 694e116..203ca7b 100644 +index 694e116a3c..203ca7b3a8 100644 --- a/libavutil/buffer.c +++ b/libavutil/buffer.c @@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) @@ -18689,7 +28689,7 @@ index 694e116..203ca7b 100644 + return buf->opaque; +} diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 0c0ce12..82e0bc3 100644 +index 0c0ce12cf2..82e0bc3058 100644 --- a/libavutil/buffer.h +++ b/libavutil/buffer.h @@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); @@ -18702,11 +28702,51 @@ index 0c0ce12..82e0bc3 100644 /** * @} */ +diff --git a/libavutil/frame.h b/libavutil/frame.h +index 2b5c3320c3..990347e484 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -120,7 +120,20 @@ enum AVFrameSideDataType { + * The GOP timecode in 25 bit timecode format. Data format is 64-bit integer. + * This is set on the first frame of a GOP that has a temporal reference of 0. + */ +- AV_FRAME_DATA_GOP_TIMECODE ++ AV_FRAME_DATA_GOP_TIMECODE, ++ ++ /** ++ * The data represents the AVSphericalMapping structure defined in ++ * libavutil/spherical.h. 
++ */ ++ AV_FRAME_DATA_SPHERICAL, ++ ++ /** ++ * Extra data required to deal with a cropped Sand frame ++ * AVFrame holds the cropped size, but we cannot simply offset the start ++ * address to get the picture as we can for planar formats ++ */ ++ AV_FRAME_DATA_SAND_INFO, + }; + + enum AVActiveFormatDescription { +@@ -133,6 +146,13 @@ enum AVActiveFormatDescription { + AV_AFD_SP_4_3 = 15, + }; + ++typedef struct AVFrameDataSandInfo ++{ ++ unsigned int left_offset; ++ unsigned int top_offset; ++ unsigned int pic_width; ++ unsigned int pic_height; ++} AVFrameDataSandInfo; + + /** + * Structure to hold side data for an AVFrame. diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index 0dffa4d..5644176 100644 +index 0dffa4dbdb..17134b4f38 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c -@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { +@@ -2088,6 +2088,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, }, @@ -18721,35 +28761,481 @@ index 0dffa4d..5644176 100644 + { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ + }, + .flags = 0, -+ } ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ ++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ ++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, }; #if FF_API_PLUS1_MINUS1 FF_ENABLE_DEPRECATION_WARNINGS diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 0ed01c4..4705e80 100644 +index 0ed01c4844..2155b78704 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h -@@ -303,7 +303,10 @@ enum AVPixelFormat { +@@ -303,7 +303,22 @@ enum AVPixelFormat { AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian - AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions ++ AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec ++ ++ AV_PIX_FMT_GRAY12BE, ///< Y , 12bpp, big-endian ++ AV_PIX_FMT_GRAY12LE, ///< Y , 12bpp, little-endian ++ AV_PIX_FMT_GRAY10BE, ///< Y , 10bpp, big-endian ++ AV_PIX_FMT_GRAY10LE, ///< Y , 10bpp, little-endian ++ ++ AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian ++ AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian ++ +// RPI - not on ifdef so can be got at by calling progs -+ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..52d52a2a83 +--- /dev/null ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,182 @@ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 
1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); ++ } ++ } ++} ++ ++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++} ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * 
dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++} ++ ++ ++#undef pixel ++#undef STRCAT ++#undef FUNC ++ +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +new file mode 100644 +index 0000000000..b8bfad915e +--- /dev/null ++++ b/libavutil/rpi_sand_fns.c +@@ -0,0 +1,96 @@ ++#include "config.h" ++#include ++#include ++#include "rpi_sand_fns.h" ++#include "avassert.h" ++ ++#define PW 1 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#define PW 2 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#if HAVE_NEON ++void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines); ++#endif ++ ++#if 1 ++// Simple round ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ const unsigned int rnd = (1 << shr) >> 1; ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ *dst++ = (*src++ + rnd) >> shr; ++ } ++} ++#else ++// Dithered variation ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ unsigned int rnd = (1 << shr) >> 1; ++ const unsigned int mask = ((1 << shr) - 1); ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ rnd = *src++ + (rnd & mask); ++ *dst++ = rnd >> shr; ++ } ++} ++#endif ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr) ++{ ++ const unsigned int n = dst_stride1 / 2; ++ unsigned int j; ++ ++ // This is true for our current layouts ++ av_assert0(dst_stride1 == 
src_stride1); ++ ++ // As we have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ } ++ } ++} ++ +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..48948ecb47 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,127 @@ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, 
const unsigned int shr);
++
++
++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
++{
++ // * We could replace this with a fixed 128 which would allow the compiler
++ // to optimize a whole lot better
++ return frame->linesize[0];
++}
++
++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
++{
++ return frame->linesize[3];
++}
++
++
++static inline int av_rpi_is_sand_format(const int format)
++{
++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++ return av_rpi_is_sand_format(frame->format);
++}
++
++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_SAND128);
++}
++
++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
++{
++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
++{
++ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
++}
++
++// If x is measured in bytes (not pixels) then this works for sand64_16 as
++// well as sand128 - but in the general case we work that out
++
++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
++}
++
++#endif
++
diff --git a/libswscale/input.c b/libswscale/input.c
index 14ab5ab..e61b67a 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
-@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
 }
 }
 
-+
+static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
@@ -18760,112 +29246,418 @@ index 14ab5ab..e61b67a 100644
 #define input_pixel(pos) (isBE(origin) ? 
AV_RB16(pos) : AV_RL16(pos)) static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, -@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) +@@ -1085,6 +1092,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_P010BE: c->chrToYV12 = p010BEToUV_c; break; + case AV_PIX_FMT_SAND128: -+ c->chrToYV12 = sand128ToUV_c; ++ case AV_PIX_FMT_SAND64_10: ++ c->chrToYV12 = sand128ToUV_c; // NIF + break; } if (c->chrSrcHSubSample) { switch (srcFormat) { diff --git a/libswscale/utils.c b/libswscale/utils.c -index 576d8f0..d7206cc 100644 +index 576d8f0d5a..fd88a5e51e 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c -@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { +@@ -248,6 +248,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_AYUV64LE] = { 1, 1}, [AV_PIX_FMT_P010LE] = { 1, 0 }, [AV_PIX_FMT_P010BE] = { 1, 0 }, +#ifdef RPI + [AV_PIX_FMT_SAND128] = { 1, 0 }, ++ [AV_PIX_FMT_SAND64_10] = { 1, 0 }, +#endif }; int sws_isSupportedInput(enum AVPixelFormat pix_fmt) -diff --git a/pi-util/conf.sh b/pi-util/conf.sh -new file mode 100755 -index 0000000..8b596a2 ---- /dev/null -+++ b/pi-util/conf.sh -@@ -0,0 +1,33 @@ -+echo "Configure for Pi2/3" -+ -+RPI_BUILDROOT=`pwd`/build -+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot -+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$RPI_ROOTFS/opt/vc -+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" -+#RPI_DEFS="-D__VCCOREVER__=0x04000000" -+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+./configure --enable-cross-compile\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh +diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 -index 0000000..160e149 +index 0000000000..b1e99a6a89 --- /dev/null -+++ b/pi-util/conf1.sh -@@ -0,0 +1,34 @@ -+echo "Configure for Pi1" ++++ b/pi-util/BUILD.txt +@@ -0,0 +1,25 @@ ++Building Pi FFmpeg ++================== + -+RPI_BUILDROOT=`pwd`/build -+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot -+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$RPI_ROOTFS/opt/vc -+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads 
-I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" -+#RPI_DEFS="-D__VCCOREVER__=0x04000000" -+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" ++Configuration: ++============= + -+./configure --enable-cross-compile\ -+ --cpu=arm1176jzf-s\ -+ --arch=armv\ -+ --disable-neon\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++pi-util/conf_pi2.sh ++ ++contains suitable options to build the code for Pi2/3. It expects to find ++git clones of ++ ++https://github.com/raspberrypi/tools ++https://github.com/raspberrypi/firmware ++ ++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a ++lot of history you don't want. ++ ++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be ++rebuilt. Otherwise the prebuilt .c & .h files will be used. ++Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild ++ ++pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time ++H265 QPU acceleration is broken on Pi1 and so it is disabled. + + -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls +diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv +new file mode 100644 +index 0000000000..f05b7753f7 +--- /dev/null ++++ b/pi-util/conf_h265.2016.csv +@@ -0,0 +1,193 @@ ++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 
++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 
++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 
++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? ++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 ++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt ++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt ++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt ++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt ++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 
++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 ++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 ++2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 ++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 ++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt ++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt ++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 ++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 ++1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5 ++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 ++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 ++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 ++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 ++2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 +diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv +new file mode 100644 +index 0000000000..6082641271 +--- /dev/null ++++ b/pi-util/conf_h265.2016_HEVC_v1.csv +@@ -0,0 +1,147 @@ 
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 
++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? ++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 -index 0000000..fc14f2a +index 0000000000..fc14f2a3c2 --- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ @@ -19013,14 +29805,88 @@ index 0000000..fc14f2a +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh +new file mode 100755 +index 0000000000..ec25b81c31 +--- /dev/null ++++ b/pi-util/conf_pi1.sh +@@ -0,0 +1,31 @@ ++echo "Configure for Pi1" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --cpu=arm1176jzf-s\ ++ 
--arch=arm\ ++ --disable-neon\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh +new file mode 100755 +index 0000000000..f8e5e75375 +--- /dev/null ++++ b/pi-util/conf_pi2.sh +@@ -0,0 +1,30 @@ ++echo "Configure for Pi2/3" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --arch=armv6t2\ ++ --cpu=cortex-a7\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --disable-thumb\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py -new file mode 100644 -index 0000000..c896bc6 +new file mode 100755 +index 0000000000..70f7be22bb --- /dev/null +++ b/pi-util/ffconf.py -@@ -0,0 +1,154 @@ +@@ -0,0 +1,174 @@ +#!/usr/bin/env python + ++import string +import os +import subprocess +import re @@ -19029,12 +29895,20 @@ index 0000000..c896bc6 +import csv +from stat import * + -+conf_root = "/opt/conform/h265" +ffmpeg_exec = "./ffmpeg" + -+def testone(fileroot, name, es_file, md5_file): ++def testone(fileroot, srcname, es_file, md5_file): + tmp_root = "/tmp" + ++ names = srcname.split('/') ++ while len(names) > 1: ++ tmp_root = os.path.join(tmp_root, names[0]) ++ del names[0] ++ name = names[0] ++ ++ if not os.path.exists(tmp_root): ++ os.makedirs(tmp_root) ++ + dec_file = os.path.join(tmp_root, name + ".dec.md5") + try: + os.remove(dec_file) @@ -19079,10 +29953,10 @@ index 0000000..c896bc6 + +def scandir(root): + aconf = [] -+ ents = os.listdir(conf_root) ++ ents = os.listdir(root) + ents.sort(key=str.lower) + for name in ents: -+ test_path = os.path.join(conf_root, name) ++ test_path = os.path.join(root, name) + if S_ISDIR(os.stat(test_path).st_mode): + files = os.listdir(test_path) + es_file = "?" 
@@ -19093,7 +29967,7 @@ index 0000000..c896bc6 + pass + elif ext == ".bit" or ext == ".bin": + es_file = f -+ elif ext == ".md5": ++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): + if md5_file == "?": + md5_file = f + elif base[-3:] == "yuv": @@ -19105,13 +29979,15 @@ index 0000000..c896bc6 + if not tests: + return True + for t in tests: -+ if name[0:len(t)] == t: ++ if name[0:len(t)] == t or name.find("/" + t) != -1: + return True -+ return False ++ return False + -+def doconf(csva, tests): -+ failures = [] ++def doconf(csva, tests, test_root): ++ unx_failures = [] + unx_success = [] ++ failures = 0 ++ successes = 0 + for a in csva: + exp_test = int(a[0]) + if (exp_test and runtest(a[1], tests)): @@ -19119,17 +29995,25 @@ index 0000000..c896bc6 + print "==== ", name, + sys.stdout.flush() + -+ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) ++ rv = testone(os.path.join(test_root, name), name, a[2], a[3]) ++ if (rv == 0): ++ successes += 1 ++ else: ++ failures += 1 ++ + if (rv == 0): + if exp_test == 2: + print ": * OK *" + unx_success.append(name) + else: + print ": ok" -+ elif exp_test > 1 and rv == 1: ++ elif exp_test == 2 and rv == 1: + print ": fail" ++ elif exp_test == 3 and rv == 2: ++ # Call an expected "crash" an abort ++ print ": abort" + else: -+ failures.append(name) ++ unx_failures.append(name) + if rv == 1: + print ": * FAIL *" + elif (rv == 2) : @@ -19139,11 +30023,11 @@ index 0000000..c896bc6 + else : + print ": * BANG *" + -+ if failures or unx_success: -+ print "Unexpected Failures:", failures ++ if unx_failures or unx_success: ++ print "Unexpected Failures:", unx_failures + print "Unexpected Success: ", unx_success + else: -+ print "All tests normal" ++ print "All tests normal:", successes, "ok,", failures, "failed" + + +class ConfCSVDialect(csv.Dialect): @@ -19159,2536 +30043,184 @@ index 0000000..c896bc6 + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + argp.add_argument("tests", nargs='*') ++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") -+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") ++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") + args = argp.parse_args() + + if args.csvgen: -+ csv.writer(sys.stdout).writerows(scandir(conf_root)) ++ csv.writer(sys.stdout).writerows(scandir(args.test_root)) + exit(0) + + with open(args.csv, 'rt') as csvfile: + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + -+ doconf(csva, args.tests) ++ doconf(csva, args.tests, args.test_root) + -diff --git a/pi-util/qasm.py b/pi-util/qasm.py -new file mode 100644 -index 0000000..1eacc04 +diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py +new file mode 100755 +index 0000000000..27cc453963 --- /dev/null -+++ b/pi-util/qasm.py -@@ -0,0 +1,2502 @@ -+#!/usr/bin/env python ++++ b/pi-util/ffperf.py +@@ -0,0 +1,124 @@ ++#!/usr/bin/env python3 + -+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment -+# add r0, r0, 1 # implicit mul nop -+# nop # explicit add nop, implicit mul nop -+# bkpt # implicit add/mul nop -+# mov r0, 0x1234 # hex immediate -+# mov r0, 20 * 40 # expressions... 
-+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits -+# mov r0, a:label # put address of label in r0 -+# :label -+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address -+# :1 -+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address -+# :1 # multiple definitions of numeric labels (differentiated using f/b) -+# .set my_val, 3 # introduce alias for 3 -+# .set my_reg, r0 # and for r0 -+# mov my_reg, my_val # then use them -+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3 -+# .macro my_add, a, b, c # a, b, c act as if .set on entry -+# .set my_val, 10 -+# add a, b, c -+# mov r0, my_val # 10 -+# .endm # forget all .sets since .macro (including arg .sets) -+# mov r0, my_val # 3 -+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right) -+ -+import math -+import optparse -+import os -+import random -+import re -+import struct -+import sys +import time -+ -+############################################################################### -+# constants -+############################################################################### -+ -+# ops -+###### -+ -+# negatives are internal qasm ops -+ -+AOP_MOV = -3 # two operands -+AOP_BRA = -2 # two operands -+AOP_BRR = -1 # two operands -+AOP_NOP = 0x00 # no operands -+AOP_FADD = 0x01 -+AOP_FSUB = 0x02 -+AOP_FMIN = 0x03 -+AOP_FMAX = 0x04 -+AOP_FMINABS = 0x05 -+AOP_FMAXABS = 0x06 -+AOP_FTOI = 0x07 # two operands -+AOP_ITOF = 0x08 # two operands -+AOP_ADD = 0x0c -+AOP_SUB = 0x0d -+AOP_SHR = 0x0e -+AOP_ASR = 0x0f -+AOP_ROR = 0x10 -+AOP_SHL = 0x11 -+AOP_MIN = 0x12 -+AOP_MAX = 0x13 -+AOP_AND = 0x14 -+AOP_OR = 0x15 -+AOP_XOR = 0x16 -+AOP_NOT = 0x17 # two operands -+AOP_CLZ = 0x18 # two operands -+AOP_V8ADDS = 0x1e -+AOP_V8SUBS = 0x1f -+ -+MOP_MOV = -1 # two operands -+MOP_NOP = 0x0 # no operands -+MOP_FMUL = 0x1 -+MOP_MUL24 = 0x2 -+MOP_V8MULD = 0x3 -+MOP_V8MIN = 0x4 -+MOP_V8MAX = 0x5 -+MOP_V8ADDS = 0x6 -+MOP_V8SUBS = 0x7 -+ -+# ldi modes -+############ -+ -+LDI_32 = 0 -+LDI_EL_SIGNED = 1 -+LDI_EL_UNSIGNED = 3 -+LDI_SEMA = 4 -+ -+# conds -+######## -+ -+COND_NEVER = 0 -+COND_ALWAYS = 1 -+COND_IFZ = 2 -+COND_IFNZ = 3 -+COND_IFN = 4 -+COND_IFNN = 5 -+COND_IFC = 6 -+COND_IFNC = 7 -+ -+BCOND_ALLZ = 0 -+BCOND_ALLNZ = 1 -+BCOND_ANYZ = 2 -+BCOND_ANYNZ = 3 -+BCOND_ALLN = 4 -+BCOND_ALLNN = 5 -+BCOND_ANYN = 6 -+BCOND_ANYNN = 7 -+BCOND_ALLC = 8 -+BCOND_ALLNC = 9 -+BCOND_ANYC = 10 -+BCOND_ANYNC = 11 -+BCOND_ALWAYS = 15 -+ -+# packing/unpacking -+#################### -+ -+# regfile a pack modes -+PACK_A_NOP = 0 -+PACK_A_16A = 1 -+PACK_A_16B = 2 -+PACK_A_8888 = 3 -+PACK_A_8A = 4 -+PACK_A_8B = 5 -+PACK_A_8C = 6 -+PACK_A_8D = 7 -+PACK_A_32S = 8 -+PACK_A_16AS = 9 -+PACK_A_16BS = 10 -+PACK_A_8888S = 11 -+PACK_A_8AS = 12 -+PACK_A_8BS = 13 -+PACK_A_8CS = 14 -+PACK_A_8DS = 15 -+ -+# mul unit pack modes -+PACK_MUL_NOP = 0 -+PACK_MUL_8888 = 3 -+PACK_MUL_8A = 4 -+PACK_MUL_8B = 5 -+PACK_MUL_8C = 6 -+PACK_MUL_8D = 7 -+ -+# regfile a unpack modes -+UNPACK_A_NOP = 0 -+UNPACK_A_16A = 1 -+UNPACK_A_16B = 2 -+UNPACK_A_8R = 3 -+UNPACK_A_8A = 4 -+UNPACK_A_8B = 5 -+UNPACK_A_8C = 6 -+UNPACK_A_8D = 7 -+ -+# r4 unpack modes -+UNPACK_R4_NOP = 0 -+UNPACK_R4_16A = 1 -+UNPACK_R4_16B = 2 -+UNPACK_R4_8R = 3 -+UNPACK_R4_8A = 4 -+UNPACK_R4_8B = 5 -+UNPACK_R4_8C = 6 -+UNPACK_R4_8D = 7 -+ -+PACK_TYPE_INT = 0 -+PACK_TYPE_FLOAT = 1 -+PACK_TYPE_EITHER = -1 -+ -+PACK_MODE_A = 0 # regfile a -+PACK_MODE_M = 1 # mul unit -+PACK_MODE_EITHER = -1 -+ -+UNPACK_LOC_A = 0 # regfile a -+UNPACK_LOC_R4 = 1 # r4 -+UNPACK_LOC_AB = 2 # either 
regfile a or regfile b -+UNPACK_LOC_OTHER = 3 # somewhere else -+ -+# args -+####### -+ -+# loc_t, ie internal -+MUX_AC = 0 -+MUX_ANY = 1 -+MUX_A = 2 -+MUX_B = 3 -+RW_EITHER = 0 -+RW_READ = 1 -+RW_WRITE = 2 -+ -+RADDR_NOP = 39 -+ -+# negatives are for internal use -+RMUX_SEMA = -6 -+RMUX_LABEL = -5 -+RMUX_IMMV = -4 -+RMUX_IMM = -3 -+RMUX_AC = -2 -+RMUX_ANY = -1 -+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5 -+RMUX_A = 6 -+RMUX_B = 7 -+ -+WADDR_R0 = 32 # followed by R1, R2, R3 -+WADDR_NOP = 39 -+ -+WMUX_ANY = 0 -+WMUX_A = 1 -+WMUX_B = 2 -+ -+# signals -+########## -+ -+SIG_BKPT = 0 -+SIG_NORMAL = 1 -+SIG_THRSW = 2 -+SIG_THREND = 3 -+SIG_SBWAIT = 4 -+SIG_SBDONE = 5 -+SIG_INT = 6 # on a0 -+SIG_LTHRSW = 6 # on b0 -+SIG_LOADCV = 7 -+SIG_LOADC = 8 -+SIG_LDCEND = 9 -+SIG_LDTMU0 = 10 -+SIG_LDTMU1 = 11 -+SIG_ROTATE = 12 # on a0 -+SIG_LOADAM = 12 # on b0 -+SIG_SMALLIMMED = 13 -+SIG_IMMED = 14 -+SIG_BRANCH = 15 -+ -+# multi-line assembler constructs -+################################## -+ -+CONSTRUCT_MACRO = 0x1 -+CONSTRUCT_IF = 0x2 -+CONSTRUCT_ELSE = 0x4 -+CONSTRUCT_REP = 0x8 -+ -+############################################################################### -+# helpers -+############################################################################### -+ -+def asm_error(message, location = None): -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm ERROR: %s\n' % message) -+ else: -+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message)) -+ sys.exit(-1) -+ -+def asm_warning(message, location = None): -+ if disable_warnings or (nwarn_level != 0): -+ return -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm WARNING: %s\n' % message) -+ else: -+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message)) -+ if warnings_are_errors: -+ asm_error('warnings are errors!', location) -+ -+# smart_split('') = [] -+# smart_split('a') = ['a'] -+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6'] -+def smart_split(s, delim = ',', count = 0): -+ if len(s) == 0: -+ return [] -+ parts = [] -+ depth = 0 -+ i = 0 -+ for j in xrange(len(s)): -+ if s[j] in '([{': -+ depth += 1 -+ elif s[j] in ')]}': -+ depth -= 1 -+ elif (s[j] == delim) and (depth == 0): -+ parts.append(s[i:j]) -+ i = j + 1 -+ if len(parts) == count: -+ break -+ if depth != 0: -+ asm_error('bracket nesting fail') -+ parts.append(s[i:]) -+ return parts -+ -+def is_int(x): -+ return isinstance(x, int) or isinstance(x, long) -+ -+############################################################################### -+# "parsing" stuff -+############################################################################### -+ -+re_macro = re.compile('\\.macro\\s+(?P\\w+)(?P(\\s*,\\s*\\w+)*)$') -+re_if = re.compile('\\.if((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_elif = re.compile('\\.elif((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_rep = re.compile('\\.rep\\s+(?P\\w+)\\s*,(?P.+)$') -+re_include = re.compile('\\.include\\s(?P.+)$') -+re_set = re.compile('\\.set\\s+(?P\\w+)\\s*,(?P.+)$') -+re_unset = re.compile('\\.unset\\s+(?P\\w+)$') -+re_eval = re.compile('\\.eval\\s(?P.+)$') -+re_print_info_warn_error = re.compile('\\.(?Pprint|info|warn|error)\\s(?P.+)$') -+re_assert = re.compile('\\.assert\\s(?P.+)$') -+re_data = re.compile('\\.d(?P[124])\\s(?P.+)$') -+re_macro_inst = re.compile('(?P\\w+)(?P\\s.+|)$') -+re_label = re.compile(':(?P:?[a-zA-Z_]\\w*|\\d+)$') -+re_op = re.compile('(?P\\w+)(\\.(?P\\w+))??(\\.(?Psetf))?(?P\\s.+|)$') 
-+re_label_ref_left = re.compile('\\b([ar]):') -+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$') -+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals... -+ -+# ops -+###### -+ -+aops = { -+ 'mov': (AOP_MOV, 2), -+ 'bra': (AOP_BRA, 2), -+ 'brr': (AOP_BRR, 2), -+ 'nop': (AOP_NOP, 0), -+ 'fadd': (AOP_FADD, 3), -+ 'fsub': (AOP_FSUB, 3), -+ 'fmin': (AOP_FMIN, 3), -+ 'fmax': (AOP_FMAX, 3), -+ 'fminabs': (AOP_FMINABS, 3), -+ 'fmaxabs': (AOP_FMAXABS, 3), -+ 'ftoi': (AOP_FTOI, 2), -+ 'itof': (AOP_ITOF, 2), -+ 'add': (AOP_ADD, 3), -+ 'sub': (AOP_SUB, 3), -+ 'shr': (AOP_SHR, 3), -+ 'asr': (AOP_ASR, 3), -+ 'ror': (AOP_ROR, 3), -+ 'shl': (AOP_SHL, 3), -+ 'min': (AOP_MIN, 3), -+ 'max': (AOP_MAX, 3), -+ 'and': (AOP_AND, 3), -+ 'or': (AOP_OR, 3), -+ 'xor': (AOP_XOR, 3), -+ 'not': (AOP_NOT, 2), -+ 'clz': (AOP_CLZ, 2), -+ 'v8adds': (AOP_V8ADDS, 3), -+ 'v8subs': (AOP_V8SUBS, 3)} -+ -+def get_aop(aop): -+ if aop not in aops: -+ asm_error('invalid aop') -+ return aops[aop] -+ -+mops = { -+ 'mov': (MOP_MOV, 2), -+ 'nop': (MOP_NOP, 0), -+ 'fmul': (MOP_FMUL, 3), -+ 'mul24': (MOP_MUL24, 3), -+ 'v8muld': (MOP_V8MULD, 3), -+ 'v8min': (MOP_V8MIN, 3), -+ 'v8max': (MOP_V8MAX, 3), -+ 'v8adds': (MOP_V8ADDS, 3), -+ 'v8subs': (MOP_V8SUBS, 3)} -+ -+def get_mop(mop): -+ if mop not in mops: -+ asm_error('invalid mop') -+ return mops[mop] -+ -+# conds -+######## -+ -+conds = { -+ 'ifz': COND_IFZ, -+ 'ifnz': COND_IFNZ, -+ 'ifn': COND_IFN, -+ 'ifnn': COND_IFNN, -+ 'ifc': COND_IFC, -+ 'ifnc': COND_IFNC} -+ -+def get_cond(cond): -+ if not cond: -+ return COND_ALWAYS -+ if cond not in conds: -+ asm_error('invalid cond') -+ return conds[cond] -+ -+bconds = { -+ 'allz': BCOND_ALLZ, -+ 'allnz': BCOND_ALLNZ, -+ 'anyz': BCOND_ANYZ, -+ 'anynz': BCOND_ANYNZ, -+ 'alln': BCOND_ALLN, -+ 'allnn': BCOND_ALLNN, -+ 'anyn': BCOND_ANYN, -+ 'anynn': BCOND_ANYNN, -+ 'allc': BCOND_ALLC, -+ 'allnc': BCOND_ALLNC, -+ 'anyc': BCOND_ANYC, -+ 'anync': BCOND_ANYNC} -+ -+def get_bcond(bcond): -+ if not bcond: -+ return BCOND_ALWAYS -+ if bcond not in bconds: -+ asm_error('invalid bcond') -+ return bconds[bcond] -+ -+def get_setf(setf): -+ if not setf: -+ return False -+ return True -+ -+# packing/unpacking -+#################### -+ -+packs = { -+ '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A), -+ '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A), -+ '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A), -+ 's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)} -+ -+def get_pack(pack): -+ if not pack: -+ 
return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER) -+ if pack not in packs: -+ asm_error('invalid pack') -+ return packs[pack] -+ -+a_unpacks = { -+ '16a': (UNPACK_A_16A, PACK_TYPE_INT), -+ '16b': (UNPACK_A_16B, PACK_TYPE_INT), -+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT), -+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT), -+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER), -+ '8a': (UNPACK_A_8A, PACK_TYPE_INT), -+ '8b': (UNPACK_A_8B, PACK_TYPE_INT), -+ '8c': (UNPACK_A_8C, PACK_TYPE_INT), -+ '8d': (UNPACK_A_8D, PACK_TYPE_INT), -+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT), -+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT), -+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT), -+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)} -+ -+def get_a_unpack(unpack): -+ if not unpack: -+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A) -+ if unpack not in a_unpacks: -+ asm_error('invalid ra unpack') -+ return a_unpacks[unpack] + (UNPACK_LOC_A,) -+ -+r4_unpacks = { -+ '16af': UNPACK_R4_16A, -+ '16bf': UNPACK_R4_16B, -+ '8dr': UNPACK_R4_8R, -+ '8ac': UNPACK_R4_8A, -+ '8bc': UNPACK_R4_8B, -+ '8cc': UNPACK_R4_8C, -+ '8dc': UNPACK_R4_8D} -+ -+def get_r4_unpack(unpack): -+ if not unpack: -+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ if unpack not in r4_unpacks: -+ asm_error('invalid r4 unpack') -+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ -+# args -+####### -+ -+class loc_t: -+ def __init__(self, mux, i, rot, r5_rot, pack, rw): -+ self.mux = mux -+ self.i = i -+ self.rot = rot % 16 -+ self.r5_rot = r5_rot % 16 -+ self.pack = pack -+ self.rw = rw -+ -+ def copy(self): -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __add__(self, i): -+ if not is_int(i): -+ raise Exception('can only add integer to loc') -+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __sub__(self, i): -+ if not is_int(i): -+ raise Exception('can only subtract integer from loc') -+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __cmp__(self, other): -+ if is_int(other): -+ return cmp(self.i, other) -+ if not isinstance(other, loc_t): -+ raise Exception('can only compare loc to integer or other loc') -+ if self.mux != other.mux: -+ return cmp(self.mux, other.mux) -+ if self.i != other.i: -+ return cmp(self.i, other.i) -+ if self.rot != other.rot: -+ return cmp(self.rot, other.rot) -+ if self.r5_rot != other.r5_rot: -+ return cmp(self.r5_rot, other.r5_rot) -+ return cmp(self.pack, other.pack) -+ -+ def is_r5(self): -+ return (self.mux == MUX_AC) and (self.i == 5) -+ -+ def shift(self, rot, left): -+ if isinstance(rot, loc_t) and rot.is_r5(): -+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack: -+ raise Exception('can\'t rotate by rotated/unpacked r5') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw) -+ if not is_int(rot): -+ raise Exception('can only rotate by integer or r5') -+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw) -+ -+ def __lshift__(self, rot): -+ return self.shift(rot, True) -+ -+ def __rshift__(self, rot): -+ return self.shift(rot, False) -+ -+ def __getattr__(self, name): -+ # discard the first character if it is an underscore. 
this is a total hack -+ # to allow packs starting with a digit to work -+ if name[0] == '_': -+ name = name[1:] -+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks): -+ if self.pack: -+ raise Exception('can\'t specify two packs') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw) -+ raise AttributeError() -+ -+ def __str__(self): -+ if self.mux == MUX_AC: -+ return 'r%d' % self.i -+ if self.mux == MUX_ANY: -+ return 'rany%d' % self.i -+ if self.mux == MUX_A: -+ return 'ra%d' % self.i -+ if self.mux == MUX_B: -+ return 'rb%d' % self.i -+ assert 0 -+ -+class sema_t: -+ def __init__(self, acq, i): -+ if not is_int(i): -+ raise Exception('semaphore index must be integer') -+ self.acq = acq -+ self.i = i -+ -+class label_t: -+ def __init__(self, rel, name, offset): -+ self.rel = rel -+ self.name = name -+ self.offset = offset -+ -+ def __add__(self, offset): -+ return label_t(self.rel, self.name, self.offset + offset) -+ -+ def __sub__(self, offset): -+ return label_t(self.rel, self.name, self.offset - offset) -+ -+class label_maker_t: -+ def __init__(self, rel): -+ self.rel = rel -+ -+ def __getattr__(self, name): -+ # we discard the first character. this is a total hack to allow numeric labels to work -+ if not re_label_ref_right.match(name[1:]): -+ raise Exception('invalid label reference') -+ return label_t(self.rel, name[1:], 0) -+ -+def bits(x, n): -+ if (x >> n) != 0: -+ raise Exception('%d doesn\'t fit in %d bits' % (x, n)) -+ return x -+ -+def bitsw(x, n): -+ if x == (1 << n): -+ x = 0 -+ return bits(x, n) -+ -+def bitsws(x, n): -+ if x == (1 << (n - 1)): -+ x = 0 -+ if -(1 << (n - 1)) <= x < 0: -+ x += 1 << n -+ return bits(x, n) -+ -+def vpm_setup(n, stride, addr, v2 = False): -+ horiz, laned, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if horiz: -+ if x != 0: -+ raise Exception('horizontal accesses must have x of 0') -+ else: -+ if (y & 0xf) != 0: -+ raise Exception('vertical accesses must be 16 row aligned') -+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) -+ if v2: -+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) | -+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size)) -+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) | -+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size)) -+ -+def vdw_setup_0(n, m, addr): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) | -+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size)) -+ -+def vdr_setup_0(n, m, addr, vpm_stride, stride): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if (stride < 8) or (stride & (stride - 1)): -+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride') -+ log2_stride = 3 -+ while (1 << log2_stride) != stride: -+ log2_stride += 1 -+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) | -+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) | -+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) -+ -+class allocator_t: -+ def __init__(self, *available): -+ self.available = list(available) -+ self.allocated = {} -+ self.reserved = [] -+ -+ def copy(self): -+ a = allocator_t() -+ a.available = self.available[:] -+ 
a.allocated = self.allocated.copy() -+ a.reserved = self.reserved[:] -+ return a -+ -+ def forget(self): -+ self.__init__(self.available + self.allocated.values() + self.reserved) -+ -+ def reserve(self, *rs): -+ for r in rs: -+ self.available.remove(r) -+ self.reserved.append(r) -+ -+ def retire(self, name): -+ r = self.allocated.pop(name) -+ del r.__invert__ -+ del r.retire -+ self.available.append(r) -+ return r -+ -+ def __getattr__(self, name): -+ if name not in self.allocated: -+ r = self.available.pop() -+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax -+ r.__invert__ = r.retire -+ self.allocated[name] = r -+ return self.allocated[name] -+ -+def pragma_allow_xor_0(x): -+ global allow_xor_0 -+ -+ if not isinstance(x, bool): -+ raise Exception('allow_xor_0 must be bool') -+ x, allow_xor_0 = allow_xor_0, x -+ return x -+ -+def pragma_dont_warn_when_mul_rot_inp_r5(x): -+ global dont_warn_when_mul_rot_inp_r5 -+ -+ if not isinstance(x, bool): -+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool') -+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x -+ return x -+ -+arg_defs = { -+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions) -+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER), -+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER), -+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ), -+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ), -+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE), -+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE), -+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE), -+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ), -+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ), -+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE), -+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE), -+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER), -+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER), -+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER), -+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER), -+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE), -+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE), -+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE), -+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE), -+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER), -+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ), -+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ), -+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE), -+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE), -+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ), -+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ), -+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE), -+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE), -+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER), -+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE), -+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE), -+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE), -+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE), -+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE), -+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE), -+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE), -+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE), -+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE), -+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE), -+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE), -+ -+ # semaphore acq/rel -+ 'sacq': 
lambda i: sema_t(True, i), -+ 'srel': lambda i: sema_t(False, i), -+ -+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label) -+ 'r_label_maker': label_maker_t(True), -+ 'a_label_maker': label_maker_t(False), -+ -+ # handy functions -+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0], -+ 'sqrt': math.sqrt, -+ 'sin': math.sin, -+ 'cos': math.cos, -+ 'atan2': math.atan2, -+ 'pi': math.pi, -+ 'rseed': random.seed, -+ 'rand': lambda: int(random.getrandbits(32)), -+ 'bits': bits, -+ 'bitsw': bitsw, -+ 'bitsws': bitsws, -+ -+ # handy vpm/vdw/vdr stuff -+ 'h32': lambda y: (1, 0, 0, y, 0, 0), -+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p), -+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p), -+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p), -+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p), -+ 'v32': lambda y, x: (0, 0, 0, y, x, 0), -+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p), -+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p), -+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p), -+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p), -+ 'dma_h32': lambda y, x: (1, 0, y, x, 0), -+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p), -+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p), -+ 'dma_v32': lambda y, x: (0, 0, y, x, 0), -+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p), -+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p), -+ 'vpm_setup': vpm_setup, -+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True), -+ 'vdw_setup_0': vdw_setup_0, -+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13), -+ 'vdr_setup_0': vdr_setup_0, -+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride -+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13), -+ -+ # annotations -+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)), -+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), -+ 'preserve_cond': ('preserve_cond', 1), -+ -+ # somewhat experimental register allocator -+ 'allocator_t': allocator_t, -+ -+ # pragmas -+ 'pragma_allow_xor_0': pragma_allow_xor_0, -+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5} -+ -+# accumulators and regs (regular names -- r0, ra0, etc) -+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6)) -+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+ -+def arg_eval(arg, sets): -+ s = (arg.strip().split('.', 1) + [None])[:2] -+ if s[0] == '-': -+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE) -+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings... -+ arg = re_pack.sub('._\\1', arg) -+ try: -+ # todo: i would like to be able to pass both arg_defs and sets in here -+ # (with sets hiding arg_defs in the case of conflicts), but the obvious -+ # dict(arg_defs, **sets) won't permit things such as: -+ # .set f, lambda x: y -+ # .set y, 4 -+ # (the y in the lambda will be looked up in the temporary dict we created -+ # when evaluating the f .set, which doesn't contain y) -+ # -+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the -+ # hiding behaviour, on an unset, we restore any hidden arg_defs value. 
-+ # also, before dumping sets at the end, we strip out the arg_defs stuff -+ # (this isn't entirely correct as we want to dump sets that are hiding -+ # arg_defs) -+ return eval(arg, sets) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while evaluating argument') -+ -+# doesn't check/fixup pack -+def check_and_fixup_loc(loc, read): -+ if (not read) and (loc.rw == RW_READ): -+ asm_error('writing to read-only hardware register') -+ if read and (loc.rw == RW_WRITE): -+ asm_error('reading from write-only hardware register') -+ if not read: -+ # conceptually, we are writing to a location rotated right by -+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by -+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location -+ loc.rot = -loc.rot % 16 -+ loc.r5_rot = -loc.r5_rot % 16 -+ if (loc.rot != 0) and (loc.r5_rot != 0): -+ asm_error('can\'t rotate by both r5 and immediate') -+ if (loc.r5_rot != 0) and (loc.r5_rot != 1): -+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) -+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later -+ if not read: -+ asm_error('target doesn\'t support write rotation') -+ if loc.mux == MUX_ANY: -+ loc.mux = MUX_A # can't do rotated read from regfile b -+ if loc.mux != MUX_A: -+ asm_error('rotation on read only allowed from regfile a') -+ if loc.i >= 32: -+ asm_warning('rotation only works from physical regfile') -+ if loc.mux == MUX_AC: -+ if (loc.i < 0) or (loc.i >= 6): -+ asm_error('reg out of range') -+ if not read: -+ if loc.i == 4: -+ asm_error('not allowed to write to r4') -+ if loc.i == 5: -+ -+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep') -+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): -+ if (loc.i < 0) or (loc.i >= 64): -+ asm_error('reg out of range') -+ else: -+ assert 0 -+ -+def get_dst(dst, sets): -+ if not dst: -+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 -+ dst = arg_eval(dst, sets) -+ if not isinstance(dst, loc_t): -+ asm_error('invalid dst') -+ dst = dst.copy() -+ check_and_fixup_loc(dst, False) -+ pack = get_pack(dst.pack) -+ if dst.mux == MUX_AC: -+ if pack[2] == PACK_MODE_A: -+ asm_warning('ra packing only works when writing to physical regfile') -+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation -+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32): -+ asm_warning('ra packing only works when writing to physical regfile') -+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_ANY: -+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_B: -+ if pack[2] == PACK_MODE_A: -+ asm_error('this packing operation can only be used for regfile a') -+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot -+ assert 0 -+ -+def get_src(src, sets): -+ if not src: -+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None -+ src = arg_eval(src, sets) -+ if isinstance(src, sema_t): -+ if not have_sema: -+ asm_error('target does not support semaphores') -+ if (src.i < 0) or (src.i >= 16): -+ asm_error('semaphore number must be in [0, 16)') -+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, label_t): -+ return (src.name, src.rel, src.offset), 
RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, list): -+ if len(src) != 16: -+ asm_error('vector immediate must have length 16') -+ src = src[:] -+ for i in xrange(16): -+ if not is_int(src[i]): -+ asm_error('all elements of vector immediate must be integers') -+ src[i] &= (1 << 32) - 1 -+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if is_int(src): -+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if not isinstance(src, loc_t): -+ asm_error('invalid src') -+ src = src.copy() -+ check_and_fixup_loc(src, True) -+ if mulw_rotate: -+ srot, sr5rot = 0, 0 -+ drot, dr5rot = src.rot, src.r5_rot -+ else: -+ srot, sr5rot = src.rot, src.r5_rot -+ drot, dr5rot = 0, 0 -+ if src.mux == MUX_AC: -+ if src.i == 4: -+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b -+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot -+ if src.mux == MUX_ANY: -+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot -+ if src.mux == MUX_B: -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ assert 0 -+ -+# signals -+########## -+ -+sigs = { -+ 'bkpt': SIG_BKPT, -+ 'thrsw': SIG_THRSW, -+ 'thrend': SIG_THREND, -+ 'sbwait': SIG_SBWAIT, -+ 'sbdone': SIG_SBDONE, -+ 'int': SIG_INT, -+ 'loadcv': SIG_LOADCV, -+ 'loadc': SIG_LOADC, -+ 'ldcend': SIG_LDCEND, -+ 'ldtmu0': SIG_LDTMU0, -+ 'ldtmu1': SIG_LDTMU1} -+ -+def get_sig(sig): -+ if sig not in sigs: -+ return SIG_NORMAL -+ return sigs[sig] -+ -+# annotations -+############## -+ -+def get_annots(annot, sets): -+ annots = arg_eval(annot, sets) -+ if isinstance(annots, list): -+ annots = annots[:] -+ else: -+ annots = [annots] -+ for i, annot in enumerate(annots): -+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or -+ (not is_int(annot[1]))): -+ asm_error('annotation must be (string, integer) pair, or a list of such pairs') -+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) -+ return annots -+ -+############################################################################### -+# core -+############################################################################### -+ -+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): -+ needfloat = PACK_TYPE_EITHER -+ havefloata = False -+ havefloatr4 = False -+ unpacka = None -+ unpackr4 = None -+ forcebs = [False, False, False, False] -+ forcerafloat = False -+ -+ pm = PACK_MODE_EITHER -+ for i in (0, 1, 2, 3): -+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB): -+ assert rpacks[i][0] == 0 -+ else: -+ if rpacks[i][2] == UNPACK_LOC_A: -+ if unpacka is None: -+ unpacka = rpacks[i][0] -+ elif unpacka != rpacks[i][0]: -+ asm_error('conflicting unpack operations on regfile a') -+ havefloata = havefloata or rfloats[i] -+ elif rpacks[i][2] == UNPACK_LOC_R4: -+ if unpackr4 is None: -+ unpackr4 = rpacks[i][0] -+ elif unpackr4 != rpacks[i][0]: -+ asm_error('conflicting unpack operations on r4') -+ havefloatr4 = havefloatr4 or rfloats[i] -+ else: -+ assert 0 -+ -+ if rpacks[i][1] != PACK_TYPE_EITHER: -+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]): 
-+ asm_error('conflicting unpack float requirements') -+ needfloat = rpacks[i][1] -+ for i in (0, 1, 2, 3): -+ if rpacks[i][2] == UNPACK_LOC_AB: -+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP): -+ forcebs[i] = True # non-nop unpack from regfile a. must use b -+ -+ if unpacka: -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: -+ havefloata = True -+ forcerafloat = True -+ havefloat = havefloata -+ else: -+ havefloat = havefloatr4 -+ -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat): -+ asm_error('float unpack operation used in integer alu operations') -+ if (needfloat == PACK_TYPE_INT) and havefloat: -+ asm_error('integer unpack operation used in float alu operation') -+ -+ unpack = 0 -+ if unpacka and unpackr4: -+ asm_error('cannot specify pack operation for both regfile a and r4') -+ if unpacka: -+ pm = PACK_MODE_A -+ unpack = unpacka -+ elif unpackr4: -+ pm = PACK_MODE_M -+ unpack = unpackr4 -+ -+ pack = 0 -+ if wpacks[0][2] == PACK_MODE_M: -+ asm_error('mul-unit pack operation used on add result') -+ for i in (0, 1): -+ if wpacks[i][2] == PACK_MODE_A: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_A -+ pack = wpacks[i][0] -+ elif wpacks[i][2] == PACK_MODE_M: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_M -+ pack = wpacks[i][0] -+ -+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]): -+ asm_error('float pack operation used with integer alu result') -+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]: -+ asm_error('integer pack operation used with float alu result') -+ -+ if pm == PACK_MODE_EITHER: -+ pm = PACK_MODE_A -+ return pm, pack, unpack, forcebs, forcerafloat -+ -+# immediates that can be encoded with SIG_SMALLIMMED -+bimms = {} -+bimms.update((i, i) for i in xrange(16)) -+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) -+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) -+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) -+ -+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): -+ if rmux == RMUX_SEMA: -+ asm_error('semaphore op can only be used with mov') -+ if rmux == RMUX_LABEL: -+ asm_error('label not allowed here') -+ if rmux == RMUX_IMMV: -+ asm_error('vector immediate can only be used with mov') -+ if rmux == RMUX_IMM: -+ if raddr not in bimms: -+ asm_error('can\'t encode immediate 0x%08x' % raddr) -+ raddr = bimms[raddr] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and immediates don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if rmux == RMUX_AC: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr -+ if rmux == RMUX_ANY: -+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if (not immb) and (raddr_b == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if raddr_a is None: -+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5)) -+ raddr_a = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if raddr_b is None: -+ assert not immb -+ raddr_b = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ asm_error('no free read slots') -+ if rmux == RMUX_A: -+ if (not mulw_rotate) and (raddr_a is not None) and ( -+ ((raddr[1] != 0) | ((raddr[2] != 
0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): -+ asm_error('conflicting rotations from regfile a') -+ if raddr_a is None: -+ raddr_a = raddr[0] -+ elif raddr_a != raddr[0]: -+ asm_error('can only read from one location in each regfile') -+ arot_r5 = raddr[2] -+ if raddr[1] == 0: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ raddr = 48 + raddr[1] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and rotation don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if rmux == RMUX_B: -+ if immb: -+ asm_error('regfile b and rotation/immediates don\'t mix') -+ if raddr_b is None: -+ raddr_b = raddr -+ elif raddr_b != raddr: -+ asm_error('can only read from one location in each regfile') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ assert 0 -+ -+# ok if: -+# - accumulator (r0-r3) -+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy, -+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it -+# was written by r5quad. so, by default, r5 isn't considered uniform. todo: -+# what about vr_wait/vw_wait/mutex? -+def read_rot_ok(rmux, raddr_a, raddr_b): -+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or -+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy -+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy -+ -+def asm_flush_prog_data(): -+ global prog_data -+ -+ while len(prog_data) & 7: -+ prog_data.append(0) -+ for i in xrange(0, len(prog_data), 8): -+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0), -+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {})) -+ prog_data = [] -+ -+def asm_line(sets, location, line): -+ global current_location, construct, nwarn_level -+ -+ prev_location = current_location -+ current_location = location -+ -+ try: -+ if construct != None: -+ if re_macro.match(line): -+ construct_stack.append(CONSTRUCT_MACRO) -+ elif re_if.match(line): -+ construct_stack.append(CONSTRUCT_IF) -+ elif re_rep.match(line): -+ construct_stack.append(CONSTRUCT_REP) -+ else: -+ else_m = line == '.else' -+ elif_m = re_elif.match(line) -+ if elif_m: -+ end_construct = CONSTRUCT_IF -+ else: -+ end_construct = { -+ '.endm': CONSTRUCT_MACRO, -+ '.else': CONSTRUCT_IF, -+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE, -+ '.endr': CONSTRUCT_REP}.get(line) -+ if end_construct is not None: -+ end_construct &= construct_stack.pop() -+ if end_construct == 0: -+ if elif_m: -+ asm_error('unexpected .elif') -+ asm_error('unexpected %s' % line) -+ if len(construct_stack) == 0: -+ lines = construct -+ construct = None -+ if end_construct == CONSTRUCT_MACRO: -+ return -+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE): -+ condition_if, condition_else = lines[0] -+ lines = lines[1:] -+ if condition_if: -+ for location, line in lines: -+ asm_line(sets, location, line) -+ if else_m: -+ construct = [(condition_else, False)] -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ if elif_m.group('set'): -+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets)) -+ else: -+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets) -+ condition_else = condition_else and (not condition_if) -+ construct = [(condition_if, condition_else)] 
-+ construct_stack.append(CONSTRUCT_IF) -+ return -+ if end_construct == CONSTRUCT_REP: -+ name, count = lines[0] -+ lines = lines[1:] -+ for i in xrange(count): -+ sets[name] = i -+ for location, line in lines: -+ asm_line(sets, location, line) -+ return -+ assert 0 -+ if else_m: -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ construct_stack.append(CONSTRUCT_IF) -+ construct.append((current_location, line)) -+ return -+ -+ if line in ('.endm', '.else', '.endif', '.endr'): -+ asm_error('unexpected %s' % line) -+ if re_elif.match(line): -+ asm_error('unexpected .elif') -+ -+ m = re_macro.match(line) -+ if m: -+ construct = [] -+ construct_stack.append(CONSTRUCT_MACRO) -+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct) -+ return -+ -+ m = re_if.match(line) -+ if m: -+ if m.group('set'): -+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets) -+ else: -+ # not not forces condition to a bool (this matters if condition is -+ # something mutable like a list) -+ condition = not not arg_eval(m.group('condition'), sets) -+ construct = [(condition, not condition)] -+ construct_stack.append(CONSTRUCT_IF) -+ return -+ -+ m = re_rep.match(line) -+ if m: -+ count = arg_eval(m.group('count'), sets) -+ if not is_int(count): -+ asm_error('.rep count must be integer') -+ construct = [(m.group('name'), count)] -+ construct_stack.append(CONSTRUCT_REP) -+ return -+ -+ m = re_include.match(line) -+ if m: -+ filename = arg_eval(m.group('filename'), sets) -+ if not isinstance(filename, str): -+ asm_error('expected string') -+ asm_file(sets, '%s: %s' % (current_location, filename), filename) -+ return -+ -+ m = re_set.match(line) -+ if m: -+ sets[m.group('name')] = arg_eval(m.group('val'), sets) -+ return -+ -+ m = re_unset.match(line) -+ if m: -+ name = m.group('name') -+ if name not in sets: -+ asm_error('%s not set' % name) -+ if name in arg_defs: # todo: see arg_eval -+ sets[name] = arg_defs[name] -+ else: -+ del sets[name] -+ return -+ -+ m = re_eval.match(line) -+ if m: -+ arg_eval(m.group('expr'), sets) -+ return -+ -+ m = re_print_info_warn_error.match(line) -+ if m: -+ def print_fn(message): -+ print message -+ def info_fn(message): -+ sys.stderr.write('%s\n' % message) -+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[ -+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets)) -+ return -+ -+ m = re_assert.match(line) -+ if m: -+ if not arg_eval(m.group('condition'), sets): -+ asm_error('assertion failure: \'%s\'' % m.group('condition')) -+ return -+ -+ m = re_data.match(line) -+ if m: -+ size = int(m.group('size')) -+ for datum in smart_split(m.group('data')): -+ datum = arg_eval(datum, sets) -+ if not is_int(datum): -+ asm_error('datum must be integer') -+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size)) -+ return -+ -+ m = re_macro_inst.match(line) -+ if m: -+ name = m.group('name') -+ if name in macros: -+ params, lines = macros[name] -+ args = smart_split(m.group('args')) -+ if len(args) > len(params): -+ asm_error('too many arguments to macro') -+ sets = sets.copy() -+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args))) -+ for param in params[len(args):]: -+ if param in sets: -+ if param in arg_defs: # todo: see arg_eval -+ sets[param] = arg_defs[param] -+ else: -+ del sets[param] -+ for location, line in lines: -+ asm_line(sets, '%s: %s' % (current_location, location), line) -+ return -+ -+ if line == '.pushnwarn': -+ nwarn_level += 1 -+ 
return -+ if line == '.popnwarn': -+ if nwarn_level == 0: -+ asm_error('.popnwarn without .pushnwarn') -+ nwarn_level -= 1 -+ return -+ -+ # everything below assumes prog is up to date -+ asm_flush_prog_data() -+ -+ m = re_label.match(line) -+ if m: -+ name = m.group('name') -+ if name[0].isdigit(): -+ labels.setdefault(name, []).append(len(prog)) -+ else: -+ if name[0] == ':': -+ undecorated_name = name[1:] -+ else: -+ undecorated_name = name -+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels): -+ asm_error('named label defined twice') -+ labels[name] = len(prog) -+ return -+ -+ annots = line.split('@') -+ ops = [op.strip() for op in annots[0].split(';')] -+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), []) -+ sig = get_sig(ops[-1]) -+ if sig != SIG_NORMAL: -+ ops = ops[:-1] -+ if len(ops) > 2: -+ asm_error('too many ops') -+ elif (len(ops) == 1) and (ops[0] == ''): -+ ops = [] -+ ops = (ops + ['nop', 'nop'])[:2] -+ m = re_op.match(ops[0]) -+ if not m: -+ asm_error('invalid syntax') -+ aop, aargs_n = get_aop(m.group('op')) -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ acond = get_bcond(m.group('cond')) -+ else: -+ acond = get_cond(m.group('cond')) -+ asf = get_setf(m.group('sf')) -+ aargs = smart_split(m.group('args')) -+ if len(aargs) != aargs_n: -+ asm_error('wrong operand count') -+ ard, ara, arb = (aargs + [None, None, None])[:3] -+ m = re_op.match(ops[1]) -+ if not m: -+ asm_error('invalid syntax') -+ mop, margs_n = get_mop(m.group('op')) -+ mcond = get_cond(m.group('cond')) -+ msf = get_setf(m.group('sf')) -+ margs = smart_split(m.group('args')) -+ if len(margs) != margs_n: -+ asm_error('wrong operand count') -+ mrd, mra, mrb = (margs + [None, None, None])[:3] -+ # eval srcs first so allocator can retire and reuse registers for dst -+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets) -+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets) -+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets) -+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets) -+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets) -+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets) -+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or -+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))): -+ asm_error('cannot have 2 arguments with different rotations') -+ if aarmux is not None: -+ awrot = (awrot + aadrot) % 16 -+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16 -+ if (awrot != 0) or awrot_r5: -+ asm_error('rotate not allowed on add write') -+ if marmux is not None: -+ mwrot = (mwrot + madrot) % 16 -+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16 -+ -+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI) -+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF) -+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes( -+ [aarpack, abrpack, marpack, mbrpack], -+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL], -+ aop == AOP_FTOI, -+ [awpack, mwpack], -+ [afloatw, mop == MOP_FMUL]) -+ if forcebs[0]: -+ aarmux = RMUX_B -+ if forcebs[1]: -+ abrmux = RMUX_B -+ if forcebs[2]: -+ marmux = RMUX_B -+ if forcebs[3]: -+ mbrmux = RMUX_B -+ -+ # extend nops to 3 operands -+ if aop == AOP_NOP: -+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC -+ if mop == MOP_NOP: -+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 
0, RMUX_AC, 0, RMUX_AC -+ -+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand) -+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ): -+ if forcerafloat: -+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand -+ # instead of duplicating the 2nd operand, take the ra operand from -+ # the mul op thus forcing the ra value to be considered a float for -+ # the purposes of unpacking -+ if marmux == RMUX_A: -+ abraddr, abrmux = maraddr, marmux -+ else: -+ assert mbrmux == RMUX_A -+ abraddr, abrmux = mbraddr, mbrmux -+ else: -+ abraddr, abrmux = aaraddr, aarmux -+ else: -+ assert not forcerafloat # can only forcerafloat if we have an unused operand -+ -+ # handle write addrs -+ if (awmux == mwmux) and (awmux != WMUX_ANY): -+ asm_error('add/mul ops not allowed to write to same regfile') -+ ws = (awmux == WMUX_B) or (mwmux == WMUX_A) -+ -+ # handle branch -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ # check setf -+ if asf: -+ asm_error('setf not allowed on bra/brr') -+ -+ # check pack/unpack -+ if (pack != 0) or (unpack != 0): -+ asm_error('pack/unpack not allowed with bra/brr') -+ -+ # handle read address -+ if aarmux == RMUX_LABEL: -+ if (aop == AOP_BRA) and aaraddr[1]: -+ asm_warning('bra with rel label') -+ if (aop == AOP_BRR) and (not aaraddr[1]): -+ asm_warning('brr with abs label') -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ if aarmux == RMUX_ANY: -+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A -+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A): -+ asm_error('branch destination must be either label, immediate, or from regfile a') -+ if aarmux == RMUX_IMM: -+ imm = aaraddr -+ raddr = 0 # can't use RADDR_NOP -+ elif aarmux == RMUX_A: -+ if (aaraddr[1] != 0) or (aaraddr[2] != 0): -+ asm_error('rotation of read from regfile a not allowed with branch') -+ if aop == AOP_BRR: -+ asm_warning('brr with ra') -+ imm = 0 -+ raddr = aaraddr[0] -+ else: -+ assert 0 -+ -+ # check mul op is nop -+ if mop != MOP_NOP: -+ asm_error('mul op not allowed with branch') -+ -+ # check sig -+ if sig != SIG_NORMAL: -+ asm_error('no signal allowed with branch') -+ -+ if raddr >= 32: -+ asm_error('can only branch to register locations in physical regfile') -+ if raddr & 1: -+ asm_warning('branch instruction will destroy flags (see hw-2780)') -+ -+ # construct branch instruction -+ prog.append((imm, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28), -+ line, annots)) -+ -+ return -+ -+ # use COND_NEVER when possible (might save power / allow mul setf) -+ if not dict(annots).get('preserve_cond', 0): -+ if (awaddr == WADDR_NOP) and (not asf): -+ acond = COND_NEVER -+ if (mwaddr == WADDR_NOP) and (not msf): -+ mcond = COND_NEVER -+ -+ # attempt to convert movs to ldi -+ if (# no mul setf -+ (not msf) and -+ # ops must either be nop or mov of sema/label/imm/immv -+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ # but we don't want 2 nops -+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and -+ # if both ops are movs, srcs must be identical -+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and -+ # no signal -+ (sig == SIG_NORMAL)): -+ # make sure aarmux/aaraddr contains the value -+ if aop != AOP_MOV: -+ aarmux = marmux 
-+ aaraddr = maraddr -+ -+ # convert immediate -+ if aarmux == RMUX_SEMA: -+ ldi_mode = LDI_SEMA -+ elif aarmux == RMUX_LABEL: -+ ldi_mode = LDI_32 -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ elif aarmux == RMUX_IMMV: -+ signed, unsigned = True, True -+ imm = 0 -+ for i, elem in enumerate(aaraddr): -+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1): -+ signed = False -+ if elem not in (0, 1, 2, 3): -+ unsigned = False -+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i)) -+ if not (signed or unsigned): -+ asm_error('can\'t encode vector immediate') -+ if signed: -+ ldi_mode = LDI_EL_SIGNED -+ else: -+ ldi_mode = LDI_EL_UNSIGNED -+ aaraddr, aarmux = imm, RMUX_IMM -+ elif aarmux == RMUX_IMM: -+ ldi_mode = LDI_32 -+ else: -+ assert 0 -+ -+ # construct ldi instruction -+ prog.append((aaraddr, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28), -+ line, annots)) -+ -+ return -+ -+ # convert movs to alu ops -+ if aop == AOP_MOV: -+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0): -+ aop = AOP_XOR -+ aaraddr, aarmux = 0, RMUX_AC -+ abraddr, abrmux = 0, RMUX_AC -+ else: -+ aop = AOP_OR -+ abraddr, abrmux = aaraddr, aarmux -+ if mop == MOP_MOV: -+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0): -+ mop = MOP_V8SUBS -+ maraddr, marmux = 0, RMUX_AC -+ mbraddr, mbrmux = 0, RMUX_AC -+ else: -+ mop = MOP_V8MIN -+ mbraddr, mbrmux = maraddr, marmux -+ -+ # normal alu instruction... -+ -+ # handle setf -+ if asf and (aop == AOP_NOP): -+ asm_error('nop.setf is not allowed in add pipe') -+ if msf and (mop == MOP_NOP): -+ asm_warning('nop.setf, really?') -+ if (aop == AOP_NOP) or (acond == COND_NEVER): -+ sf = msf -+ else: -+ if msf: -+ asm_error('setf only allowed on mul op if add op is nop or add condition is never') -+ sf = asf -+ -+ # handle read addrs -+ raddr_a = None -+ raddr_b = None -+ immb = False -+ arot_r5 = False -+ muxes = [0, 0, 0, 0] -+ if mwrot != 0: -+ raddr_b = 48 + mwrot -+ immb = True -+ if mwrot_r5 and have_am: -+ raddr_b = 48 -+ immb = True -+ for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last -+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux): -+ if f(rmux): -+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux) -+ add_a, add_b, mul_a, mul_b = muxes -+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)): -+ # some output elements might not be as expected -+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)): -+ bad_elems = 0xffff -+ else: -+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111 -+ if mwrot > 12: -+ bad_elems ^= 0xffff -+ bad_elems &= dict(annots).get('mul_used', 0xffff) -+ if not msf: -+ if mwaddr == WADDR_NOP: -+ # not writing anywhere and not setting flags. no elements used -+ bad_elems = 0 -+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or -+ ((not ws) and (mwaddr == 37))): -+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/ -+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags. -+ # only use element 0 -+ bad_elems &= 0x0001 -+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or -+ ((not ws) and (mwaddr == 42))): -+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting -+ # flags. 
only use elements 0, 4, 8, and 12 -+ bad_elems &= 0x1111 -+ if bad_elems: -+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected') -+ if raddr_a is None: -+ raddr_a = RADDR_NOP -+ if raddr_b is None: -+ raddr_b = RADDR_NOP -+ if immb: -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates and signal don\'t mix') -+ sig = SIG_SMALLIMMED -+ if arot_r5 or (mwrot_r5 and (not have_am)): -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates/signal don\'t mix') -+ sig = SIG_ROTATE -+ -+ # construct instruction -+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29), -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28), -+ line, annots)) -+ finally: -+ current_location = prev_location -+ -+def preprocess_passthrough(file): -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ yield line_number, line -+ -+def asm_file(sets, location, filename, preprocess = None): -+ global current_dir, current_location -+ -+ if filename is None: -+ location = '' -+ file = sys.stdin -+ -+ prev_dir = current_dir -+ else: -+ filename = os.path.normpath(os.path.join(current_dir, filename)) -+ -+ try: -+ file = open(filename) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while opening file %s' % filename) -+ -+ prev_dir = current_dir -+ current_dir = os.path.dirname(filename) -+ -+ prev_location = current_location -+ current_location = location -+ -+ if preprocess is None: -+ preprocess = preprocess_passthrough -+ -+ try: -+ for line_number, line in preprocess(file): -+ # strip off comments and whitespace -+ line = line.split('#')[0].strip() -+ if line == '': -+ continue -+ -+ asm_line(sets, '%s: %d' % (current_location, line_number), line) -+ finally: -+ current_dir = prev_dir -+ current_location = prev_location -+ -+def asm_end_prog(): -+ # check we aren't in a multi-line construct (eg .macro or .rep) -+ if construct != None: -+ asm_error({ -+ CONSTRUCT_MACRO: '.macro without .endm', -+ CONSTRUCT_IF: '.if/.elif without .endif', -+ CONSTRUCT_ELSE: '.else without .endif', -+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]]) -+ -+ # check no warnings level back to 0 -+ if nwarn_level != 0: -+ asm_error('.pushnwarn without .popnwarn') -+ -+ # flush queued up data -+ asm_flush_prog_data() -+ -+ # fixup all the label references we can -+ for pc in xrange(len(prog)): -+ if isinstance(prog[pc][0], tuple): -+ location, label, rel, offset = prog[pc][0] -+ if label[0].isdigit(): -+ label_pcs = labels.get(label[:-1], []) -+ if label[-1] == 'b': -+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] -+ else: -+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] -+ if label_pcs == []: -+ asm_error('search for label reached begin/end of file', location = location) -+ imm = label_pcs[0] -+ elif label in labels: -+ imm = labels[label] -+ elif (':' + label) in labels: -+ imm = labels[':' + label] -+ elif external_link: -+ continue # let the external linker deal with it -+ else: -+ asm_error('undefined label', location = location) -+ imm = (imm * 8) + offset -+ if rel: -+ imm -= (pc + 4) * 8 # relative to instruction after delay slots -+ imm &= (1 << 32) - 1 -+ else: -+ if not external_link: -+ asm_error('can\'t get absolute address without using an external linker. 
this mode doesn\'t have an external linker', location = location) -+ imm = (location, label, rel, offset, imm) -+ prog[pc] = (imm,) + prog[pc][1:] -+ -+def asm_init(): -+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level -+ -+ current_dir = os.getcwd() -+ current_location = '' -+ prog = [] -+ prog_data = [] -+ macros = { -+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]), -+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])} -+ labels = {} -+ construct = None -+ construct_stack = [] -+ nwarn_level = 0 -+ -+def asm_reset_prog(): -+ global prog, labels -+ -+ prog = [] -+ labels = {} -+ -+############################################################################### -+# dumping -+############################################################################### -+ -+def print_lines(lines): -+ for line in lines: -+ print line -+ -+class dumper_t: -+ def external_link(self): return False -+ def begin(self): pass -+ def label(self, pc, name): pass -+ def line(self, pc, ls, ms, line, annots, first): pass -+ def end(self): pass -+ def sets(self, sets): pass -+ def direct(self, line): pass -+ -+class clif_dumper_t(dumper_t): -+ def __init__(self): -+ self.annot_mode = 0 -+ -+ def external_link(self): -+ return True -+ -+ def parse_annot_mode(self, line): -+ l = line.split(',') -+ self.annot_mode = int(l[0]) -+ if self.annot_mode not in (0, 1, 2): -+ asm_error('bad annot mode') -+ if self.annot_mode == 2: -+ if len(l) != 2: -+ asm_error('expected buffer name') -+ self.annot_name = l[1].strip() -+ self.annot_offset = 0 -+ elif len(l) != 1: -+ asm_error('unexpected comma') -+ -+ def label(self, pc, name): -+ if (self.annot_mode != 1) and (name[0] == ':'): -+ if self.annot_mode == 2: -+ name = name + '_annotations' -+ print '@label %s' % name[1:] -+ else: -+ print '// :%s' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if self.annot_mode == 0: -+ if isinstance(ls, tuple): -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ ls = '[. 
- %d + %d]' % (pc * 8, offset_from_prog) -+ else: -+ location, label, rel, offset = ls -+ if rel: -+ asm_error('relative external label references not allowed in this mode', location = location) -+ ls = '[%s + %d]' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ print '%s 0x%08x // %s' % (ls, ms, line) -+ elif self.annot_mode == 1: -+ print '// %s' % line -+ for annot in annots: -+ print '0x%08x 0x%08x // %s' % ({ -+ # todo: would rather not have these hard coded -+ 'mul_used': 1, -+ 'preserve_cond': 2, -+ 'geomd_open': 3, -+ 'geomd_i': 4, -+ 'geomd_tris_clear': 5, -+ 'geomd_verts': 6, -+ 'geomd_tris_add': 7, -+ 'geomd_tris_set_center': 8, -+ 'geomd_region_clear': 9, -+ 'geomd_region_set': 10, -+ 'geomd_images_clear': 11, -+ 'geomd_images_l': 12, -+ 'geomd_images_b': 13, -+ 'geomd_images_r': 14, -+ 'geomd_images_t': 15, -+ 'geomd_images_add_vpm': 16, -+ 'trace_4c': 17, -+ 'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0]) -+ if len(annots) != 0: -+ print '0x00000000 // end' -+ else: -+ assert self.annot_mode == 2 -+ if len(annots) == 0: -+ print '0x00000000 // %s' % line -+ else: -+ print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line) -+ self.annot_offset += (len(annots) * 8) + 4 -+ -+ def direct(self, line): -+ print line -+ -+class plain_dumper_t(dumper_t): -+ def line(self, pc, ls, ms, line, annots, first): -+ print '0x%08x, 0x%08x, // %s' % (ls, ms, line) -+ -+class c_c_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, array_name): -+ self.header_name = header_name -+ self.array_name = array_name -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ self.external_labels = set() -+ self.lines = [] -+ -+ print '#include "%s.h"' % self.header_name -+ print '' -+ print '#ifdef _MSC_VER' -+ print ' #include ' -+ print ' /* cast through uintptr_t to avoid warnings */' -+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))' -+ print '#else' -+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))' -+ print '#endif' -+ print '' -+ print '#ifdef __cplusplus' -+ print 'extern "C" { /* the types are probably wrong... 
*/' -+ print '#endif' -+ -+ def label(self, pc, name): -+ self.lines.append('// :%s' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple): -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog) -+ else: -+ location, label, rel, offset = ls -+ if rel: -+ asm_error('relative external label references not allowed in this mode', location = location) -+ if label not in self.external_labels: -+ self.external_labels.add(label) -+ print 'extern uint8_t %s[];' % label -+ ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line)) -+ -+ def end(self): -+ print '#ifdef __cplusplus' -+ print '}' -+ print '#endif' -+ print '' -+ print '#ifdef _MSC_VER' -+ print '__declspec(align(8))' -+ print '#elif defined(__GNUC__)' -+ print '__attribute__((aligned(8)))' -+ print '#endif' -+ print 'unsigned int %s[] = {' % self.array_name -+ print_lines(self.lines) -+ print '};' -+ print '#ifdef __HIGHC__' -+ print '#pragma Align_to(8, %s)' % self.array_name -+ print '#endif' -+ -+class c_h_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, array_name): -+ self.full_header_name = full_header_name -+ self.array_name = array_name -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '#ifndef %s_H' % self.full_header_name -+ print '#define %s_H' % self.full_header_name -+ print '' -+ print 'extern unsigned int %s[];' % self.array_name -+ print '' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2) -+ -+ def end(self): -+ print '' -+ print '#endif' -+ -+class ml_c_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, name, annots): -+ self.header_name = header_name -+ self.name = name -+ self.annots = annots -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ if self.annots: -+ self.annot_lines = [] -+ self.lines = [] -+ self.external_labels = set() -+ self.link_lines = [] -+ -+ print '#include "%s.h"' % self.header_name -+ print '#include ' -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print '#include ' -+ print '#include "v3d/verification/tools/2760sim/simpenrose.h"' -+ print '' -+ -+ def label(self, pc, name): -+ self.lines.append('// :%s' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if self.annots: -+ if len(annots) == 0: -+ self.annot_lines.append('NULL,') -+ else: -+ print 'static unsigned int const annotations_%d[] = {' % pc -+ for annot in annots: -+ print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]) -+ print ' SIMPENROSE_SHADER_ANNOTATION_END};' -+ print '' -+ self.annot_lines.append('annotations_%d,' % pc) -+ if isinstance(ls, tuple): -+ self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2)) -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog)) -+ else: -+ location, label, rel, offset = ls -+ self.external_labels.add(label) -+ if rel: -+ self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8)) -+ else: -+ self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset)) -+ ls = '0xdeadbeef' -+ else: -+ ls = '0x%08x' % ls -+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc 
* 8, ls, ms, line)) -+ -+ def end(self): -+ if self.annots: -+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name -+ print_lines(self.annot_lines) -+ print '};' -+ print '#endif' -+ print '' -+ print 'static unsigned int const array[] = {' -+ print_lines(self.lines) -+ print '};' -+ print '' -+ print 'void %s_link(void *p_in, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' )' -+ print '{' -+ print ' unsigned int *p = (unsigned int *)p_in;' -+ print ' unsigned int i;' -+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper() -+ print ' p[i] = array[i];' -+ print ' }' -+ print_lines(self.link_lines) -+ print '}' -+ -+class ml_h_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, name, annots): -+ self.full_header_name = full_header_name -+ self.name = name -+ self.annots = annots -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ self.external_labels = set() -+ self.lines_n = 0 -+ -+ print '#ifndef %s_H' % self.full_header_name -+ print '#define %s_H' % self.full_header_name -+ print '' -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name -+ print '#endif' -+ print '' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8) -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc) -+ print '#endif' -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple) and (len(ls) != 5): -+ self.external_labels.add(ls[1]) -+ self.lines_n += 1 -+ -+ def end(self): -+ print '' -+ print 'extern void %s_link(void *p, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' );' -+ print '' -+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8)) -+ print '' -+ print '#endif' -+ -+def print_lines_lc(lines): -+ for line in lines: -+ print '%s \\' % line -+ -+def print_groups_lc(groups): -+ first = True -+ for group in groups: -+ if first: -+ print '{ \\' -+ else: -+ print ', { \\' -+ print_lines_lc(group) -+ print '} \\' -+ first = False -+ -+class inline_c_dumper_t(dumper_t): -+ def __init__(self, annots): -+ self.annots = annots -+ self.iteration = False -+ -+ def begin_iteration(self): -+ assert not self.iteration -+ self.iteration = True -+ self.iteration_lines = [] -+ if self.annots: -+ self.iteration_annot_lines = [] -+ self.annot_arrs = [] -+ -+ def end_iteration(self): -+ assert self.iteration -+ self.iteration = False -+ print '%d, \\' % self.iteration_n -+ if self.annots: -+ print '( \\' -+ print_groups_lc(self.iteration_lines) -+ if self.annots: -+ print '), ( \\' -+ print_groups_lc(self.iteration_annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def begin(self): -+ self.n = 0 -+ self.lines = [] -+ if self.annots: -+ self.annot_lines = [] -+ if not self.iteration: -+ self.annot_arrs = [] -+ -+ def label(self, pc, name): -+ self.lines.append('/* :%s */' % name) -+ if self.annots: -+ self.annot_lines.append('/* :%s */' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ self.n += 1 -+ if first: -+ prefix = '' -+ else: -+ prefix = ', ' -+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line)) -+ if self.annots: -+ if len(annots) == 
0: -+ a = 'NULL' -+ else: -+ a = 'annotations_%d' % len(self.annot_arrs) -+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)] -+ for annot in annots: -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])) -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};') -+ self.annot_arrs.append(annot_arr) -+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line)) -+ -+ def end(self): -+ if self.iteration: -+ if len(self.iteration_lines) == 0: -+ self.iteration_n = self.n -+ elif self.iteration_n != self.n: -+ asm_error('number of instructions differs between iterations') -+ self.iteration_lines.append(self.lines) -+ if self.annots: -+ self.iteration_annot_lines.append(self.annot_lines) -+ else: -+ if self.annots: -+ print '( \\' -+ print_lines_lc(self.lines) -+ if self.annots: -+ print '), ( \\' -+ print_lines_lc(self.annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def direct(self, line): -+ print line -+ -+class asvc_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '.align 8' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '%s::' % name[1:] -+ else: -+ print '%s:' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple): -+ location, label, rel, offset = ls[:4] -+ if rel: -+ ls = '%s + %d - (. + 32)' % (label, offset) -+ else: -+ ls = '%s + %d' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ print '.word %s, 0x%08x ; %s' % (ls, ms, line) -+ -+def is_ra_or_rb(val): -+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B)) -+ -+class aliases_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '#ifndef JUST_DQASM_ARGS' -+ -+ def label(self, pc, name): -+ if not name[0].isdigit(): -+ if name[0] == ':': -+ name = name[1:] -+ print '"bs%s", "bs%x",' % (name, pc * 8) -+ print '"bu%s", "bu%x",' % (name, pc * 8) -+ -+ def end(self): -+ print '#endif' -+ -+ # todo: handle things other than ra and rb? 
dqasm only allows ra and rb atm -+ def sets(self, sets): -+ dqasm_args = [] -+ print '#ifndef JUST_DQASM_ARGS' -+ for name in sets: -+ if is_ra_or_rb(sets[name]): -+ dqasm_args.append('-r%s=%s' % (sets[name], name)) -+ print '"%s", "%s",' % (name, sets[name]) -+ elif isinstance(sets[name], list): -+ for i, val in enumerate(sets[name]): -+ if is_ra_or_rb(val): -+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i)) -+ print '"%s[%d]", "%s",' % (name, i, val) -+ print '#endif' -+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) -+ -+def dump(dumper): -+ if (len(prog) != 0) or (len(labels) != 0): -+ dumper.begin() -+ -+ sorted_labels = [] -+ for name in labels: -+ if name[0].isdigit(): -+ for pc in labels[name]: -+ sorted_labels.append((pc, name)) -+ else: -+ sorted_labels.append((labels[name], name)) -+ sorted_labels.sort(reverse = True) -+ -+ first = True -+ for pc in xrange(len(prog)): -+ ls, ms, line, annots = prog[pc] -+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc): -+ dumper.label(*sorted_labels.pop()) -+ dumper.line(pc, ls, ms, line, annots, first) -+ first = False -+ for sorted_label in sorted_labels: -+ assert sorted_label[0] == len(prog) -+ dumper.label(*sorted_label) -+ -+ dumper.end() -+ -+############################################################################### -+# preprocessing -+############################################################################### -+ -+def preprocess_inline_c(dumper): -+ def preprocess(file): -+ ls = None -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ while True: -+ if ls is None: -+ l = line.split('%[', 1) -+ if len(l) == 1: -+ dumper.direct(l[0].rstrip()) -+ break -+ dumper.direct('%s \\' % l[0].rstrip()) -+ line = l[1] -+ ls = [] -+ else: -+ l = line.split('%]', 1) -+ ls.append((line_number, l[0])) -+ if len(l) == 1: -+ break -+ line = l[1] -+ l = ls[-1][1].split('%|', 1) -+ if len(l) == 1: -+ for l_number, l in ls: -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ else: -+ ls[-1] = (ls[-1][0], l[0]) -+ if hasattr(dumper, 'begin_iteration'): -+ dumper.begin_iteration() -+ for repls in l[1].split('%,'): -+ repls = [repl.strip() for repl in repls.split('%/')] -+ for l_number, l in ls: -+ for i, repl in enumerate(repls): -+ l = l.replace('%' + str(i), repl) -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if hasattr(dumper, 'end_iteration'): -+ dumper.end_iteration() -+ ls = None -+ return preprocess -+ -+def preprocess_clif(dumper): -+ def preprocess(file): -+ in_asm = False -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ if in_asm: -+ if line.strip() == '%]': -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ in_asm = False -+ else: -+ yield line_number, line -+ else: -+ if line.strip() == '%[': -+ in_asm = True -+ elif (line[:1] == '%') and (line[:2] != '%@'): -+ yield line_number, line[1:] -+ else: -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if line[:2] == '%@': -+ if hasattr(dumper, 'parse_annot_mode'): -+ dumper.parse_annot_mode(line[2:]) -+ else: -+ dumper.direct(line.rstrip()) -+ return preprocess -+ -+############################################################################### -+# main -+############################################################################### ++import string ++import os ++import tempfile ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++class tstats: ++ close_threshold = 0.01 ++ ++ def __init__(self, stats_dict=None): ++ if 
stats_dict != None: ++ self.name = stats_dict["name"] ++ self.elapsed = float(stats_dict["elapsed"]) ++ self.user = float(stats_dict["user"]) ++ self.sys = float(stats_dict["sys"]) ++ ++ def times_str(self): ++ ctime = self.sys + self.user ++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) ++ ++ def dict(self): ++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} ++ ++ def is_close(self, other): ++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold ++ ++ def __lt__(self, other): ++ return self.elapsed < other.elapsed ++ def __gt__(self, other): ++ return self.elapsed > other.elapsed ++ ++ def time_file(name, prefix): ++ stats = tstats() ++ stats.name = name ++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, ++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); ++ pinfo = os.wait4(cproc.pid, 0) ++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ stats.elapsed = end_time - start_time ++ stats.user = pinfo[2].ru_utime ++ stats.sys = pinfo[2].ru_stime ++ return stats ++ ++ ++def common_prefix(s1, s2): ++ for i in range(min(len(s1),len(s2))): ++ if s1[i] != s2[i]: ++ return s1[:i] ++ return s1[:i+1] + +def main(): -+ global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5 -+ global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate ++ global flog + -+ asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work ++ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog=""" ++To blank the screen before starting use "xdg-screensaver activate" ++(For some reason this doesn't seem to work from within python). 
++""")
+
++    argp.add_argument("streams", nargs='*')
++    argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
++    argp.add_argument("--csv_in", help="CSV input filename")
++    argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
+
-+    # parse command line
-+    parser = optparse.OptionParser(usage = 'usage: %prog [options] <file>')
-+    parser.add_option('-m', '--mode', dest = 'mode',
-+        help = '<mode> should be clif, plain, ' +
-+        'c_c:<header name>,<full header name>,<name>, ' +
-+        'c_h:<header name>,<full header name>,<name>, ' +
-+        'ml_c:<header name>,<full header name>,<name>[,annots], ' +
-+        'ml_h:<header name>,<full header name>,<name>[,annots], ' +
-+        'inline_c[:annots], asvc, or aliases[:inline_c]', metavar = '<mode>')
-+    parser.add_option('-t', '--target', dest = 'target',
-+        help = '<target> should be a0, b0, or hera', metavar = '<target>')
-+    parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
-+    parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
-+    parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
-+    parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
-+    parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
-+    options, args = parser.parse_args()
-+    if len(args) == 0:
-+        filename = None
-+    elif len(args) == 1:
-+        filename = args[0]
-+    else:
-+        parser.print_help()
-+        sys.exit(-1)
++
++    args = argp.parse_args()
+
-+    # handle mode
-+    mode = options.mode or 'clif' # assume clif if no mode specified
-+    if mode == 'clif':
-+        dumper = clif_dumper_t()
-+        preprocess = preprocess_clif(dumper)
-+    elif mode == 'plain':
-+        dumper = plain_dumper_t()
-+        preprocess = None
-+    elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
-+        mode_options = mode[4:].split(',')
-+        if len(mode_options) != 3:
-+            asm_error('badly formatted mode on command line')
-+        dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
-+        preprocess = None
-+    elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
-+        mode_options = mode[5:].split(',')
-+        if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
-+            asm_error('badly formatted mode on command line')
-+        dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
-+            }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
-+        preprocess = None
-+    elif mode == 'inline_c':
-+        dumper = inline_c_dumper_t(False)
-+        preprocess = preprocess_inline_c(dumper)
-+    elif mode == 'inline_c:annots':
-+        dumper = inline_c_dumper_t(True)
-+        preprocess = preprocess_inline_c(dumper)
-+    elif mode == 'asvc':
-+        dumper = asvc_dumper_t()
-+        preprocess = None
-+    elif mode == 'aliases':
-+        dumper = aliases_dumper_t()
-+        preprocess = None
-+    elif mode == 'aliases:inline_c':
-+        dumper = aliases_dumper_t()
-+        preprocess = preprocess_inline_c(dumper)
-+    else:
-+        asm_error('invalid mode')
-+    external_link = dumper.external_link()
++
+
-+    # handle target
-+    target = options.target or 'b0' # assume b0 if no target specified
-+    if target == 'a0':
-+        have_sema = False
-+        have_am = False
-+        mulw_rotate = False
-+        have_lthrsw = False
-+    elif target == 'b0':
-+        have_sema = True
-+        have_am = True
-+        mulw_rotate = True
-+        have_lthrsw = True
-+    elif target == 'hera':
-+        have_sema = True
-+        have_am = False
-+        mulw_rotate = True
-+        have_lthrsw = True
-+    else:
-+        asm_error('invalid target')
-+    if have_am:
-+        sigs['loadam'] = SIG_LOADAM
-+        arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
-+    if have_lthrsw:
-+        sigs['lthrsw'] = SIG_LTHRSW
-+        del sigs['int']
-+        arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
++    csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
++    csv_out.writeheader()
+
-+    # handle misc options
-+    allow_xor_0 = options.allow_xor_0
-+    dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
-+    warnings_are_errors = options.warnings_are_errors
-+    disable_warnings = options.disable_warnings
++
++    stats_in = {}
++    if args.csv_in != None:
++        with open(args.csv_in, 'r', newline='') as f_in:
++            stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+
-+    # make options visible to asm
-+    arg_defs['mode'] = mode
-+    arg_defs['target'] = target
++    flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
+
-+    # arg_defs all setup at this point
-+    sets = arg_defs.copy() # todo: see arg_eval
++    streams = args.streams
++    if not streams:
++        if not stats_in:
++            print ("No source streams specified")
++            return 1
++        prefix = "" if args.prefix == None else args.prefix
++        streams = [k for k in stats_in]
++    elif args.prefix != None:
++        prefix = args.prefix
++    else:
++        prefix = streams[0]
++        for f in streams[1:]:
++            prefix = common_prefix(prefix, f)
++        pp = prefix.rpartition(os.sep)
++        prefix = pp[0] + pp[1]
++        streams = [s[len(prefix):] for s in streams]
+
-+    # handle command line sets
-+    re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
-+    for options_set in options.sets:
-+        m = re_options_set.match(options_set)
-+        if not m:
-+            asm_error('badly formatted set on command line')
-+        sets[m.group('name')] = arg_eval(m.group('val'), sets)
++    for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
++        print ("====", f)
++
++        t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
++        for i in range(3):
++            t = tstats.time_file(f, prefix)
++            print ("...", t.times_str())
++            if t0 > t:
++                t0 = t
++
++        if t0.name in stats_in:
++            pstat = stats_in[t0.name]
++            print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
++
++        csv_out.writerow(t0.dict())
++
++        print ()
++
++    return 0
+
-+    # assemble input file and dump
-+    asm_file(sets, filename, filename, preprocess)
-+    asm_end_prog()
-+    dump(dumper)
-+    for name in arg_defs: # todo: see arg_eval
-+        del sets[name]
-+    dumper.sets(sets)
+
+if __name__ == '__main__':
-+    main()
++    exit(main())
++
+diff --git a/pi-util/make_array.py b/pi-util/make_array.py
+new file mode 100755
+index 0000000000..864fa5e704
+--- /dev/null
++++ b/pi-util/make_array.py
+@@ -0,0 +1,19 @@
++#!/usr/bin/env python
++
++# Usage
++# make_array file.bin
++# Produces file.h with array of bytes.
++#
++import sys
++for file in sys.argv[1:]:
++    prefix,suffix = file.split('.')
++    assert suffix=='bin'
++    name=prefix.split('/')[-1]
++    print 'Converting',file
++    with open(prefix+'.h','wb') as out:
++        print >>out, 'static const unsigned char',name,'[] = {'
++        with open(file,'rb') as fd:
++            for byte in fd.read():
++                print >>out, '%d,' % ord(byte)
++        print >>out,'};'
++
diff --git a/pi-util/qem.sh b/pi-util/qem.sh
-new file mode 100644
-index 0000000..47dd071
+new file mode 100755
+index 0000000000..5ce2eeaf72
--- /dev/null
+++ b/pi-util/qem.sh
@@ -0,0 +1,9 @@
+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
-+QASM=python\ pi-util/qasm.py
++QASM=python\ ../local/bin/qasm.py
+SRC_FILE=libavcodec/rpi_shader.qasm
+DST_BASE=shader
+
@@ -21696,101 +30228,9 @@ index 0000000..47dd071
+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+
-diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
-new file mode 100755
-index 0000000..6a9a33f
---- /dev/null
-+++ b/pi-util/rebase_liblinks.py
-@@ -0,0 +1,37 @@
-+#!/usr/bin/env python
-+
-+import os, sys
-+from stat import *
-+
-+def walktree(top, callback, n, prefix):
-+    '''recursively descend the directory tree rooted at top,
-+    calling the callback function for each regular file'''
-+
-+    for f in os.listdir(top):
-+        pathname = os.path.join(top, f)
-+        mode = os.lstat(pathname).st_mode
-+        if S_ISDIR(mode):
-+            # It's a directory, recurse into it
-+            walktree(pathname, callback, n+1, prefix)
-+        elif S_ISLNK(mode):
-+            # It's a file, call the callback function
-+            callback(pathname, os.readlink(pathname), n, prefix)
-+
-+def visitfile(file, linkname, n, prefix):
-+    if (linkname.startswith(prefix + 'lib/')):
-+        newlink = "../" * n + linkname[len(prefix):]
-+        print 'relinking', file, "->", newlink
-+        os.remove(file)
-+        os.symlink(newlink, file)
-+
-+if __name__ == '__main__':
-+    argc = len(sys.argv)
-+    if argc == 2:
-+        walktree(sys.argv[1], visitfile, 0, "/")
-+    elif argc == 3:
-+        walktree(sys.argv[1], visitfile, 0, sys.argv[2])
-+    else:
-+        print "rebase_liblinks.py <dir> [<prefix>]"
-+
-+
-+
-diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
-new file mode 100755
-index 0000000..d8bdd91
---- /dev/null
-+++ b/pi-util/syncroot.sh
-@@ -0,0 +1,43 @@
-+set -e
-+
-+if [ "$1" == "" ]; then
-+ echo Usage: $0 \<src_dir\> [\<rootname\>]
-+ echo src_dir is a source for rsync so may contain m/c name.
-+ echo rootname will be set to \"raspian_jessie_pi1\" if missing -+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1 -+ exit 1 -+fi -+ -+SYSROOT_NAME=$2 -+if [ "$SYSROOT_NAME" == "" ]; then -+ SYSROOT_NAME=raspian_jessie_pi1 -+fi -+ -+DST_ROOT=`pwd` -+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot -+SRC=$1 -+ -+echo Sync src: $SRC -+echo Sync dest: $DST -+ -+mkdir -p $DST/lib -+mkdir -p $DST/opt/vc/include -+mkdir -p $DST/usr/lib/pkgconfig -+mkdir -p $DST/usr/bin -+mkdir -p $DST/usr/share -+ -+#### MUST NOT include /opt/vc/include/*GL* -+# Creates conflicts with GL includes inside Chrome -+ -+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib -+rsync -rl $SRC/opt/vc/lib $DST/opt/vc -+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include -+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib -+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib -+rsync -rl $SRC/usr/include $DST/usr -+ -+pi-util/rebase_liblinks.py $DST -+ -+ diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py -new file mode 100644 -index 0000000..5935a11 +new file mode 100755 +index 0000000000..5935a11ca5 --- /dev/null +++ b/pi-util/v3dusage.py @@ -0,0 +1,128 @@