From 34bc440c43f261effaabfefb6eef46a2766426a9 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Fri, 11 Mar 2022 17:11:52 +0100 Subject: [PATCH 1/2] ffmpeg: update rpi patch Patch created using revisions dc91b91..34fb1cd from branch dev/4.4/rpi_import_1 of https://github.com/jc-kynesim/rpi-ffmpeg --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 166 ++++++++++-------- 1 file changed, 88 insertions(+), 78 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index e5d29c0507..1ccf22ba72 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -47380,7 +47380,7 @@ index 8dbc7fc104..7d5fadcd3d 100644 /** * Enqueues a V4L2Buffer diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..b2c40636a2 100644 +index ff1ea8e57b..d8a86e8261 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -47625,7 +47625,7 @@ index ff1ea8e57b..b2c40636a2 100644 return 1; } -@@ -280,171 +291,267 @@ static int v4l2_stop_encode(V4L2Context *ctx) +@@ -280,171 +291,274 @@ static int v4l2_stop_encode(V4L2Context *ctx) return 0; } @@ -47698,6 +47698,7 @@ index ff1ea8e57b..b2c40636a2 100644 + + while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { + const int err = errno; ++ av_assert0(AVERROR(err) < 0); + if (err != EINTR) { + av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", + ctx->name, av_err2str(AVERROR(err))); @@ -47811,6 +47812,12 @@ index ff1ea8e57b..b2c40636a2 100644 - /* if re-init failed, abort */ - ctx->done = 1; - return NULL; +- } +- if (ret) { +- /* if re-init was successful drop the buffer (if there was one) +- * since we had to reconfigure capture (unmap all buffers) +- */ +- return NULL; + if (evt.type == V4L2_EVENT_SOURCE_CHANGE) + return do_source_change(m); + @@ -47821,6 +47828,7 @@ index ff1ea8e57b..b2c40636a2 100644 +// Get a buffer +// If output then just gets the buffer in the expected way +// If capture then runs the capture state m/c to deal with res change etc. ++// If return value == 0 then *ppavbuf != NULL + +static int +get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) @@ -47849,52 +47857,40 @@ index ff1ea8e57b..b2c40636a2 100644 + av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); + return AVERROR_EOF; } -- if (ret) { -- /* if re-init was successful drop the buffer (if there was one) -- * since we had to reconfigure capture (unmap all buffers) -- */ -- return NULL; -+ +- } + +- /* 2. dequeue the buffer */ +- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { + // If capture && timeout == -1 then also wait for rx buffer free + if (is_cap && timeout == -1 && m->output.streamon && !m->draining) + pfd.events |= poll_out; -+ + +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- /* there is a capture buffer ready */ +- if (pfd.revents & (POLLIN | POLLRDNORM)) +- goto dequeue; + // If nothing Qed all we will get is POLLERR - avoid that + if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || + (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || + (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); + return AVERROR(EAGAIN); - } -- } - -- /* 2. 
dequeue the buffer */ -- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { -+ // Timeout kludged s.t. "forever" eventually gives up & produces logging -+ // If waiting for an event when we have seen a last_frame then we expect -+ // it to be ready already so force a short timeout -+ ret = poll(&pfd, 1, -+ ff_v4l2_ctx_eos(ctx) ? 10 : -+ timeout == -1 ? 3000 : timeout); - -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -- /* there is a capture buffer ready */ -- if (pfd.revents & (POLLIN | POLLRDNORM)) -- goto dequeue; -+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", -+ ctx->name, ret, timeout, pfd.events, pfd.revents); ++ } - /* the driver is ready to accept more input; instead of waiting for the capture - * buffer to complete we return NULL so input can proceed (we are single threaded) - */ - if (pfd.revents & (POLLOUT | POLLWRNORM)) - return NULL; ++ // Timeout kludged s.t. "forever" eventually gives up & produces logging ++ // If waiting for an event when we have seen a last_frame then we expect ++ // it to be ready already so force a short timeout ++ ret = poll(&pfd, 1, ++ ff_v4l2_ctx_eos(ctx) ? 10 : ++ timeout == -1 ? 3000 : timeout); + if (ret < 0) { -+ const int err = errno; -+ if (err == EINTR) -+ continue; -+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, err, strerror(err)); -+ return AVERROR(err); ++ ret = AVERROR(errno); // Remember errno before logging etc. ++ av_assert0(ret < 0); } -dequeue: @@ -47905,6 +47901,23 @@ index ff1ea8e57b..b2c40636a2 100644 - memset(planes, 0, sizeof(planes)); - buf.length = VIDEO_MAX_PLANES; - buf.m.planes = planes; ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", ++ ctx->name, ret, timeout, pfd.events, pfd.revents); ++ ++ if (ret < 0) { ++ if (ret == AVERROR(EINTR)) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); ++ return ret; + } + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); +- if (ret) { +- if (errno != EAGAIN) { +- ctx->done = 1; +- if (errno != EPIPE) +- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", +- ctx->name, av_err2str(AVERROR(errno))); + if (ret == 0) { + if (timeout == -1) + av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); @@ -47915,19 +47928,11 @@ index ff1ea8e57b..b2c40636a2 100644 + ctx->done = 1; + return ret; + } -+ } -+ return AVERROR(EAGAIN); - } - -- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); -- if (ret) { -- if (errno != EAGAIN) { -- ctx->done = 1; -- if (errno != EPIPE) -- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", -- ctx->name, av_err2str(AVERROR(errno))); -- } + } - return NULL; ++ return AVERROR(EAGAIN); ++ } ++ + if ((pfd.revents & POLLERR) != 0) { + av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); + return AVERROR_UNKNOWN; @@ -47949,6 +47954,13 @@ index ff1ea8e57b..b2c40636a2 100644 - ctx->done = 1; -#endif + continue; ++ } ++ ++ if ((pfd.revents & poll_cap) != 0) { ++ ret = dq_buf(ctx, ppavbuf); ++ if (ret == AVERROR(EPIPE)) ++ continue; ++ return ret; } - avbuf = &ctx->buffers[buf.index]; @@ -47957,18 +47969,13 @@ index ff1ea8e57b..b2c40636a2 100644 - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; -+ if ((pfd.revents & poll_cap) != 0) { -+ ret = dq_buf(ctx, ppavbuf); -+ if (ret == AVERROR(EPIPE)) -+ 
continue; -+ return ret; ++ if ((pfd.revents & poll_out) != 0) { ++ if (is_cap) ++ return AVERROR(EAGAIN); ++ return dq_buf(ctx, ppavbuf); } - return avbuf; + -+ if ((pfd.revents & poll_out) != 0) { -+ return is_cap ? 0 : dq_buf(ctx, ppavbuf); -+ } -+ + av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); + return AVERROR_UNKNOWN; } @@ -48015,7 +48022,7 @@ index ff1ea8e57b..b2c40636a2 100644 } return NULL; -@@ -452,25 +559,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) +@@ -452,25 +566,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) static int v4l2_release_buffers(V4L2Context* ctx) { @@ -48045,18 +48052,18 @@ index ff1ea8e57b..b2c40636a2 100644 + .type = ctx->type, + .count = 0, /* 0 -> unmap all buffers from the driver */ + }; ++ ++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { ++ if (errno == EINTR) ++ continue; ++ ++ ret = AVERROR(errno); - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) - av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); -+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno == EINTR) -+ continue; -+ -+ ret = AVERROR(errno); -+ + av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", + ctx->name, av_err2str(AVERROR(errno))); + @@ -48075,7 +48082,7 @@ index ff1ea8e57b..b2c40636a2 100644 } static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -499,6 +626,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm +@@ -499,6 +633,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { @@ -48084,7 +48091,7 @@ index ff1ea8e57b..b2c40636a2 100644 enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; -@@ -517,6 +646,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) +@@ -517,6 +653,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) if (ret) return AVERROR(EINVAL); @@ -48098,7 +48105,7 @@ index ff1ea8e57b..b2c40636a2 100644 pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ -@@ -569,18 +705,84 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) +@@ -569,18 +712,84 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) * *****************************************************************************/ @@ -48170,24 +48177,24 @@ index ff1ea8e57b..b2c40636a2 100644 + { + if (cmd == VIDIOC_STREAMOFF) + flush_all_buffers_status(ctx); - -- ctx->streamon = (cmd == VIDIOC_STREAMON); ++ + ctx->streamon = (cmd == VIDIOC_STREAMON); + av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); + } -- return 0; +- ctx->streamon = (cmd == VIDIOC_STREAMON); + // Both stream off & on effectively clear flag_last + ctx->flag_last = 0; -+ + +- return 0; + ff_mutex_unlock(&ctx->lock); + + return ret; } int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) -@@ -608,7 +810,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) +@@ -608,7 +817,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) return ff_v4l2_buffer_enqueue(avbuf); } @@ -48197,7 +48204,7 @@ index ff1ea8e57b..b2c40636a2 100644 { V4L2m2mContext *s = ctx_to_m2mctx(ctx); V4L2Buffer* avbuf; -@@ -616,8 +819,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -616,8 +826,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!pkt->size) { ret = v4l2_stop_decode(ctx); @@ -48208,7 +48215,7 @@ index ff1ea8e57b..b2c40636a2 100644 s->draining = 1; return 0; } -@@ -626,8 +830,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -626,8 +837,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!avbuf) return AVERROR(EAGAIN); @@ -48222,7 +48229,7 @@ index ff1ea8e57b..b2c40636a2 100644 return ret; return ff_v4l2_buffer_enqueue(avbuf); -@@ -636,19 +843,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -636,19 +850,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { V4L2Buffer *avbuf; @@ -48245,7 +48252,7 @@ index ff1ea8e57b..b2c40636a2 100644 return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); } -@@ -656,19 +854,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) +@@ -656,19 +861,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) { V4L2Buffer *avbuf; @@ -48268,7 +48275,7 @@ index ff1ea8e57b..b2c40636a2 100644 return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); } -@@ -702,78 +891,158 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +898,158 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { @@ -61254,32 +61261,35 @@ index 0000000000..92bc13a3df + diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh new file mode 100755 -index 0000000000..98ab9d6de9 +index 0000000000..b3b2d5509d --- /dev/null +++ b/pi-util/clean_usr_libs.sh -@@ -0,0 +1,23 @@ +@@ -0,0 +1,26 @@ +set -e +U=/usr/lib/arm-linux-gnueabihf +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* +U=/usr/lib/arm-linux-gnueabihf/neon/vfp +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* +U=/usr/lib/aarch64-linux-gnu +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* + diff --git a/pi-util/conf_arm64_native.sh b/pi-util/conf_arm64_native.sh new file mode 100644 From 74d19598a5b2197b17b90d0c9d6503521d37bd03 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 24 Mar 2022 18:17:58 +0100 
Subject: [PATCH 2/2] ffmpeg: update rpi patch Patch created using revisions dc91b91..5bab299 from branch dev/4.4/rpi_import_1 of https://github.com/jc-kynesim/rpi-ffmpeg --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 3134 ++++++++++++++++- 1 file changed, 3099 insertions(+), 35 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 1ccf22ba72..f88fe6d562 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -460,6 +460,1990 @@ index 33a280cf69..be3b73e7c4 100644 +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h +$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h +endif +diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile +index 954461f81d..7078dc6089 100644 +--- a/libavcodec/aarch64/Makefile ++++ b/libavcodec/aarch64/Makefile +@@ -35,6 +35,8 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o + + # subsystems + NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o ++NEON-OBJS-$(CONFIG_BLOCKDSP) += aarch64/blockdsp_init_aarch64.o \ ++ aarch64/blockdsp_neon.o + NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o + NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o + NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o +@@ -44,10 +46,12 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o + NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ + aarch64/hpeldsp_neon.o + NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o +-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o ++NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ ++ aarch64/simple_idct_neon.o + NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o + NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o + NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o ++NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o + NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o + + # decoders/encoders +diff --git a/libavcodec/aarch64/blockdsp_init_aarch64.c b/libavcodec/aarch64/blockdsp_init_aarch64.c +new file mode 100644 +index 0000000000..9f3280f007 +--- /dev/null ++++ b/libavcodec/aarch64/blockdsp_init_aarch64.c +@@ -0,0 +1,42 @@ ++/* ++ * AArch64 NEON optimised block operations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++ ++#include "libavutil/attributes.h" ++#include "libavutil/cpu.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/blockdsp.h" ++ ++void ff_clear_block_neon(int16_t *block); ++void ff_clear_blocks_neon(int16_t *blocks); ++ ++av_cold void ff_blockdsp_init_aarch64(BlockDSPContext *c) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) { ++ c->clear_block = ff_clear_block_neon; ++ c->clear_blocks = ff_clear_blocks_neon; ++ } ++} +diff --git a/libavcodec/aarch64/blockdsp_neon.S b/libavcodec/aarch64/blockdsp_neon.S +new file mode 100644 +index 0000000000..e4a4959ccc +--- /dev/null ++++ b/libavcodec/aarch64/blockdsp_neon.S +@@ -0,0 +1,43 @@ ++/* ++ * AArch64 NEON optimised block operations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++function ff_clear_block_neon, export=1 ++ movi v0.16b, #0 ++ movi v1.16b, #0 ++ st1 {v0.16b, v1.16b}, [x0], #32 ++ st1 {v0.16b, v1.16b}, [x0], #32 ++ st1 {v0.16b, v1.16b}, [x0], #32 ++ st1 {v0.16b, v1.16b}, [x0] ++ ret ++endfunc ++ ++function ff_clear_blocks_neon, export=1 ++ movi v0.16b, #0 ++ movi v1.16b, #0 ++ .rept 23 ++ st1 {v0.16b, v1.16b}, [x0], #32 ++ .endr ++ st1 {v0.16b, v1.16b}, [x0] ++ ret ++endfunc +diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c +index 742a3372e3..eec21aa5a2 100644 +--- a/libavcodec/aarch64/idctdsp_init_aarch64.c ++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c +@@ -27,19 +27,29 @@ + #include "libavcodec/idctdsp.h" + #include "idct.h" + ++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++ + av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) + { + int cpu_flags = av_get_cpu_flags(); + +- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) { +- if (avctx->idct_algo == FF_IDCT_AUTO || +- avctx->idct_algo == FF_IDCT_SIMPLEAUTO || +- avctx->idct_algo == FF_IDCT_SIMPLENEON) { +- c->idct_put = ff_simple_idct_put_neon; +- c->idct_add = ff_simple_idct_add_neon; +- c->idct = ff_simple_idct_neon; +- c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ if (have_neon(cpu_flags)) { ++ if (!avctx->lowres && !high_bit_depth) { ++ if (avctx->idct_algo == FF_IDCT_AUTO || ++ avctx->idct_algo == FF_IDCT_SIMPLEAUTO || ++ avctx->idct_algo == 
FF_IDCT_SIMPLENEON) { ++ c->idct_put = ff_simple_idct_put_neon; ++ c->idct_add = ff_simple_idct_add_neon; ++ c->idct = ff_simple_idct_neon; ++ c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ } + } ++ ++ c->add_pixels_clamped = ff_add_pixels_clamped_neon; ++ c->put_pixels_clamped = ff_put_pixels_clamped_neon; ++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + } + } +diff --git a/libavcodec/aarch64/idctdsp_neon.S b/libavcodec/aarch64/idctdsp_neon.S +new file mode 100644 +index 0000000000..7f47611206 +--- /dev/null ++++ b/libavcodec/aarch64/idctdsp_neon.S +@@ -0,0 +1,130 @@ ++/* ++ * IDCT AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// Clamp 16-bit signed block coefficients to unsigned 8-bit ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit results ++// x2 = row stride for results, bytes ++function ff_put_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x1], x2 ++ sqxtun v0.8b, v5.8h ++ st1 {v1.8b}, [x1], x2 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x1], x2 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x1], x2 ++ st1 {v4.8b}, [x1], x2 ++ st1 {v0.8b}, [x1], x2 ++ st1 {v1.8b}, [x1], x2 ++ st1 {v2.8b}, [x1] ++ ret ++endfunc ++ ++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128) ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit results ++// x2 = row stride for results, bytes ++function ff_put_signed_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ movi v4.8b, #128 ++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] ++ sqxtn v0.8b, v0.8h ++ sqxtn v1.8b, v1.8h ++ sqxtn v2.8b, v2.8h ++ sqxtn v3.8b, v3.8h ++ sqxtn v5.8b, v16.8h ++ add v0.8b, v0.8b, v4.8b ++ sqxtn v6.8b, v17.8h ++ add v1.8b, v1.8b, v4.8b ++ sqxtn v7.8b, v18.8h ++ add v2.8b, v2.8b, v4.8b ++ sqxtn v16.8b, v19.8h ++ add v3.8b, v3.8b, v4.8b ++ st1 {v0.8b}, [x1], x2 ++ add v0.8b, v5.8b, v4.8b ++ st1 {v1.8b}, [x1], x2 ++ add v1.8b, v6.8b, v4.8b ++ st1 {v2.8b}, [x1], x2 ++ add v2.8b, v7.8b, v4.8b ++ st1 {v3.8b}, [x1], x2 ++ add v3.8b, v16.8b, v4.8b ++ st1 {v0.8b}, [x1], x2 ++ st1 {v1.8b}, [x1], x2 ++ st1 {v2.8b}, [x1], x2 ++ st1 {v3.8b}, [x1] ++ ret ++endfunc ++ ++// Add 16-bit signed block coefficients to unsigned 8-bit ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit input and results ++// x2 = row stride for 8-bit input and results, bytes ++function ff_add_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ mov x3, 
x1 ++ ld1 {v4.8b}, [x1], x2 ++ ld1 {v5.8b}, [x1], x2 ++ ld1 {v6.8b}, [x1], x2 ++ ld1 {v7.8b}, [x1], x2 ++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] ++ uaddw v0.8h, v0.8h, v4.8b ++ uaddw v1.8h, v1.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ ld1 {v4.8b}, [x1], x2 ++ uaddw v3.8h, v3.8h, v7.8b ++ ld1 {v5.8b}, [x1], x2 ++ sqxtun v0.8b, v0.8h ++ ld1 {v6.8b}, [x1], x2 ++ sqxtun v1.8b, v1.8h ++ ld1 {v7.8b}, [x1] ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ uaddw v4.8h, v16.8h, v4.8b ++ st1 {v0.8b}, [x3], x2 ++ uaddw v0.8h, v17.8h, v5.8b ++ st1 {v1.8b}, [x3], x2 ++ uaddw v1.8h, v18.8h, v6.8b ++ st1 {v2.8b}, [x3], x2 ++ uaddw v2.8h, v19.8h, v7.8b ++ sqxtun v4.8b, v4.8h ++ sqxtun v0.8b, v0.8h ++ st1 {v3.8b}, [x3], x2 ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ st1 {v4.8b}, [x3], x2 ++ st1 {v0.8b}, [x3], x2 ++ st1 {v1.8b}, [x3], x2 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc +diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c +index 13dfd74940..161d5a972b 100644 +--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c ++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c +@@ -21,10 +21,28 @@ + #include "libavutil/attributes.h" + #include "libavutil/cpu.h" + #include "libavutil/aarch64/cpu.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + + #include "config.h" + ++void ff_vc1_inv_trans_8x8_neon(int16_t *block); ++void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++ ++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++ ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); ++ + void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, +@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. 
*/ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) + { + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { ++ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; ++ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; ++ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; ++ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; ++ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; ++ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; ++ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; ++ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; ++ ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } + } +diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S +new file mode 100644 +index 0000000000..529c21d285 +--- /dev/null ++++ b/libavcodec/aarch64/vc1dsp_neon.S +@@ -0,0 +1,1552 @@ ++/* ++ * VC1 AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// VC-1 8x8 inverse transform ++// On entry: ++// x0 -> array of 16-bit inverse transform coefficients, in column-major order ++// On exit: ++// array at x0 updated to hold transformed block; also now held in row-major order ++function ff_vc1_inv_trans_8x8_neon, export=1 ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ ld1 {v3.16b, v4.16b}, [x0], #32 ++ ld1 {v5.16b, v6.16b}, [x0], #32 ++ shl v1.8h, v1.8h, #2 // 8/2 * src[0] ++ sub x1, x0, #3*32 ++ ld1 {v16.16b, v17.16b}, [x0] ++ shl v7.8h, v2.8h, #4 // 16 * src[8] ++ shl v18.8h, v2.8h, #2 // 4 * src[8] ++ shl v19.8h, v4.8h, #4 // 16 * src[24] ++ ldr d0, .Lcoeffs_it8 ++ shl v5.8h, v5.8h, #2 // 8/2 * src[32] ++ shl v20.8h, v6.8h, #4 // 16 * src[40] ++ shl v21.8h, v6.8h, #2 // 4 * src[40] ++ shl v22.8h, v17.8h, #4 // 16 * src[56] ++ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40] ++ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16] ++ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40] ++ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56] ++ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56] ++ shl v3.8h, v3.8h, #3 // 16/2 * src[16] ++ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v1.8h, v1.8h, #1 // 12/2 * src[0] ++ ssra v5.8h, v5.8h, #1 // 12/2 * src[32] ++ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ shl v21.8h, v16.8h, #3 // 16/2 * src[48] ++ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ neg v3.8h, v7.8h // -t1 ++ neg v4.8h, v20.8h // +t2 ++ neg v6.8h, v19.8h // +t3 ++ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1 ++ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1 ++ neg v7.8h, v18.8h // +t4 ++ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1 ++ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1 ++ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1 ++ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1 ++ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1 ++ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1 ++ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3 ++ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3 ++ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3 ++ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t8 - t4 
+ 4) >> 3 ++ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3 ++ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3 ++ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3 ++ trn2 v17.8h, v3.8h, v4.8h ++ trn2 v18.8h, v5.8h, v6.8h ++ trn2 v19.8h, v2.8h, v1.8h ++ trn2 v20.8h, v7.8h, v16.8h ++ trn1 v21.4s, v17.4s, v18.4s ++ trn2 v17.4s, v17.4s, v18.4s ++ trn1 v18.4s, v19.4s, v20.4s ++ trn2 v19.4s, v19.4s, v20.4s ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.2d, v21.2d, v18.2d ++ trn1 v20.2d, v17.2d, v19.2d ++ trn1 v5.8h, v5.8h, v6.8h ++ trn1 v1.8h, v2.8h, v1.8h ++ trn1 v2.8h, v7.8h, v16.8h ++ trn1 v6.2d, v21.2d, v18.2d ++ trn2 v7.2d, v17.2d, v19.2d ++ shl v16.8h, v20.8h, #4 // 16 * src[24] ++ shl v17.8h, v4.8h, #4 // 16 * src[40] ++ trn1 v18.4s, v3.4s, v5.4s ++ trn1 v19.4s, v1.4s, v2.4s ++ shl v21.8h, v7.8h, #4 // 16 * src[56] ++ shl v22.8h, v6.8h, #2 // 4 * src[8] ++ shl v23.8h, v4.8h, #2 // 4 * src[40] ++ trn2 v3.4s, v3.4s, v5.4s ++ trn2 v1.4s, v1.4s, v2.4s ++ shl v2.8h, v6.8h, #4 // 16 * src[8] ++ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40] ++ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40] ++ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56] ++ trn1 v22.2d, v18.2d, v19.2d ++ trn2 v18.2d, v18.2d, v19.2d ++ trn1 v19.2d, v3.2d, v1.2d ++ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56] ++ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ shl v21.8h, v22.8h, #2 // 8/2 * src[0] ++ shl v18.8h, v18.8h, #2 // 8/2 * src[32] ++ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ shl v6.8h, v19.8h, #3 // 16/2 * src[16] ++ trn2 v1.2d, v3.2d, v1.2d ++ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ ssra v21.8h, v21.8h, #1 // 12/2 * src[0] ++ ssra v18.8h, v18.8h, #1 // 12/2 * src[32] ++ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16] ++ shl v19.8h, v1.8h, #3 // 16/2 * src[48] ++ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ neg v21.8h, v17.8h // +t2 ++ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v4.8h, v5.8h // +t3 ++ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v24.8h, v16.8h // +t4 ++ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1 ++ neg v3.8h, v2.8h // -t1 ++ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1 ++ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1 ++ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1 ++ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1 ++ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1 ++ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1 ++ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1 ++ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7 ++ srshr v3.8h, 
v7.8h, #6 // (t6 + t2 + 64) >> 7 ++ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7 ++ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7 ++ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7 ++ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7 ++ st1 {v2.16b, v3.16b}, [x1], #32 ++ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7 ++ st1 {v4.16b, v5.16b}, [x1], #32 ++ st1 {v16.16b, v17.16b}, [x1], #32 ++ st1 {v0.16b, v1.16b}, [x1] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_neon, export=1 ++ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32 ++ mov x3, x0 ++ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ ld1 {v5.8b}, [x0], x1 ++ trn2 v6.4h, v1.4h, v3.4h ++ trn2 v7.4h, v2.4h, v4.4h ++ trn1 v1.4h, v1.4h, v3.4h ++ trn1 v2.4h, v2.4h, v4.4h ++ trn2 v3.4h, v16.4h, v18.4h ++ trn2 v4.4h, v17.4h, v19.4h ++ trn1 v16.4h, v16.4h, v18.4h ++ trn1 v17.4h, v17.4h, v19.4h ++ ld1 {v18.8b}, [x0], x1 ++ trn1 v19.2s, v6.2s, v3.2s ++ trn2 v3.2s, v6.2s, v3.2s ++ trn1 v6.2s, v7.2s, v4.2s ++ trn2 v4.2s, v7.2s, v4.2s ++ trn1 v7.2s, v1.2s, v16.2s ++ trn1 v20.2s, v2.2s, v17.2s ++ shl v21.4h, v19.4h, #4 // 16 * src[1] ++ trn2 v1.2s, v1.2s, v16.2s ++ shl v16.4h, v3.4h, #4 // 16 * src[3] ++ trn2 v2.2s, v2.2s, v17.2s ++ shl v17.4h, v6.4h, #4 // 16 * src[5] ++ ld1 {v22.8b}, [x0], x1 ++ shl v23.4h, v4.4h, #4 // 16 * src[7] ++ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2] ++ ld1 {v25.8b}, [x0] ++ shl v26.4h, v19.4h, #2 // 4 * src[1] ++ shl v27.4h, v6.4h, #2 // 4 * src[5] ++ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7] ++ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5] ++ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7] ++ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5] ++ shl v7.4h, v7.4h, #2 // 8/2 * src[0] ++ shl v20.4h, v20.4h, #2 // 8/2 * src[4] ++ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[2] ++ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5] ++ ssra v7.4h, v7.4h, #1 // 12/2 * src[0] ++ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5] ++ ssra v20.4h, v20.4h, #1 // 12/2 * src[4] ++ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7] ++ shl v3.4h, v2.4h, #3 // 16/2 * src[6] ++ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6] ++ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7] ++ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7] ++ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6] ++ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7] ++ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4] ++ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7] ++ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4] ++ neg v6.4h, v21.4h // -t1 ++ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ add v26.4h, v3.4h, 
v1.4h // t5/2 = t1/2 + t3/2 ++ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ neg v3.4h, v17.4h // +t2 ++ neg v4.4h, v16.4h // +t3 ++ neg v28.4h, v23.4h // +t4 ++ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1 ++ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1 ++ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1 ++ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1 ++ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1 ++ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1 ++ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1 ++ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1 ++ trn1 v1.2d, v7.2d, v1.2d ++ trn1 v2.2d, v20.2d, v2.2d ++ trn1 v3.2d, v24.2d, v27.2d ++ trn1 v4.2d, v19.2d, v26.2d ++ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3 ++ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3 ++ trn2 v6.8h, v1.8h, v2.8h ++ trn1 v1.8h, v1.8h, v2.8h ++ trn2 v2.8h, v3.8h, v4.8h ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.4s, v6.4s, v2.4s ++ trn1 v7.4s, v1.4s, v3.4s ++ trn2 v1.4s, v1.4s, v3.4s ++ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24] ++ trn1 v2.4s, v6.4s, v2.4s ++ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24] ++ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0] ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16] ++ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16] ++ neg v2.8h, v3.8h // -t4/2 ++ neg v6.8h, v4.8h // -t3/2 ++ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1 ++ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1 ++ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7 ++ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7 ++ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7 ++ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v18.8b ++ uaddw v2.8h, v2.8h, v22.8b ++ uaddw v3.8h, v3.8h, v25.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_neon, export=1 ++ mov x3, #16 ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33 ++ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43 ++ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53 ++ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63 ++ ld1 {v4.d}[1], [x2] // 70 71 72 73 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ ld1 {v7.s}[0], [x0], x1 ++ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53 ++ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52 ++ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73 ++ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 
32 60 70 62 72 ++ ld1 {v4.s}[0], [x0], x1 ++ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73 ++ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70 ++ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71 ++ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3] ++ ld1 {v5.s}[1], [x0], x1 ++ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3] ++ ld1 {v6.s}[1], [x0], x1 ++ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72 ++ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0] ++ ld1 {v7.s}[1], [x0], x1 ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2] ++ ld1 {v4.s}[1], [x0] ++ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2] ++ neg v3.8h, v16.8h // -t3/2 ++ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1 ++ neg v18.8h, v17.8h // -t4/2 ++ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1 ++ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 64) >> 3 ++ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 64) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 64) >> 3 ++ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 64) >> 3 ++ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73 ++ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71 ++ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61 ++ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63 ++ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53 ++ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73 ++ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43 ++ mov d18, v3.d[1] // 50 51 52 53 ++ shl v19.4h, v3.4h, #4 // 16 * src[8] ++ mov d20, v16.d[1] // 70 71 72 73 ++ shl v21.4h, v16.4h, #4 // 16 * src[24] ++ mov d22, v17.d[1] // 40 41 42 43 ++ shl v23.4h, v3.4h, #2 // 4 * src[8] ++ shl v24.4h, v18.4h, #4 // 16 * src[40] ++ shl v25.4h, v20.4h, #4 // 16 * src[56] ++ shl v26.4h, v18.4h, #2 // 4 * src[40] ++ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63 ++ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40] ++ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56] ++ shl v17.4h, v17.4h, #2 // 8/2 * src[0] ++ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40] ++ shl v22.4h, v22.4h, #2 // 8/2 * src[32] ++ mov d23, v1.d[1] // 60 61 62 63 ++ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56] ++ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[16] ++ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v17.4h, v17.4h, #1 // 12/2 * src[0] ++ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ ssra v22.4h, v22.4h, #1 // 12/2 * src[32] ++ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ shl v3.4h, v23.4h, #3 // 16/2 * src[48] ++ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 
++ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ neg v23.4h, v24.4h // +t2 ++ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ neg v17.4h, v21.4h // +t3 ++ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ neg v16.4h, v19.4h // -t1 ++ neg v27.4h, v2.4h // +t4 ++ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1 ++ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1 ++ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1 ++ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1 ++ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1 ++ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1 ++ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1 ++ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1 ++ trn1 v0.2d, v20.2d, v0.2d ++ trn1 v2.2d, v18.2d, v22.2d ++ trn1 v3.2d, v25.2d, v3.2d ++ trn1 v1.2d, v26.2d, v1.2d ++ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7 ++ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7 ++ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ uaddw v3.8h, v3.8h, v7.8b ++ uaddw v1.8h, v1.8h, v4.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v2.s}[0], [x4], x1 ++ st1 {v3.s}[0], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v2.s}[1], [x4], x1 ++ st1 {v3.s}[1], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_neon, export=1 ++ mov x3, #16 ++ ldr d0, .Lcoeffs_it4 ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2] // 30 31 32 33 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v5.s}[1], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13 ++ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12 ++ ld1 {v6.s}[1], [x0] ++ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33 ++ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32 ++ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33 ++ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30 ++ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31 ++ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32 ++ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3] ++ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2] ++ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2] ++ neg v7.4h, v3.4h // -t3/2 ++ neg v16.4h, v4.4h // -t4/2 ++ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1 ++ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1 ++ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 
64) >> 3 ++ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 64) >> 3 ++ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 64) >> 3 ++ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 64) >> 3 ++ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31 ++ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21 ++ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33 ++ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23 ++ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33 ++ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03 ++ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13 ++ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23 ++ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24] ++ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16] ++ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16] ++ neg v3.4h, v2.4h // -t4/2 ++ neg v7.4h, v4.4h // -t3/2 ++ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1 ++ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1 ++ trn1 v0.2d, v4.2d, v3.2d ++ trn1 v1.2d, v2.2d, v7.2d ++ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v6.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 8x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x8_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v3.8b}, [x0], x1 ++ ld1 {v4.8b}, [x0], x1 ++ add w2, w2, #1 ++ ld1 {v5.8b}, [x0], x1 ++ asr w2, w2, #1 ++ ld1 {v6.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v7.8b}, [x0] ++ add w0, w2, #16 ++ asr w0, w0, #5 ++ dup v16.8h, w0 ++ uaddw v0.8h, v16.8h, v0.8b ++ uaddw v1.8h, v16.8h, v1.8b ++ uaddw v2.8h, v16.8h, v2.8b ++ uaddw v3.8h, v16.8h, v3.8b ++ uaddw v4.8h, v16.8h, v4.8b ++ uaddw v5.8h, v16.8h, v5.8b ++ sqxtun v0.8b, v0.8h ++ uaddw v6.8h, v16.8h, v6.8b ++ sqxtun v1.8b, v1.8h ++ uaddw v7.8h, v16.8h, v7.8b ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x3], x1 ++ sqxtun v0.8b, v5.8h ++ st1 {v1.8b}, [x3], x1 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x3], x1 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x3], x1 ++ st1 {v4.8b}, [x3], x1 ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v3.8b}, [x0] ++ add w0, w2, #1 ++ asr w0, w0, #1 ++ add w0, w0, w0, lsl #4 ++ add w0, w0, #64 ++ asr 
w0, w0, #7 ++ dup v4.8h, w0 ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.s}[0], [x0], x1 ++ ld1 {v1.s}[0], [x0], x1 ++ ld1 {v2.s}[0], [x0], x1 ++ add w2, w2, w2, lsl #4 ++ ld1 {v3.s}[0], [x0], x1 ++ add w2, w2, #4 ++ asr w2, w2, #3 ++ add w2, w2, w2, lsl #1 ++ ld1 {v0.s}[1], [x0], x1 ++ add w2, w2, #16 ++ asr w2, w2, #5 ++ dup v4.8h, w2 ++ ld1 {v1.s}[1], [x0], x1 ++ ld1 {v2.s}[1], [x0], x1 ++ ld1 {v3.s}[1], [x0] ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v2.s}[0], [x3], x1 ++ st1 {v3.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3], x1 ++ st1 {v2.s}[1], [x3], x1 ++ st1 {v3.s}[1], [x3] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.s}[0], [x0], x1 ++ ld1 {v1.s}[0], [x0], x1 ++ ld1 {v0.s}[1], [x0], x1 ++ add w2, w2, w2, lsl #4 ++ ld1 {v1.s}[1], [x0] ++ add w0, w2, #4 ++ asr w0, w0, #3 ++ add w0, w0, w0, lsl #4 ++ add w0, w0, #64 ++ asr w0, w0, #7 ++ dup v2.8h, w0 ++ uaddw v0.8h, v2.8h, v0.8b ++ uaddw v1.8h, v2.8h, v1.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3] ++ ret ++endfunc ++ ++.align 5 ++.Lcoeffs_it8: ++.quad 0x000F00090003 ++.Lcoeffs_it4: ++.quad 0x0011000B0005 ++.Lcoeffs: ++.quad 0x00050002 ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.s}[0], [x0], x1 // P5 ++ ld1 {v2.s}[0], [x3], x1 // P1 ++ ld1 {v3.s}[0], [x3], x1 // P2 ++ ld1 {v4.s}[0], [x0], x1 // P6 ++ ld1 {v5.s}[0], [x3], x1 // P3 ++ ld1 {v6.s}[0], [x0], x1 // P7 ++ ld1 {v7.s}[0], [x3] // P4 ++ ld1 {v16.s}[0], [x0] // P8 ++ ushll v17.8h, v1.8b, #1 // 2*P5 ++ dup v18.8h, w2 // pq ++ ushll v2.8h, v2.8b, #1 // 2*P1 ++ uxtl v3.8h, v3.8b // P2 ++ uxtl v4.8h, v4.8b // P6 ++ uxtl v19.8h, v5.8b // P3 ++ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2 ++ uxtl v3.8h, v6.8b // P7 ++ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6 ++ ushll v5.8h, v5.8b, #1 // 2*P3 ++ uxtl v6.8h, v7.8b // P4 ++ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v3.8h, v16.8b // P8 ++ 
mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3 ++ uxtl v1.8h, v1.8b // P5 ++ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4 ++ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ sub v3.4h, v6.4h, v1.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ abs v4.4h, v3.4h ++ srshr v7.4h, v17.4h, #3 ++ srshr v2.4h, v2.4h, #3 ++ sshr v4.4h, v4.4h, #1 // clip ++ srshr v5.4h, v5.4h, #3 ++ abs v7.4h, v7.4h // a2 ++ sshr v3.4h, v3.4h, #8 // clip_sign ++ abs v2.4h, v2.4h // a1 ++ cmeq v16.4h, v4.4h, #0 // test clip == 0 ++ abs v17.4h, v5.4h // a0 ++ sshr v5.4h, v5.4h, #8 // a0_sign ++ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2 ++ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq ++ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign ++ bsl v19.8b, v7.8b, v2.8b // a3 ++ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v4.4h ++ tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v6.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3] ++ dup v5.8h, w2 // pq ++ trn1 v6.8b, v1.8b, v2.8b ++ trn2 v1.8b, v1.8b, v2.8b ++ trn1 v2.8b, v3.8b, v4.8b ++ trn2 v3.8b, v3.8b, v4.8b ++ trn1 v4.4h, v6.4h, v2.4h // P1, P5 ++ trn1 v7.4h, v1.4h, v3.4h // P2, P6 ++ trn2 v2.4h, v6.4h, v2.4h // P3, P7 ++ trn2 v1.4h, v1.4h, v3.4h // P4, P8 ++ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5 ++ uxtl v6.8h, v7.8b // P2, P6 ++ uxtl v7.8h, v2.8b // P3, P7 ++ uxtl v1.8h, v1.8b // P4, P8 ++ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6 ++ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7 ++ uxtl v4.8h, v4.8b // P1, P5 ++ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ mov d6, v6.d[1] // P6 ++ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ mov d4, v4.d[1] // P5 ++ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4 ++ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ sub v7.4h, v1.4h, v4.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ srshr v3.8h, v3.8h, #3 ++ abs v6.4h, v7.4h ++ sshr 
v7.4h, v7.4h, #8 // clip_sign ++ srshr v2.4h, v2.4h, #3 ++ abs v3.8h, v3.8h // a1, a2 ++ sshr v6.4h, v6.4h, #1 // clip ++ mov d16, v3.d[1] // a2 ++ abs v17.4h, v2.4h // a0 ++ cmeq v18.4h, v6.4h, #0 // test clip == 0 ++ sshr v2.4h, v2.4h, #8 // a0_sign ++ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2 ++ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq ++ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign ++ bsl v19.8b, v16.8b, v3.8b // a3 ++ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v6.4h ++ tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v3.8b, v4.8h ++ sqxtun v2.8b, v1.8h ++ st2 {v2.b, v3.b}[0], [x0], x1 ++ st2 {v2.b, v3.b}[1], [x0], x1 ++ st2 {v2.b, v3.b}[2], [x0], x1 ++ st2 {v2.b, v3.b}[3], [x0] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.8b}, [x3], x1 // P1 ++ ld1 {v4.8b}, [x3], x1 // P2 ++ ld1 {v5.8b}, [x0], x1 // P6 ++ ld1 {v6.8b}, [x3], x1 // P3 ++ ld1 {v7.8b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5 ++ ushll v3.8h, v3.8b, #1 // 2*P1 ++ ld1 {v17.8b}, [x3] // P4 ++ uxtl v4.8h, v4.8b // P2 ++ ld1 {v18.8b}, [x0] // P8 ++ uxtl v5.8h, v5.8b // P6 ++ dup v19.8h, w2 // pq ++ uxtl v20.8h, v6.8b // P3 ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v4.8h, v7.8b // P7 ++ ushll v6.8h, v6.8b, #1 // 2*P3 ++ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v7.8h, v17.8b // P4 ++ uxtl v17.8h, v18.8b // P8 ++ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v1.8h, v1.8b // P5 ++ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v4.8h, v7.8h, v1.8h // P4-P5 ++ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4 ++ mls v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v17.8h, v4.8h ++ sshr v4.8h, v4.8h, #8 // clip_sign ++ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v17.8h, v17.8h, #1 // clip ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v16.8h, v16.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v5.8h, v17.8h, #0 // test clip == 0 ++ srshr v3.8h, v3.8h, #3 ++ abs v16.8h, v16.8h // a2 ++ abs v3.8h, v3.8h // a1 ++ srshr v6.8h, v6.8h, #3 ++ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2 ++ abs v20.8h, v6.8h // a0 ++ sshr v6.8h, v6.8h, #8 // a0_sign ++ bsl v18.16b, v16.16b, v3.16b 
// a3 ++ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq ++ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign ++ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0 ++ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ cmtst v2.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v2.16b, v3.16b, v2.16b ++ cmhs v3.8h, v0.8h, v17.8h ++ and w0, w0, w2 ++ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip) ++ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered ++ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v7.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #2 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3], x1 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3] ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ dup v4.8h, w2 // pq ++ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... ++ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... 
++ trn1 v7.2s, v6.2s, v3.2s // P1 ++ trn1 v18.2s, v19.2s, v16.2s // P2 ++ trn2 v3.2s, v6.2s, v3.2s // P5 ++ trn2 v6.2s, v19.2s, v16.2s // P6 ++ trn1 v16.2s, v2.2s, v17.2s // P3 ++ trn2 v2.2s, v2.2s, v17.2s // P7 ++ ushll v7.8h, v7.8b, #1 // 2*P1 ++ trn1 v17.2s, v1.2s, v5.2s // P4 ++ ushll v19.8h, v3.8b, #1 // 2*P5 ++ trn2 v1.2s, v1.2s, v5.2s // P8 ++ uxtl v5.8h, v18.8b // P2 ++ uxtl v6.8h, v6.8b // P6 ++ uxtl v18.8h, v16.8b // P3 ++ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v2.8h, v2.8b // P7 ++ ushll v5.8h, v16.8b, #1 // 2*P3 ++ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v16.8h, v17.8b // P4 ++ uxtl v1.8h, v1.8b // P8 ++ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v2.8h, v3.8b // P5 ++ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v3.8h, v16.8h, v2.8h // P4-P5 ++ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4 ++ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v1.8h, v3.8h ++ sshr v3.8h, v3.8h, #8 // clip_sign ++ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v1.8h, v1.8h, #1 // clip ++ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v17.8h, v19.8h, #3 ++ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v6.8h, v1.8h, #0 // test clip == 0 ++ srshr v7.8h, v7.8h, #3 ++ abs v17.8h, v17.8h // a2 ++ abs v7.8h, v7.8h // a1 ++ srshr v5.8h, v5.8h, #3 ++ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2 ++ abs v19.8h, v5.8h // a0 ++ sshr v5.8h, v5.8h, #8 // a0_sign ++ bsl v18.16b, v17.16b, v7.16b // a3 ++ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq ++ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign ++ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0 ++ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v1.8h ++ and w5, w2, w3 ++ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip) ++ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v1.8b, v2.8h ++ sqxtun v0.8b, v16.8h ++ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[4], [x4], x1 ++ st2 {v0.b, v1.b}[5], [x4], x1 ++ st2 {v0.b, v1.b}[6], [x4], x1 ++ st2 {v0.b, v1.b}[7], [x4] ++2: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.16b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.16b}, [x3], x1 // P1 ++ ld1 {v4.16b}, [x3], x1 // P2 ++ ld1 {v5.16b}, [x0], x1 // P6 ++ ld1 {v6.16b}, [x3], x1 // P3 ++ ld1 {v7.16b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5[0..7] ++ ushll v17.8h, v3.8b, #1 // 2*P1[0..7] ++ ld1 {v18.16b}, [x3] // P4 ++ uxtl v19.8h, v4.8b // P2[0..7] ++ ld1 {v20.16b}, [x0] // P8 ++ uxtl v21.8h, v5.8b // P6[0..7] ++ dup v22.8h, w2 // pq ++ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15] ++ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15] ++ uxtl2 v4.8h, v4.16b // P2[8..15] ++ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ uxtl2 v5.8h, v5.16b // P6[8..15] ++ uxtl v23.8h, v6.8b // P3[0..7] ++ uxtl v24.8h, v7.8b // P7[0..7] ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ ushll v4.8h, v6.8b, #1 // 2*P3[0..7] ++ uxtl v25.8h, v18.8b // P4[0..7] ++ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl2 v26.8h, v6.16b // P3[8..15] ++ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl2 v7.8h, v7.16b // P7[8..15] ++ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15] ++ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl2 v18.8h, v18.16b // P4[8..15] ++ uxtl v23.8h, v20.8b // P8[0..7] ++ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ uxtl v24.8h, v1.8b // P5[0..7] ++ uxtl2 v20.8h, v20.16b // P8[8..15] ++ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl2 v1.8h, v1.16b // P5[8..15] ++ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7] ++ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15] ++ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v27.8h, v26.8h ++ sshr v26.8h, v26.8h, #8 // clip_sign[0..7] ++ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ abs v28.8h, v7.8h ++ sshr v27.8h, v27.8h, #1 // clip[0..7] ++ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v7.8h, v7.8h, #8 // 
clip_sign[8..15] ++ sshr v23.8h, v28.8h, #1 // clip[8..15] ++ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0 ++ srshr v17.8h, v17.8h, #3 ++ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0 ++ srshr v16.8h, v16.8h, #3 ++ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ abs v17.8h, v17.8h // a1[0..7] ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ srshr v3.8h, v3.8h, #3 ++ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v16.8h, v16.8h // a2[0..7] ++ srshr v19.8h, v19.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7] ++ abs v3.8h, v3.8h // a1[8..15] ++ srshr v4.8h, v4.8h, #3 ++ abs v19.8h, v19.8h // a2[8..15] ++ bsl v5.16b, v16.16b, v17.16b // a3[0..7] ++ srshr v6.8h, v6.8h, #3 ++ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8.15] ++ abs v17.8h, v4.8h // a0[0..7] ++ sshr v4.8h, v4.8h, #8 // a0_sign[0..7] ++ bsl v16.16b, v19.16b, v3.16b // a3[8..15] ++ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ abs v19.8h, v6.8h // a0[8..15] ++ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq ++ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7] ++ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7] ++ sshr v6.8h, v6.8h, #8 // a0_sign[8..15] ++ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq ++ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15] ++ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15] ++ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ cmtst v17.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ cmhs v19.8h, v3.8h, v27.8h ++ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v16.16b, v20.16b, v17.16b ++ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7]) ++ cmtst v2.2d, v5.2d, v2.2d ++ cmhs v3.8h, v0.8h, v23.8h ++ mov w4, v5.s[1] ++ mov w5, v5.s[3] ++ and w0, w0, w2 ++ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ orr v2.16b, v7.16b, v2.16b ++ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15]) ++ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and w2, w4, w5 ++ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ and w0, w0, w2 ++ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ sqxtun v2.8b, v25.8h ++ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case ++ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ sqxtun v0.8b, v24.8h ++ sqxtun2 v2.16b, v18.8h ++ sqxtun2 v0.16b, v1.8h ++ st1 {v2.16b}, [x3], x1 ++ st1 {v0.16b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #3 ++ ld1 {v3.8b}, [x3], x1 ++ add x5, x0, x1, lsl #2 ++ ld1 {v4.8b}, [x3], x1 ++ add x6, x4, x1, lsl #2 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3], x1 ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ ld1 {v2.8b}, [x3], x1 ++ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ ld1 {v19.8b}, [x3], x1 ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ ld1 {v4.8b}, [x3], x1 ++ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ ld1 {v21.8b}, [x3], x1 ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ ld1 {v6.8b}, [x3], x1 ++ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ ld1 {v23.8b}, [x3], x1 ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ ld1 {v17.8b}, [x3], x1 ++ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]... ++ ld1 {v25.8b}, [x3] ++ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]... ++ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]... ++ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]... ++ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... 
++ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]... ++ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]... ++ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]... ++ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]... ++ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]... ++ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]... ++ trn1 v31.2s, v19.2s, v27.2s // P1[0..7] ++ trn2 v19.2s, v19.2s, v27.2s // P5[0..7] ++ trn1 v27.2s, v21.2s, v23.2s // P2[0..7] ++ trn2 v21.2s, v21.2s, v23.2s // P6[0..7] ++ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]... ++ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]... ++ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]... ++ trn1 v24.2s, v29.2s, v23.2s // P1[8..15] ++ trn2 v23.2s, v29.2s, v23.2s // P5[8..15] ++ trn1 v26.2s, v25.2s, v18.2s // P2[8..15] ++ trn2 v18.2s, v25.2s, v18.2s // P6[8..15] ++ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... ++ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]... ++ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]... ++ ushll v5.8h, v31.8b, #1 // 2*P1[0..7] ++ ushll v6.8h, v19.8b, #1 // 2*P5[0..7] ++ trn1 v7.2s, v16.2s, v20.2s // P3[0..7] ++ uxtl v17.8h, v27.8b // P2[0..7] ++ trn2 v16.2s, v16.2s, v20.2s // P7[0..7] ++ uxtl v20.8h, v21.8b // P6[0..7] ++ trn1 v21.2s, v22.2s, v25.2s // P3[8..15] ++ ushll v24.8h, v24.8b, #1 // 2*P1[8..15] ++ trn2 v22.2s, v22.2s, v25.2s // P7[8..15] ++ ushll v25.8h, v23.8b, #1 // 2*P5[8..15] ++ trn1 v27.2s, v1.2s, v3.2s // P4[0..7] ++ uxtl v26.8h, v26.8b // P2[8..15] ++ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ uxtl v17.8h, v18.8b // P6[8..15] ++ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ trn1 v18.2s, v2.2s, v4.2s // P4[8..15] ++ uxtl v28.8h, v7.8b // P3[0..7] ++ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ uxtl v16.8h, v16.8b // P7[0..7] ++ uxtl v26.8h, v21.8b // P3[8..15] ++ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl v22.8h, v22.8b // P7[8..15] ++ ushll v7.8h, v7.8b, #1 // 2*P3[0..7] ++ uxtl v27.8h, v27.8b // P4[0..7] ++ trn2 v1.2s, v1.2s, v3.2s // P8[0..7] ++ ushll v3.8h, v21.8b, #1 // 2*P3[8..15] ++ trn2 v2.2s, v2.2s, v4.2s // P8[8..15] ++ uxtl v4.8h, v18.8b // P4[8..15] ++ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl v1.8h, v1.8b // P8[0..7] ++ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl v2.8h, v2.8b // P8[8..15] ++ uxtl v16.8h, v19.8b // P5[0..7] ++ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl v18.8h, v23.8b // P5[8..15] ++ dup v19.8h, w2 // pq ++ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7] ++ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15] ++ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ abs v23.8h, v21.8h ++ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v26.8h, v22.8h ++ sshr v21.8h, v21.8h, #8 // clip_sign[0..7] ++ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ sshr v23.8h, v23.8h, #1 // clip[0..7] ++ sshr 
v26.8h, v26.8h, #1 // clip[8..15] ++ mls v6.8h, v1.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v1.8h, v22.8h, #8 // clip_sign[8..15] ++ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0 ++ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0 ++ srshr v5.8h, v5.8h, #3 ++ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ srshr v2.8h, v6.8h, #3 ++ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ srshr v6.8h, v24.8h, #3 ++ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ abs v5.8h, v5.8h // a1[0..7] ++ srshr v24.8h, v25.8h, #3 ++ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ abs v2.8h, v2.8h // a2[0..7] ++ abs v6.8h, v6.8h // a1[8..15] ++ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v17.8h, v24.8h // a2[8..15] ++ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7] ++ srshr v3.8h, v3.8h, #3 ++ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8.15] ++ srshr v7.8h, v7.8h, #3 ++ bsl v20.16b, v2.16b, v5.16b // a3[0..7] ++ abs v2.8h, v3.8h // a0[8..15] ++ sshr v3.8h, v3.8h, #8 // a0_sign[8..15] ++ bsl v24.16b, v17.16b, v6.16b // a3[8..15] ++ abs v5.8h, v7.8h // a0[0..7] ++ sshr v6.8h, v7.8h, #8 // a0_sign[0..7] ++ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq ++ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15] ++ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15] ++ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq ++ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7] ++ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7] ++ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w7, v2.s[1] ++ mov w8, v2.s[3] ++ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? 
(5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ mov w2, v5.s[1] // move to gp reg ++ cmhs v2.8h, v3.8h, v26.8h ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v23.8h ++ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15]) ++ and w9, w7, w8 ++ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7]) ++ and w10, w2, w3 ++ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ and w9, w10, w9 ++ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case ++ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ sqxtun v2.8b, v4.8h ++ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v27.8h ++ sqxtun v1.8b, v16.8h ++ sqxtun v3.8b, v18.8h ++ tbnz w2, #0, 1f ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f ++ st2 {v0.b, v1.b}[4], [x5], x1 ++ st2 {v0.b, v1.b}[5], [x5], x1 ++ st2 {v0.b, v1.b}[6], [x5], x1 ++ st2 {v0.b, v1.b}[7], [x5] ++2: tbnz w7, #0, 3f ++ st2 {v2.b, v3.b}[0], [x4], x1 ++ st2 {v2.b, v3.b}[1], [x4], x1 ++ st2 {v2.b, v3.b}[2], [x4], x1 ++ st2 {v2.b, v3.b}[3], [x4] ++3: tbnz w8, #0, 4f ++ st2 {v2.b, v3.b}[4], [x6], x1 ++ st2 {v2.b, v3.b}[5], [x6], x1 ++ st2 {v2.b, v3.b}[6], [x6], x1 ++ st2 {v2.b, v3.b}[7], [x6] ++4: ret ++endfunc ++ ++// Copy at most the specified number of bytes from source to destination buffer, ++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence ++// On entry: ++// x0 -> source buffer ++// w1 = max number of bytes to copy ++// x2 -> destination buffer, optimally 8-byte aligned ++// On exit: ++// w0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ // Offset by 80 to screen out cases that are too short for us to handle, ++ // and also make it easy to test for loop termination, or to determine ++ // whether we need an odd number of half-iterations of the loop. 
++        subs            w1, w1, #80
++        b.mi            90f
++
++        // Set up useful constants
++        movi            v20.4s, #3, lsl #24
++        movi            v21.4s, #3, lsl #16
++
++        tst             w1, #32
++        b.ne            1f
++
++        ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
++        ext             v25.16b, v0.16b, v1.16b, #1
++        ext             v26.16b, v0.16b, v1.16b, #2
++        ext             v27.16b, v0.16b, v1.16b, #3
++        ext             v29.16b, v1.16b, v2.16b, #1
++        ext             v30.16b, v1.16b, v2.16b, #2
++        ext             v31.16b, v1.16b, v2.16b, #3
++        bic             v24.16b, v0.16b, v20.16b
++        bic             v25.16b, v25.16b, v20.16b
++        bic             v26.16b, v26.16b, v20.16b
++        bic             v27.16b, v27.16b, v20.16b
++        bic             v28.16b, v1.16b, v20.16b
++        bic             v29.16b, v29.16b, v20.16b
++        bic             v30.16b, v30.16b, v20.16b
++        bic             v31.16b, v31.16b, v20.16b
++        eor             v24.16b, v24.16b, v21.16b
++        eor             v25.16b, v25.16b, v21.16b
++        eor             v26.16b, v26.16b, v21.16b
++        eor             v27.16b, v27.16b, v21.16b
++        eor             v28.16b, v28.16b, v21.16b
++        eor             v29.16b, v29.16b, v21.16b
++        eor             v30.16b, v30.16b, v21.16b
++        eor             v31.16b, v31.16b, v21.16b
++        cmeq            v24.4s, v24.4s, #0
++        cmeq            v25.4s, v25.4s, #0
++        cmeq            v26.4s, v26.4s, #0
++        cmeq            v27.4s, v27.4s, #0
++        add             w1, w1, #32
++        b               3f
++
++1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
++        ext             v25.16b, v3.16b, v4.16b, #1
++        ext             v26.16b, v3.16b, v4.16b, #2
++        ext             v27.16b, v3.16b, v4.16b, #3
++        ext             v29.16b, v4.16b, v5.16b, #1
++        ext             v30.16b, v4.16b, v5.16b, #2
++        ext             v31.16b, v4.16b, v5.16b, #3
++        bic             v24.16b, v3.16b, v20.16b
++        bic             v25.16b, v25.16b, v20.16b
++        bic             v26.16b, v26.16b, v20.16b
++        bic             v27.16b, v27.16b, v20.16b
++        bic             v28.16b, v4.16b, v20.16b
++        bic             v29.16b, v29.16b, v20.16b
++        bic             v30.16b, v30.16b, v20.16b
++        bic             v31.16b, v31.16b, v20.16b
++        eor             v24.16b, v24.16b, v21.16b
++        eor             v25.16b, v25.16b, v21.16b
++        eor             v26.16b, v26.16b, v21.16b
++        eor             v27.16b, v27.16b, v21.16b
++        eor             v28.16b, v28.16b, v21.16b
++        eor             v29.16b, v29.16b, v21.16b
++        eor             v30.16b, v30.16b, v21.16b
++        eor             v31.16b, v31.16b, v21.16b
++        cmeq            v24.4s, v24.4s, #0
++        cmeq            v25.4s, v25.4s, #0
++        cmeq            v26.4s, v26.4s, #0
++        cmeq            v27.4s, v27.4s, #0
++        // Drop through...
++2: mov v0.16b, v5.16b ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v0.16b, v1.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v0.16b, v1.16b, #2 ++ ext v27.16b, v0.16b, v1.16b, #3 ++ ext v29.16b, v1.16b, v2.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v1.16b, v2.16b, #2 ++ ext v31.16b, v1.16b, v2.16b, #3 ++ bic v24.16b, v0.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v1.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 90f ++ st1 {v3.16b, v4.16b}, [x2], #32 ++3: mov v3.16b, v2.16b ++ ld1 {v4.16b, v5.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v3.16b, v4.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v3.16b, v4.16b, #2 ++ ext v27.16b, v3.16b, v4.16b, #3 ++ ext v29.16b, v4.16b, v5.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v4.16b, v5.16b, #2 ++ ext v31.16b, v4.16b, v5.16b, #3 ++ bic v24.16b, v3.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v4.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 91f ++ st1 {v0.16b, v1.16b}, [x2], #32 ++ subs w1, w1, #64 ++ b.pl 2b ++ ++90: add w0, w1, #80 ++ ret ++ ++91: sub w1, w1, #32 ++ b 90b ++endfunc diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 2e9a3581de..d9571b437f 100644 --- a/libavcodec/allcodecs.c @@ -15303,6 +17287,887 @@ index 0000000000..af8c4c03f0 + bx lr + +endfunc +diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c +index 2cca784f5a..48cb816b70 100644 +--- a/libavcodec/arm/vc1dsp_init_neon.c ++++ b/libavcodec/arm/vc1dsp_init_neon.c +@@ -19,6 +19,7 @@ + #include + + #include "libavutil/attributes.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + #include "vc1dsp.h" + +@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc + void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + void 
ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); ++ + void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int rnd); + +@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. */ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + #define FN_ASSIGN(X, Y) \ + dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ + dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon +@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) + dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; + dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; + FN_ASSIGN(1, 0); + FN_ASSIGN(2, 0); +@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } +diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S +index 93f043bf08..8e97bc5e58 100644 +--- a/libavcodec/arm/vc1dsp_neon.S ++++ b/libavcodec/arm/vc1dsp_neon.S +@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1 + vst1.32 {d1[1]}, [r0,:32] + bx lr + endfunc ++ ++@ VC-1 
in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1[0]}, [r0], r1 @ P5 ++ vld1.32 {d2[0]}, [r3], r1 @ P1 ++ vld1.32 {d3[0]}, [r3], r1 @ P2 ++ vld1.32 {d4[0]}, [r0], r1 @ P6 ++ vld1.32 {d5[0]}, [r3], r1 @ P3 ++ vld1.32 {d6[0]}, [r0], r1 @ P7 ++ vld1.32 {d7[0]}, [r3] @ P4 ++ vld1.32 {d16[0]}, [r0] @ P8 ++ vshll.u8 q9, d1, #1 @ 2*P5 ++ vdup.16 d17, r2 @ pq ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vmovl.u8 q11, d3 @ P2 ++ vmovl.u8 q1, d4 @ P6 ++ vmovl.u8 q12, d5 @ P3 ++ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q11, d6 @ P7 ++ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmovl.u8 q3, d7 @ P4 ++ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q11, d16 @ P8 ++ vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3 ++ vmovl.u8 q12, d1 @ P5 ++ vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4 ++ vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vsub.i16 d1, d6, d24 @ P4-P5 ++ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5 ++ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vabs.s16 d2, d1 ++ vrshr.s16 d3, d18, #3 ++ vrshr.s16 d5, d20, #3 ++ vshr.s16 d2, d2, #1 @ clip ++ vrshr.s16 d4, d4, #3 ++ vabs.s16 d3, d3 @ a2 ++ vshr.s16 d1, d1, #8 @ clip_sign ++ vabs.s16 d5, d5 @ a1 ++ vceq.i16 d7, d2, #0 @ test clip == 0 ++ vabs.s16 d16, d4 @ a0 ++ vshr.s16 d4, d4, #8 @ a0_sign ++ vcge.s16 d18, d5, d3 @ test a1 >= a2 ++ vcge.s16 d17, d16, d17 @ test a0 >= pq ++ vbsl d18, d3, d5 @ a3 ++ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign ++ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d5, d18, d16 @ test a3 >= a0 ++ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vcge.s16 d4, d0, d2 ++ tst r0, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d4, d2, d0 @ FFMIN(d, clip) ++ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q12 ++ vst1.32 {d0[0]}, [r3], r1 ++ vst1.32 {d1[0]}, [r3] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3] ++ vdup.16 d1, r2 @ pq ++ vtrn.8 q1, q2 ++ vtrn.16 d2, d3 @ P1, P5, P3, P7 ++ vtrn.16 d4, d5 @ P2, P6, P4, P8 ++ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5 ++ vmovl.u8 q8, d4 @ P2, P6 ++ vmovl.u8 q9, d3 @ P3, P7 ++ vmovl.u8 q2, d5 @ P4, P8 ++ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6 ++ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7 ++ vmovl.u8 q1, d2 @ P1, P5 ++ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later ++ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4 ++ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5 ++ vsub.i16 d3, d4, d2 @ P4-P5 ++ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 d5, d3 ++ vshr.s16 d3, d3, #8 @ clip_sign ++ vrshr.s16 d16, d20, #3 ++ vabs.s16 q3, q3 @ a1, a2 ++ vshr.s16 d5, d5, #1 @ clip ++ vabs.s16 d17, d16 @ a0 ++ vceq.i16 d18, d5, #0 @ test clip == 0 ++ vshr.s16 d16, d16, #8 @ a0_sign ++ vcge.s16 d19, d6, d7 @ test a1 >= a2 ++ vcge.s16 d1, d17, d1 @ test a0 >= pq ++ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign ++ vbsl d19, d7, d6 @ a3 ++ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d6, d19, d17 @ test a3 >= a0 @ ++ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d3[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vcge.s16 d3, d0, d5 ++ tst r2, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d3, d5, d0 @ FFMIN(d, clip) ++ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d0, q2 ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1}, [r0], r1 @ P5 ++ vld1.32 {d2}, [r3], r1 @ P1 ++ vld1.32 {d3}, [r3], r1 @ P2 ++ vld1.32 {d4}, [r0], r1 @ P6 ++ vld1.32 {d5}, [r3], r1 @ P3 ++ vld1.32 {d6}, [r0], r1 @ P7 ++ vshll.u8 q8, d1, #1 @ 2*P5 ++ vshll.u8 q9, d2, #1 @ 2*P1 ++ vld1.32 {d7}, [r3] @ P4 ++ vmovl.u8 q1, d3 @ P2 ++ vld1.32 {d20}, [r0] @ P8 ++ vmovl.u8 q11, d4 @ P6 ++ vdup.16 q12, r2 @ pq ++ vmovl.u8 q13, d5 @ P3 ++ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q1, d6 @ P7 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q3, d7 @ P4 ++ vmovl.u8 q10, d20 @ P8 ++ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q1, d1 @ P5 ++ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q13, q3, q1 @ P4-P5 ++ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q10, q13 ++ vshr.s16 q13, q13, #8 @ clip_sign ++ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q10, q10, #1 @ clip ++ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q11, q10, #0 @ test clip == 0 ++ vrshr.s16 q9, q9, #3 ++ vabs.s16 q8, q8 @ a2 ++ vabs.s16 q9, q9 @ a1 ++ vrshr.s16 q2, q2, #3 ++ vcge.s16 q14, q9, q8 @ test a1 >= a2 ++ vabs.s16 q15, q2 @ a0 ++ vshr.s16 q2, q2, #8 @ a0_sign ++ vbsl q14, q8, q9 @ a3 ++ vcge.s16 q8, q15, q12 @ test a0 >= pq ++ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign ++ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q12, q14, q15 @ test a3 >= a0 ++ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vshl.i64 q11, q9, #16 ++ vmov.32 r0, d18[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vmov.32 r2, d19[1] ++ vshr.s64 q9, q11, #48 ++ vcge.s16 q11, q0, q10 ++ vorr q8, q8, q9 ++ and r0, r0, r2 ++ vbsl q11, q10, q0 @ FFMIN(d, clip) ++ tst r0, #1 ++ bne 1f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q11, q8 @ set each d to zero if it should not be filtered ++ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q1 ++ vst1.32 {d0}, [r3], r1 ++ vst1.32 {d1}, [r3] ++1: bx lr ++endfunc ++ ++.align 5 ++.Lcoeffs: ++.quad 0x00050002 ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ push {lr} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ add r12, r0, r1, lsl #2 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d7}, [r3], r1 ++ vld1.32 {d17}, [r3] ++ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]... ++ vdup.16 q9, r2 @ pq ++ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]... ++ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[5], P3[7], P7[4]... ++ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.32 d2, d6 @ P1, P5 ++ vtrn.32 d4, d16 @ P2, P6 ++ vtrn.32 d3, d7 @ P3, P7 ++ vtrn.32 d5, d17 @ P4, P8 ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vshll.u8 q11, d6, #1 @ 2*P5 ++ vmovl.u8 q12, d4 @ P2 ++ vmovl.u8 q13, d16 @ P6 ++ vmovl.u8 q14, d3 @ P3 ++ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q12, d7 @ P7 ++ vshll.u8 q1, d3, #1 @ 2*P3 ++ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q2, d5 @ P4 ++ vmovl.u8 q8, d17 @ P8 ++ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q3, d6 @ P5 ++ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q12, q2, q3 @ P4-P5 ++ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q8, q12 ++ vshr.s16 q12, q12, #8 @ clip_sign ++ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q8, q8, #1 @ clip ++ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q11, q11, #3 ++ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q13, q8, #0 @ test clip == 0 ++ vrshr.s16 q10, q10, #3 ++ vabs.s16 q11, q11 @ a2 ++ vabs.s16 q10, q10 @ a1 ++ vrshr.s16 q1, q1, #3 ++ vcge.s16 q14, q10, q11 @ test a1 >= a2 ++ vabs.s16 q15, q1 @ a0 ++ vshr.s16 q1, q1, #8 @ a0_sign ++ vbsl q14, q11, q10 @ a3 ++ vcge.s16 q9, q15, q9 @ test a0 >= pq ++ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign ++ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q11, q14, q15 @ test a3 >= a0 ++ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d20[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vmov.32 r3, d21[1] ++ vcge.s16 q10, q0, q8 ++ and r14, r2, r3 ++ vbsl q10, q8, q0 @ FFMIN(d, clip) ++ tst r14, #1 ++ bne 2f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q3 ++ vqmovun.s16 d0, q2 ++ tst r2, #1 ++ bne 1f @ none of the first 4 pixel pairs should be updated if so ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: tst r3, #1 ++ bne 2f @ none of the second 4 pixel pairs should be updated if so ++ vst2.8 {d0[4], d1[4]}, [r12], r1 ++ vst2.8 {d0[5], d1[5]}, [r12], r1 ++ vst2.8 {d0[6], d1[6]}, [r12], r1 ++ vst2.8 {d0[7], d1[7]}, [r12] ++2: pop {pc} ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ vpush {d8-d15} ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.64 {q1}, [r0], r1 @ P5 ++ vld1.64 {q2}, [r3], r1 @ P1 ++ vld1.64 {q3}, [r3], r1 @ P2 ++ vld1.64 {q4}, [r0], r1 @ P6 ++ vld1.64 {q5}, [r3], r1 @ P3 ++ vld1.64 {q6}, [r0], r1 @ P7 ++ vshll.u8 q7, d2, #1 @ 2*P5[0..7] ++ vshll.u8 q8, d4, #1 @ 2*P1[0..7] ++ vld1.64 {q9}, [r3] @ P4 ++ vmovl.u8 q10, d6 @ P2[0..7] ++ vld1.64 {q11}, [r0] @ P8 ++ vmovl.u8 q12, d8 @ P6[0..7] ++ vdup.16 q13, r2 @ pq ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vshll.u8 q10, d3, #1 @ 2*P5[8..15] ++ vmovl.u8 q3, d7 @ P2[8..15] ++ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q4, d9 @ P6[8..15] ++ vmovl.u8 q14, d10 @ P3[0..7] ++ vmovl.u8 q15, d12 @ P7[0..7] ++ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vshll.u8 q3, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q6, d13 @ P7[8..15] ++ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q14, d18 @ P4[0..7] ++ vmovl.u8 q9, d19 @ P4[8..15] ++ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vmovl.u8 q15, d11 @ P3[8..15] ++ vshll.u8 q5, d11, #1 @ 2*P3[8..15] ++ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q15, d22 @ P8[0..7] ++ vmovl.u8 q11, d23 @ P8[8..15] ++ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q6, d2 @ P5[0..7] ++ vmovl.u8 q1, d3 @ P5[8..15] ++ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7] ++ 
vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q7, q7, #3 ++ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vabs.s16 q11, q15 ++ vabs.s16 q8, q8 @ a1[0..7] ++ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vshr.s16 q15, q15, #8 @ clip_sign[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q7, q7 @ a2[0..7] ++ vrshr.s16 q10, q10, #3 ++ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15] ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 q10, q10 @ a2[8..15] ++ vbsl q4, q7, q8 @ a3[0..7] ++ vabs.s16 q7, q12 ++ vshr.s16 q8, q12, #8 @ clip_sign[8..15] ++ vrshr.s16 q5, q5, #3 ++ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8.15] ++ vshr.s16 q7, q7, #1 @ clip[8..15] ++ vbsl q12, q10, q2 @ a3[8..15] ++ vabs.s16 q2, q3 @ a0[0..7] ++ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0 ++ vshr.s16 q3, q3, #8 @ a0_sign[0..7] ++ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7] ++ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq ++ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7] ++ vabs.s16 q4, q5 @ a0[8..15] ++ vshr.s16 q5, q5, #8 @ a0_sign[8..15] ++ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq ++ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15] ++ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0 ++ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vmov.32 r2, d5[1] ++ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15] ++ vshl.i64 q2, q2, #16 ++ vcge.s16 q12, q15, q11 ++ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vshr.s64 q2, q2, #48 ++ and r0, r0, r2 ++ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7]) ++ vshl.i64 q11, q4, #16 ++ vmov.32 r2, d8[1] ++ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q10, q2 ++ vmov.32 r12, d9[1] ++ vshr.s64 q4, q11, #48 ++ vcge.s16 q10, q0, q7 ++ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vorr q4, q8, q4 ++ and r2, r2, r12 ++ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15]) ++ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and r0, r0, r2 ++ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ tst r0, #1 ++ bne 1f @ none of the 16 pixel pairs should be updated in this case ++ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ vqmovun.s16 d4, q14 ++ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ vqmovun.s16 d0, q6 ++ vqmovun.s16 d5, q9 ++ vqmovun.s16 d1, q1 ++ vst1.64 {q2}, [r3], r1 ++ vst1.64 {q0}, [r3] ++1: vpop {d8-d15} ++ bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ push {r4-r6,lr} ++ vpush {d8-d15} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d3}, [r3], r1 ++ add r4, r0, r1, lsl #2 ++ vld1.32 {d10}, [r3], r1 ++ vld1.32 {d11}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d8}, [r3], r1 ++ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]... ++ vld1.32 {d14}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]... ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d12}, [r3], r1 ++ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]... ++ vld1.32 {d13}, [r3], r1 ++ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vld1.32 {d1}, [r3], r1 ++ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]... ++ vld1.32 {d7}, [r3], r1 ++ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vld1.32 {d9}, [r3], r1 ++ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]... ++ vld1.32 {d15}, [r3] ++ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... ++ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]... ++ vdup.16 q9, r2 @ pq ++ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]... ++ vtrn.32 d2, d16 @ P1[0..7], P5[0..7] ++ vtrn.16 d5, d12 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]... ++ vtrn.16 d6, d13 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]... ++ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]... 
++ vtrn.32 d3, d4 @ P2[0..7], P6[0..7] ++ vshll.u8 q10, d2, #1 @ 2*P1[0..7] ++ vtrn.32 d10, d8 @ P3[0..7], P7[0..7] ++ vshll.u8 q11, d16, #1 @ 2*P5[0..7] ++ vtrn.32 d11, d14 @ P4[0..7], P8[0..7] ++ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]... ++ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]... ++ vmovl.u8 q1, d3 @ P2[0..7] ++ vmovl.u8 q12, d4 @ P6[0..7] ++ vtrn.32 d5, d1 @ P1[8..15], P5[8..15] ++ vtrn.32 d6, d7 @ P2[8..15], P6[8..15] ++ vtrn.32 d12, d9 @ P3[8..15], P7[8..15] ++ vtrn.32 d13, d15 @ P4[8..15], P8[8..15] ++ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vmovl.u8 q1, d10 @ P3[0..7] ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vshll.u8 q13, d1, #1 @ 2*P5[8..15] ++ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q14, d6 @ P2[8..15] ++ vmovl.u8 q3, d7 @ P6[8..15] ++ vmovl.u8 q15, d8 @ P7[0..7] ++ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q1, d12 @ P3[8..15] ++ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vmovl.u8 q4, d9 @ P7[8..15] ++ vshll.u8 q14, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q5, d11 @ P4[0..7] ++ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vshll.u8 q15, d12, #1 @ 2*P3[8..15] ++ vmovl.u8 q6, d13 @ P4[8..15] ++ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q1, d14 @ P8[0..7] ++ vmovl.u8 q7, d15 @ P8[8..15] ++ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q4, d16 @ P5[0..7] ++ vmovl.u8 q8, d1 @ P5[8..15] ++ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7] ++ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q10, q10, #3 ++ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15] ++ vrshr.s16 q11, q11, #3 ++ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vabs.s16 q10, q10 @ a1[0..7] ++ vrshr.s16 q13, q13, #3 ++ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vabs.s16 q3, q11 @ a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q11, q1 ++ vabs.s16 q12, q13 @ a2[8..15] ++ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7] ++ vshr.s16 q1, q1, #8 @ clip_sign[0..7] ++ vrshr.s16 q15, q15, #3 ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vrshr.s16 q14, q14, #3 ++ vbsl q13, q3, q10 @ a3[0..7] ++ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8.15] ++ vabs.s16 q10, q15 @ a0[8..15] ++ vshr.s16 q15, q15, #8 @ a0_sign[8..15] ++ vbsl q3, q12, q2 @ a3[8..15] ++ vabs.s16 q2, q14 @ a0[0..7] ++ vabs.s16 q12, q7 ++ vshr.s16 q7, q7, #8 @ clip_sign[8..15] ++ vshr.s16 q14, q14, #8 @ a0_sign[0..7] ++ vshr.s16 q12, q12, #1 @ clip[8..15] ++ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15] ++ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? 
a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15] ++ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq ++ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq ++ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7] ++ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7] ++ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0 ++ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0 ++ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vcge.s16 q14, q13, q12 ++ vmov.32 r2, d4[1] @ move to gp reg ++ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vmov.32 r3, d5[1] ++ vcge.s16 q2, q0, q11 ++ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15]) ++ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7]) ++ vmov.32 r5, d6[1] ++ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmov.32 r6, d7[1] ++ and r12, r2, r3 ++ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ and r14, r5, r6 ++ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ and r12, r12, r14 ++ vqmovun.s16 d4, q6 ++ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ tst r12, #1 ++ bne 4f @ none of the 16 pixel pairs should be updated in this case ++ vqmovun.s16 d2, q5 ++ vqmovun.s16 d3, q4 ++ vqmovun.s16 d5, q8 ++ tst r2, #1 ++ bne 1f ++ vst2.8 {d2[0], d3[0]}, [r0], r1 ++ vst2.8 {d2[1], d3[1]}, [r0], r1 ++ vst2.8 {d2[2], d3[2]}, [r0], r1 ++ vst2.8 {d2[3], d3[3]}, [r0] ++1: add r0, r4, r1, lsl #2 ++ tst r3, #1 ++ bne 2f ++ vst2.8 {d2[4], d3[4]}, [r4], r1 ++ vst2.8 {d2[5], d3[5]}, [r4], r1 ++ vst2.8 {d2[6], d3[6]}, [r4], r1 ++ vst2.8 {d2[7], d3[7]}, [r4] ++2: add r4, r0, r1, lsl #2 ++ tst r5, #1 ++ bne 3f ++ vst2.8 {d4[0], d5[0]}, [r0], r1 ++ vst2.8 {d4[1], d5[1]}, [r0], r1 ++ vst2.8 {d4[2], d5[2]}, [r0], r1 ++ vst2.8 {d4[3], d5[3]}, [r0] ++3: tst r6, #1 ++ bne 4f ++ vst2.8 {d4[4], d5[4]}, [r4], r1 ++ vst2.8 {d4[5], d5[5]}, [r4], r1 ++ vst2.8 {d4[6], d5[6]}, [r4], r1 ++ vst2.8 {d4[7], d5[7]}, [r4] ++4: vpop {d8-d15} ++ pop {r4-r6,pc} ++endfunc ++ ++@ Copy at most the specified number of bytes from source to destination buffer, ++@ 
stopping at a multiple of 16 bytes, none of which are the start of an escape sequence ++@ On entry: ++@ r0 -> source buffer ++@ r1 = max number of bytes to copy ++@ r2 -> destination buffer, optimally 8-byte aligned ++@ On exit: ++@ r0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ @ Offset by 48 to screen out cases that are too short for us to handle, ++ @ and also make it easy to test for loop termination, or to determine ++ @ whether we need an odd number of half-iterations of the loop. ++ subs r1, r1, #48 ++ bmi 90f ++ ++ @ Set up useful constants ++ vmov.i32 q0, #0x3000000 ++ vmov.i32 q1, #0x30000 ++ ++ tst r1, #16 ++ bne 1f ++ ++ vld1.8 {q8, q9}, [r0]! ++ vbic q12, q8, q0 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ add r1, r1, #16 ++ b 3f ++ ++1: vld1.8 {q10, q11}, [r0]! ++ vbic q12, q10, q0 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ @ Drop through... ++2: vmov q8, q11 ++ vld1.8 {q9}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q8, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 90f ++ vst1.64 {q10}, [r2]! ++3: vmov q10, q9 ++ vld1.8 {q11}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q10, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 91f ++ vst1.64 {q8}, [r2]! 
++ subs r1, r1, #32 ++ bpl 2b ++ ++90: add r0, r1, #48 ++ bx lr ++ ++91: sub r1, r1, #16 ++ b 90b ++endfunc diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 8a71c04230..53644506e5 100644 --- a/libavcodec/avcodec.h @@ -15325,6 +18190,31 @@ index 8a71c04230..53644506e5 100644 } AVHWAccel; /** +diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c +index c7efe7e77b..46766244b8 100644 +--- a/libavcodec/blockdsp.c ++++ b/libavcodec/blockdsp.c +@@ -65,6 +65,8 @@ av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx) + c->fill_block_tab[0] = fill_block16_c; + c->fill_block_tab[1] = fill_block8_c; + ++ if (ARCH_AARCH64) ++ ff_blockdsp_init_aarch64(c); + if (ARCH_ALPHA) + ff_blockdsp_init_alpha(c); + if (ARCH_ARM) +diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h +index 26fc2ea13b..fe539491da 100644 +--- a/libavcodec/blockdsp.h ++++ b/libavcodec/blockdsp.h +@@ -41,6 +41,7 @@ typedef struct BlockDSPContext { + + void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx); + ++void ff_blockdsp_init_aarch64(BlockDSPContext *c); + void ff_blockdsp_init_alpha(BlockDSPContext *c); + void ff_blockdsp_init_arm(BlockDSPContext *c); + void ff_blockdsp_init_ppc(BlockDSPContext *c); diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h index 38d06b2842..bbf5d70560 100644 --- a/libavcodec/cabac.h @@ -46342,7 +49232,7 @@ index 0000000000..85c5b46d75 +}; + diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 4b2679eb38..6ee6ad8642 100644 +index 4b2679eb38..6ca83cc21b 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -21,6 +21,7 @@ @@ -47256,7 +50146,7 @@ index 4b2679eb38..6ee6ad8642 100644 if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.m.planes = avbuf->planes; avbuf->buf.length = avbuf->num_planes; -@@ -555,18 +911,47 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -555,20 +911,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) avbuf->buf.length = avbuf->planes[0].length; } @@ -47300,14 +50190,18 @@ index 4b2679eb38..6ee6ad8642 100644 + err, strerror(err)); + return AVERROR(err); + } -+ + ++ // Lock not wanted - if called from buffer free then lock already obtained + qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; + avbuf->status = V4L2BUF_IN_DRIVER; ++ pthread_cond_broadcast(&avbuf->context->cond); ++ + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); - avbuf->status = V4L2BUF_IN_DRIVER; - + return 0; + } diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index 8dbc7fc104..7d5fadcd3d 100644 --- a/libavcodec/v4l2_buffers.h @@ -47380,7 +50274,7 @@ index 8dbc7fc104..7d5fadcd3d 100644 /** * Enqueues a V4L2Buffer diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..d8a86e8261 100644 +index ff1ea8e57b..1aff16c1de 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -47625,7 +50519,7 @@ index ff1ea8e57b..d8a86e8261 100644 return 1; } -@@ -280,171 +291,274 @@ static int v4l2_stop_encode(V4L2Context *ctx) +@@ -280,171 +291,275 @@ static int v4l2_stop_encode(V4L2Context *ctx) return 0; } @@ -47639,6 +50533,7 @@ index ff1ea8e57b..d8a86e8261 100644 +// Returns: +// 0 Success +// AVERROR(EPIPE) Nothing more to read ++// AVERROR(ENOSPC) No buffers in Q to put result in +// * AVERROR(..) 
+ + static int @@ -47874,7 +50769,7 @@ index ff1ea8e57b..d8a86e8261 100644 + (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || + (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); -+ return AVERROR(EAGAIN); ++ return AVERROR(ENOSPC); + } - /* the driver is ready to accept more input; instead of waiting for the capture @@ -48022,7 +50917,7 @@ index ff1ea8e57b..d8a86e8261 100644 } return NULL; -@@ -452,25 +566,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) +@@ -452,25 +567,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) static int v4l2_release_buffers(V4L2Context* ctx) { @@ -48082,7 +50977,7 @@ index ff1ea8e57b..d8a86e8261 100644 } static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -499,6 +633,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm +@@ -499,6 +634,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { @@ -48091,7 +50986,7 @@ index ff1ea8e57b..d8a86e8261 100644 enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; -@@ -517,6 +653,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) +@@ -517,6 +654,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) if (ret) return AVERROR(EINVAL); @@ -48105,7 +51000,7 @@ index ff1ea8e57b..d8a86e8261 100644 pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ -@@ -569,18 +712,84 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) +@@ -569,18 +713,84 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) * *****************************************************************************/ @@ -48194,7 +51089,7 @@ index ff1ea8e57b..d8a86e8261 100644 } int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) -@@ -608,7 +817,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) +@@ -608,7 +818,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) return ff_v4l2_buffer_enqueue(avbuf); } @@ -48204,7 +51099,7 @@ index ff1ea8e57b..d8a86e8261 100644 { V4L2m2mContext *s = ctx_to_m2mctx(ctx); V4L2Buffer* avbuf; -@@ -616,8 +826,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -616,8 +827,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!pkt->size) { ret = v4l2_stop_decode(ctx); @@ -48215,7 +51110,7 @@ index ff1ea8e57b..d8a86e8261 100644 s->draining = 1; return 0; } -@@ -626,8 +837,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -626,8 +838,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!avbuf) return AVERROR(EAGAIN); @@ -48229,7 +51124,7 @@ index ff1ea8e57b..d8a86e8261 100644 return ret; return ff_v4l2_buffer_enqueue(avbuf); -@@ -636,19 +850,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -636,19 +851,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { V4L2Buffer *avbuf; @@ -48252,7 +51147,7 @@ index ff1ea8e57b..d8a86e8261 100644 return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); 
} -@@ -656,19 +861,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) +@@ -656,19 +862,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) { V4L2Buffer *avbuf; @@ -48271,11 +51166,11 @@ index ff1ea8e57b..d8a86e8261 100644 - return AVERROR(EAGAIN); - } + if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) -+ return rv; ++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); } -@@ -702,78 +898,158 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +899,160 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { @@ -48322,6 +51217,7 @@ index ff1ea8e57b..d8a86e8261 100644 + av_buffer_unref(&ctx->frames_ref); + + ff_mutex_destroy(&ctx->lock); ++ pthread_cond_destroy(&ctx->cond); } -int ff_v4l2_context_init(V4L2Context* ctx) @@ -48424,6 +51320,7 @@ index ff1ea8e57b..d8a86e8261 100644 + } + + ff_mutex_init(&ctx->lock, NULL); ++ pthread_cond_init(&ctx->cond, NULL); + atomic_init(&ctx->q_count, 0); + + if (s->output_drm) { @@ -48466,7 +51363,7 @@ index ff1ea8e57b..d8a86e8261 100644 return ret; } diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 22a9532444..35e83c66d9 100644 +index 22a9532444..a56216e990 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -31,6 +31,7 @@ @@ -48498,7 +51395,7 @@ index 22a9532444..35e83c66d9 100644 /** * Readonly after init. -@@ -92,6 +100,20 @@ typedef struct V4L2Context { +@@ -92,6 +100,21 @@ typedef struct V4L2Context { */ int done; @@ -48516,18 +51413,22 @@ index 22a9532444..35e83c66d9 100644 + struct ff_weak_link_master *wl_master; + + AVMutex lock; ++ pthread_cond_t cond; } V4L2Context; /** -@@ -156,6 +178,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); +@@ -156,7 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) + * * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. ++ * AVERROR(ENOSPC) if no buffer availible to put ++ * the frame in */ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); -@@ -170,7 +193,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + +@@ -170,7 +196,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. 
*/ @@ -48614,7 +51515,7 @@ index cdfd579810..010b4232d4 100644 return 0; diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index b67b216331..f1923bb26d 100644 +index b67b216331..9a20447030 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -30,6 +30,7 @@ @@ -48673,7 +51574,7 @@ index b67b216331..f1923bb26d 100644 /* null frame/packet received */ int draining; -@@ -66,6 +97,27 @@ typedef struct V4L2m2mContext { +@@ -66,6 +97,29 @@ typedef struct V4L2m2mContext { /* reference back to V4L2m2mPriv */ void *priv; @@ -48685,6 +51586,8 @@ index b67b216331..f1923bb26d 100644 + + /* Frame tracking */ + xlat_track_t xlat; ++ int pending_hw; ++ int pending_n; + + pts_stats_t pts_stat; + @@ -48701,7 +51604,7 @@ index b67b216331..f1923bb26d 100644 } V4L2m2mContext; typedef struct V4L2m2mPriv { -@@ -76,6 +128,7 @@ typedef struct V4L2m2mPriv { +@@ -76,6 +130,7 @@ typedef struct V4L2m2mPriv { int num_output_buffers; int num_capture_buffers; @@ -48709,7 +51612,7 @@ index b67b216331..f1923bb26d 100644 } V4L2m2mPriv; /** -@@ -129,4 +182,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); +@@ -129,4 +184,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); */ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); @@ -48737,7 +51640,7 @@ index b67b216331..f1923bb26d 100644 + #endif /* AVCODEC_V4L2_M2M_H */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index ab07c0a24a..3dd462362c 100644 +index ab07c0a24a..3bd4ff64cc 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -23,6 +23,10 @@ @@ -48907,7 +51810,7 @@ index ab07c0a24a..3dd462362c 100644 return 0; } -@@ -133,58 +164,461 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) +@@ -133,58 +164,514 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } @@ -49004,7 +51907,8 @@ index ab07c0a24a..3dd462362c 100644 +#endif + frame->best_effort_timestamp = pts_stats_guess(ps); + frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); + return 0; +} + @@ -49197,6 +52101,36 @@ index ab07c0a24a..3dd462362c 100644 return ret; } ++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) ++{ ++ int rv = 0; ++ ++ ff_mutex_lock(&ctx->lock); ++ ++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { ++ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) { ++ rv = AVERROR(errno); ++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); ++ break; ++ } ++ } ++ ++ ff_mutex_unlock(&ctx->lock); ++ return rv; ++} ++ ++// Number of frames over what xlat_pending returns that we keep *16 ++// This is a min value - if it appears to be too small the threshold should ++// adjust dynamically. 
++#define PENDING_HW_MIN (3 * 16) ++// Offset to use when setting dynamically ++// Set to %16 == 15 to avoid the threshold changing immediately as we relax ++#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) ++// Number of consecutive times we've failed to get a frame when we prefer it ++// before we increase the prefer threshold (5ms * N = max expected decode ++// time) ++#define PENDING_N_THRESHOLD 6 ++ +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; @@ -49206,7 +52140,7 @@ index ab07c0a24a..3dd462362c 100644 + + do { + const int pending = xlat_pending(&s->xlat); -+ const int prefer_dq = (pending > 5); ++ const int prefer_dq = (pending > s->pending_hw / 16); + + // Enqueue another pkt for decode if + // (a) We don't have a lot of stuff in the buffer already OR @@ -49240,6 +52174,27 @@ index ab07c0a24a..3dd462362c 100644 + // there is room in the input Q and timeout == -1 + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + ++ // Failure due to no buffer in Q? ++ if (dst_rv == AVERROR(ENOSPC)) { ++ // Wait & retry ++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ } ++ } ++ ++ // Adjust dynamic pending threshold ++ if (dst_rv == 0) { ++ if (--s->pending_hw < PENDING_HW_MIN) ++ s->pending_hw = PENDING_HW_MIN; ++ s->pending_n = 0; ++ } ++ else if (dst_rv == AVERROR(EAGAIN)) { ++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { ++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; ++ s->pending_n = 0; ++ } ++ } ++ + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { + av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); + dst_rv = AVERROR_EOF; @@ -49388,11 +52343,12 @@ index ab07c0a24a..3dd462362c 100644 + xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); ++ s->pending_hw = PENDING_HW_MIN; + capture = &s->capture; output = &s->output; -@@ -192,14 +626,51 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -192,14 +679,51 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * by the v4l2 driver; this event will trigger a full pipeline reconfig and * the proper values will be retrieved from the kernel driver. 
*/ @@ -49446,7 +52402,7 @@ index ab07c0a24a..3dd462362c 100644 s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); -@@ -208,12 +679,68 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -208,12 +732,68 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } @@ -49517,7 +52473,7 @@ index ab07c0a24a..3dd462362c 100644 } #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -222,10 +749,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) +@@ -222,10 +802,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", @@ -49535,7 +52491,7 @@ index ab07c0a24a..3dd462362c 100644 #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -246,9 +779,15 @@ static const AVOption options[] = { +@@ -246,9 +832,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ @@ -54292,6 +57248,114 @@ index 0000000000..bee4c50fac +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); + +#endif +diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c +index ea93e11588..a9e0c6323e 100644 +--- a/libavcodec/vc1dec.c ++++ b/libavcodec/vc1dec.c +@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) + size = next - start - 4; + if (size <= 0) + continue; +- buf2_size = vc1_unescape_buffer(start + 4, size, buf2); ++ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&gb, buf2, buf2_size * 8); + switch (AV_RB32(start)) { + case VC1_CODE_SEQHDR: +@@ -678,7 +678,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + case VC1_CODE_FRAME: + if (avctx->hwaccel) + buf_start = start; +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + break; + case VC1_CODE_FIELD: { + int buf_size3; +@@ -695,8 +695,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = avctx->coded_height + 31 >> 5; +@@ -707,7 +707,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + break; + } + case VC1_CODE_ENTRYPOINT: /* it should be before frame data */ +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&s->gb, buf2, buf_size2 * 8); + ff_vc1_decode_entry_point(avctx, v, &s->gb); + break; +@@ -724,8 +724,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9); +@@ -759,7 +759,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); ++ buf_size3 = 
v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = s->mb_height + 1 >> 1; +@@ -768,9 +768,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + n_slices1 = n_slices - 1; + n_slices++; + } +- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2); + } else { +- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2); + } + init_get_bits(&s->gb, buf2, buf_size2*8); + } else +diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c +index c25a6f3adf..10182786b3 100644 +--- a/libavcodec/vc1dsp.c ++++ b/libavcodec/vc1dsp.c +@@ -32,6 +32,7 @@ + #include "rnd_avg.h" + #include "vc1dsp.h" + #include "startcode.h" ++#include "vc1_common.h" + + /* Apply overlap transform to horizontal edge */ + static void vc1_v_overlap_c(uint8_t *src, int stride) +@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp) + #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ + + dsp->startcode_find_candidate = ff_startcode_find_candidate_c; ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer; + + if (ARCH_AARCH64) + ff_vc1dsp_init_aarch64(dsp); +diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h +index 75db62b1b4..e192b431be 100644 +--- a/libavcodec/vc1dsp.h ++++ b/libavcodec/vc1dsp.h +@@ -80,6 +80,9 @@ typedef struct VC1DSPContext { + * one or more further zero bytes and a one byte. + */ + int (*startcode_find_candidate)(const uint8_t *buf, int size); ++ ++ /* Copy a buffer, removing startcode emulation escape bytes as we go */ ++ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst); + } VC1DSPContext; + + void ff_vc1dsp_init(VC1DSPContext* c); diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c new file mode 100644 index 0000000000..f234a985b9