From 34bc440c43f261effaabfefb6eef46a2766426a9 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Fri, 11 Mar 2022 17:11:52 +0100 Subject: [PATCH 1/2] ffmpeg: update rpi patch Patch created using revisions dc91b91..34fb1cd from branch dev/4.4/rpi_import_1 of https://github.com/jc-kynesim/rpi-ffmpeg --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 166 ++++++++++-------- 1 file changed, 88 insertions(+), 78 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index e5d29c0507..1ccf22ba72 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -47380,7 +47380,7 @@ index 8dbc7fc104..7d5fadcd3d 100644 /** * Enqueues a V4L2Buffer diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..b2c40636a2 100644 +index ff1ea8e57b..d8a86e8261 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -47625,7 +47625,7 @@ index ff1ea8e57b..b2c40636a2 100644 return 1; } -@@ -280,171 +291,267 @@ static int v4l2_stop_encode(V4L2Context *ctx) +@@ -280,171 +291,274 @@ static int v4l2_stop_encode(V4L2Context *ctx) return 0; } @@ -47698,6 +47698,7 @@ index ff1ea8e57b..b2c40636a2 100644 + + while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) { + const int err = errno; ++ av_assert0(AVERROR(err) < 0); + if (err != EINTR) { + av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", + ctx->name, av_err2str(AVERROR(err))); @@ -47811,6 +47812,12 @@ index ff1ea8e57b..b2c40636a2 100644 - /* if re-init failed, abort */ - ctx->done = 1; - return NULL; +- } +- if (ret) { +- /* if re-init was successful drop the buffer (if there was one) +- * since we had to reconfigure capture (unmap all buffers) +- */ +- return NULL; + if (evt.type == V4L2_EVENT_SOURCE_CHANGE) + return do_source_change(m); + @@ -47821,6 +47828,7 @@ index ff1ea8e57b..b2c40636a2 100644 +// Get a buffer +// If output then just gets the buffer in the expected way +// If capture then runs the capture state m/c to deal with res change etc. ++// If return value == 0 then *ppavbuf != NULL + +static int +get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout) @@ -47849,52 +47857,40 @@ index ff1ea8e57b..b2c40636a2 100644 + av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name); + return AVERROR_EOF; } -- if (ret) { -- /* if re-init was successful drop the buffer (if there was one) -- * since we had to reconfigure capture (unmap all buffers) -- */ -- return NULL; -+ +- } + +- /* 2. dequeue the buffer */ +- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { + // If capture && timeout == -1 then also wait for rx buffer free + if (is_cap && timeout == -1 && m->output.streamon && !m->draining) + pfd.events |= poll_out; -+ + +- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { +- /* there is a capture buffer ready */ +- if (pfd.revents & (POLLIN | POLLRDNORM)) +- goto dequeue; + // If nothing Qed all we will get is POLLERR - avoid that + if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) || + (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || + (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); + return AVERROR(EAGAIN); - } -- } - -- /* 2. 
dequeue the buffer */ -- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) { -+ // Timeout kludged s.t. "forever" eventually gives up & produces logging -+ // If waiting for an event when we have seen a last_frame then we expect -+ // it to be ready already so force a short timeout -+ ret = poll(&pfd, 1, -+ ff_v4l2_ctx_eos(ctx) ? 10 : -+ timeout == -1 ? 3000 : timeout); - -- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) { -- /* there is a capture buffer ready */ -- if (pfd.revents & (POLLIN | POLLRDNORM)) -- goto dequeue; -+ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", -+ ctx->name, ret, timeout, pfd.events, pfd.revents); ++ } - /* the driver is ready to accept more input; instead of waiting for the capture - * buffer to complete we return NULL so input can proceed (we are single threaded) - */ - if (pfd.revents & (POLLOUT | POLLWRNORM)) - return NULL; ++ // Timeout kludged s.t. "forever" eventually gives up & produces logging ++ // If waiting for an event when we have seen a last_frame then we expect ++ // it to be ready already so force a short timeout ++ ret = poll(&pfd, 1, ++ ff_v4l2_ctx_eos(ctx) ? 10 : ++ timeout == -1 ? 3000 : timeout); + if (ret < 0) { -+ const int err = errno; -+ if (err == EINTR) -+ continue; -+ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, err, strerror(err)); -+ return AVERROR(err); ++ ret = AVERROR(errno); // Remember errno before logging etc. ++ av_assert0(ret < 0); } -dequeue: @@ -47905,6 +47901,23 @@ index ff1ea8e57b..b2c40636a2 100644 - memset(planes, 0, sizeof(planes)); - buf.length = VIDEO_MAX_PLANES; - buf.m.planes = planes; ++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n", ++ ctx->name, ret, timeout, pfd.events, pfd.revents); ++ ++ if (ret < 0) { ++ if (ret == AVERROR(EINTR)) ++ continue; ++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret)); ++ return ret; + } + +- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); +- if (ret) { +- if (errno != EAGAIN) { +- ctx->done = 1; +- if (errno != EPIPE) +- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", +- ctx->name, av_err2str(AVERROR(errno))); + if (ret == 0) { + if (timeout == -1) + av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events); @@ -47915,19 +47928,11 @@ index ff1ea8e57b..b2c40636a2 100644 + ctx->done = 1; + return ret; + } -+ } -+ return AVERROR(EAGAIN); - } - -- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf); -- if (ret) { -- if (errno != EAGAIN) { -- ctx->done = 1; -- if (errno != EPIPE) -- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", -- ctx->name, av_err2str(AVERROR(errno))); -- } + } - return NULL; ++ return AVERROR(EAGAIN); ++ } ++ + if ((pfd.revents & POLLERR) != 0) { + av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name); + return AVERROR_UNKNOWN; @@ -47949,6 +47954,13 @@ index ff1ea8e57b..b2c40636a2 100644 - ctx->done = 1; -#endif + continue; ++ } ++ ++ if ((pfd.revents & poll_cap) != 0) { ++ ret = dq_buf(ctx, ppavbuf); ++ if (ret == AVERROR(EPIPE)) ++ continue; ++ return ret; } - avbuf = &ctx->buffers[buf.index]; @@ -47957,18 +47969,13 @@ index ff1ea8e57b..b2c40636a2 100644 - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - memcpy(avbuf->planes, planes, sizeof(planes)); - avbuf->buf.m.planes = avbuf->planes; -+ if ((pfd.revents & poll_cap) != 0) { -+ ret = dq_buf(ctx, ppavbuf); -+ if (ret == AVERROR(EPIPE)) -+ 
continue; -+ return ret; ++ if ((pfd.revents & poll_out) != 0) { ++ if (is_cap) ++ return AVERROR(EAGAIN); ++ return dq_buf(ctx, ppavbuf); } - return avbuf; + -+ if ((pfd.revents & poll_out) != 0) { -+ return is_cap ? 0 : dq_buf(ctx, ppavbuf); -+ } -+ + av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents); + return AVERROR_UNKNOWN; } @@ -48015,7 +48022,7 @@ index ff1ea8e57b..b2c40636a2 100644 } return NULL; -@@ -452,25 +559,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) +@@ -452,25 +566,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) static int v4l2_release_buffers(V4L2Context* ctx) { @@ -48045,18 +48052,18 @@ index ff1ea8e57b..b2c40636a2 100644 + .type = ctx->type, + .count = 0, /* 0 -> unmap all buffers from the driver */ + }; ++ ++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { ++ if (errno == EINTR) ++ continue; ++ ++ ret = AVERROR(errno); - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) - av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); -+ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) { -+ if (errno == EINTR) -+ continue; -+ -+ ret = AVERROR(errno); -+ + av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", + ctx->name, av_err2str(AVERROR(errno))); + @@ -48075,7 +48082,7 @@ index ff1ea8e57b..b2c40636a2 100644 } static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -499,6 +626,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm +@@ -499,6 +633,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { @@ -48084,7 +48091,7 @@ index ff1ea8e57b..b2c40636a2 100644 enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; -@@ -517,6 +646,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) +@@ -517,6 +653,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) if (ret) return AVERROR(EINVAL); @@ -48098,7 +48105,7 @@ index ff1ea8e57b..b2c40636a2 100644 pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ -@@ -569,18 +705,84 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) +@@ -569,18 +712,84 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) * *****************************************************************************/ @@ -48170,24 +48177,24 @@ index ff1ea8e57b..b2c40636a2 100644 + { + if (cmd == VIDIOC_STREAMOFF) + flush_all_buffers_status(ctx); - -- ctx->streamon = (cmd == VIDIOC_STREAMON); ++ + ctx->streamon = (cmd == VIDIOC_STREAMON); + av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, + cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF"); + } -- return 0; +- ctx->streamon = (cmd == VIDIOC_STREAMON); + // Both stream off & on effectively clear flag_last + ctx->flag_last = 0; -+ + +- return 0; + ff_mutex_unlock(&ctx->lock); + + return ret; } int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) -@@ -608,7 +810,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) +@@ -608,7 +817,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) return ff_v4l2_buffer_enqueue(avbuf); } @@ -48197,7 +48204,7 @@ index ff1ea8e57b..b2c40636a2 100644 { V4L2m2mContext *s = ctx_to_m2mctx(ctx); V4L2Buffer* avbuf; -@@ -616,8 +819,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -616,8 +826,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!pkt->size) { ret = v4l2_stop_decode(ctx); @@ -48208,7 +48215,7 @@ index ff1ea8e57b..b2c40636a2 100644 s->draining = 1; return 0; } -@@ -626,8 +830,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -626,8 +837,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!avbuf) return AVERROR(EAGAIN); @@ -48222,7 +48229,7 @@ index ff1ea8e57b..b2c40636a2 100644 return ret; return ff_v4l2_buffer_enqueue(avbuf); -@@ -636,19 +843,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -636,19 +850,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { V4L2Buffer *avbuf; @@ -48245,7 +48252,7 @@ index ff1ea8e57b..b2c40636a2 100644 return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); } -@@ -656,19 +854,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) +@@ -656,19 +861,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) { V4L2Buffer *avbuf; @@ -48268,7 +48275,7 @@ index ff1ea8e57b..b2c40636a2 100644 return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); } -@@ -702,78 +891,158 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +898,158 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { @@ -61254,32 +61261,35 @@ index 0000000000..92bc13a3df + diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh new file mode 100755 -index 0000000000..98ab9d6de9 +index 0000000000..b3b2d5509d --- /dev/null +++ b/pi-util/clean_usr_libs.sh -@@ -0,0 +1,23 @@ +@@ -0,0 +1,26 @@ +set -e +U=/usr/lib/arm-linux-gnueabihf +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* +U=/usr/lib/arm-linux-gnueabihf/neon/vfp +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* +U=/usr/lib/aarch64-linux-gnu +rm -f $U/libavcodec.* +rm -f $U/libavdevice.* +rm -f $U/libavfilter.* +rm -f $U/libavformat.* -+rm -f $U/libavresample.* +rm -f $U/libavutil.* ++rm -f $U/libswresample.* ++rm -f $U/libswscale.* + diff --git a/pi-util/conf_arm64_native.sh b/pi-util/conf_arm64_native.sh new file mode 100644 From 74d19598a5b2197b17b90d0c9d6503521d37bd03 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 24 Mar 2022 18:17:58 +0100 
Subject: [PATCH 2/2] ffmpeg: update rpi patch Patch created using revisions dc91b91..5bab299 from branch dev/4.4/rpi_import_1 of https://github.com/jc-kynesim/rpi-ffmpeg --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 3134 ++++++++++++++++- 1 file changed, 3099 insertions(+), 35 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch index 1ccf22ba72..f88fe6d562 100644 --- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -460,6 +460,1990 @@ index 33a280cf69..be3b73e7c4 100644 +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h +$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h +endif +diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile +index 954461f81d..7078dc6089 100644 +--- a/libavcodec/aarch64/Makefile ++++ b/libavcodec/aarch64/Makefile +@@ -35,6 +35,8 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o + + # subsystems + NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o ++NEON-OBJS-$(CONFIG_BLOCKDSP) += aarch64/blockdsp_init_aarch64.o \ ++ aarch64/blockdsp_neon.o + NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o + NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o + NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o +@@ -44,10 +46,12 @@ NEON-OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_neon.o + NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \ + aarch64/hpeldsp_neon.o + NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o +-NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/simple_idct_neon.o ++NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \ ++ aarch64/simple_idct_neon.o + NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o + NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o + NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o ++NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o + NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o + + # decoders/encoders +diff --git a/libavcodec/aarch64/blockdsp_init_aarch64.c b/libavcodec/aarch64/blockdsp_init_aarch64.c +new file mode 100644 +index 0000000000..9f3280f007 +--- /dev/null ++++ b/libavcodec/aarch64/blockdsp_init_aarch64.c +@@ -0,0 +1,42 @@ ++/* ++ * AArch64 NEON optimised block operations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++ ++#include "libavutil/attributes.h" ++#include "libavutil/cpu.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/blockdsp.h" ++ ++void ff_clear_block_neon(int16_t *block); ++void ff_clear_blocks_neon(int16_t *blocks); ++ ++av_cold void ff_blockdsp_init_aarch64(BlockDSPContext *c) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) { ++ c->clear_block = ff_clear_block_neon; ++ c->clear_blocks = ff_clear_blocks_neon; ++ } ++} +diff --git a/libavcodec/aarch64/blockdsp_neon.S b/libavcodec/aarch64/blockdsp_neon.S +new file mode 100644 +index 0000000000..e4a4959ccc +--- /dev/null ++++ b/libavcodec/aarch64/blockdsp_neon.S +@@ -0,0 +1,43 @@ ++/* ++ * AArch64 NEON optimised block operations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++function ff_clear_block_neon, export=1 ++ movi v0.16b, #0 ++ movi v1.16b, #0 ++ st1 {v0.16b, v1.16b}, [x0], #32 ++ st1 {v0.16b, v1.16b}, [x0], #32 ++ st1 {v0.16b, v1.16b}, [x0], #32 ++ st1 {v0.16b, v1.16b}, [x0] ++ ret ++endfunc ++ ++function ff_clear_blocks_neon, export=1 ++ movi v0.16b, #0 ++ movi v1.16b, #0 ++ .rept 23 ++ st1 {v0.16b, v1.16b}, [x0], #32 ++ .endr ++ st1 {v0.16b, v1.16b}, [x0] ++ ret ++endfunc +diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c +index 742a3372e3..eec21aa5a2 100644 +--- a/libavcodec/aarch64/idctdsp_init_aarch64.c ++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c +@@ -27,19 +27,29 @@ + #include "libavcodec/idctdsp.h" + #include "idct.h" + ++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t); ++ + av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) + { + int cpu_flags = av_get_cpu_flags(); + +- if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) { +- if (avctx->idct_algo == FF_IDCT_AUTO || +- avctx->idct_algo == FF_IDCT_SIMPLEAUTO || +- avctx->idct_algo == FF_IDCT_SIMPLENEON) { +- c->idct_put = ff_simple_idct_put_neon; +- c->idct_add = ff_simple_idct_add_neon; +- c->idct = ff_simple_idct_neon; +- c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ if (have_neon(cpu_flags)) { ++ if (!avctx->lowres && !high_bit_depth) { ++ if (avctx->idct_algo == FF_IDCT_AUTO || ++ avctx->idct_algo == FF_IDCT_SIMPLEAUTO || ++ avctx->idct_algo == 
FF_IDCT_SIMPLENEON) { ++ c->idct_put = ff_simple_idct_put_neon; ++ c->idct_add = ff_simple_idct_add_neon; ++ c->idct = ff_simple_idct_neon; ++ c->perm_type = FF_IDCT_PERM_PARTTRANS; ++ } + } ++ ++ c->add_pixels_clamped = ff_add_pixels_clamped_neon; ++ c->put_pixels_clamped = ff_put_pixels_clamped_neon; ++ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; + } + } +diff --git a/libavcodec/aarch64/idctdsp_neon.S b/libavcodec/aarch64/idctdsp_neon.S +new file mode 100644 +index 0000000000..7f47611206 +--- /dev/null ++++ b/libavcodec/aarch64/idctdsp_neon.S +@@ -0,0 +1,130 @@ ++/* ++ * IDCT AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// Clamp 16-bit signed block coefficients to unsigned 8-bit ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit results ++// x2 = row stride for results, bytes ++function ff_put_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0] ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x1], x2 ++ sqxtun v0.8b, v5.8h ++ st1 {v1.8b}, [x1], x2 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x1], x2 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x1], x2 ++ st1 {v4.8b}, [x1], x2 ++ st1 {v0.8b}, [x1], x2 ++ st1 {v1.8b}, [x1], x2 ++ st1 {v2.8b}, [x1] ++ ret ++endfunc ++ ++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128) ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit results ++// x2 = row stride for results, bytes ++function ff_put_signed_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ movi v4.8b, #128 ++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] ++ sqxtn v0.8b, v0.8h ++ sqxtn v1.8b, v1.8h ++ sqxtn v2.8b, v2.8h ++ sqxtn v3.8b, v3.8h ++ sqxtn v5.8b, v16.8h ++ add v0.8b, v0.8b, v4.8b ++ sqxtn v6.8b, v17.8h ++ add v1.8b, v1.8b, v4.8b ++ sqxtn v7.8b, v18.8h ++ add v2.8b, v2.8b, v4.8b ++ sqxtn v16.8b, v19.8h ++ add v3.8b, v3.8b, v4.8b ++ st1 {v0.8b}, [x1], x2 ++ add v0.8b, v5.8b, v4.8b ++ st1 {v1.8b}, [x1], x2 ++ add v1.8b, v6.8b, v4.8b ++ st1 {v2.8b}, [x1], x2 ++ add v2.8b, v7.8b, v4.8b ++ st1 {v3.8b}, [x1], x2 ++ add v3.8b, v16.8b, v4.8b ++ st1 {v0.8b}, [x1], x2 ++ st1 {v1.8b}, [x1], x2 ++ st1 {v2.8b}, [x1], x2 ++ st1 {v3.8b}, [x1] ++ ret ++endfunc ++ ++// Add 16-bit signed block coefficients to unsigned 8-bit ++// On entry: ++// x0 -> array of 64x 16-bit coefficients ++// x1 -> 8-bit input and results ++// x2 = row stride for 8-bit input and results, bytes ++function ff_add_pixels_clamped_neon, export=1 ++ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 ++ mov x3, 
x1 ++ ld1 {v4.8b}, [x1], x2 ++ ld1 {v5.8b}, [x1], x2 ++ ld1 {v6.8b}, [x1], x2 ++ ld1 {v7.8b}, [x1], x2 ++ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0] ++ uaddw v0.8h, v0.8h, v4.8b ++ uaddw v1.8h, v1.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ ld1 {v4.8b}, [x1], x2 ++ uaddw v3.8h, v3.8h, v7.8b ++ ld1 {v5.8b}, [x1], x2 ++ sqxtun v0.8b, v0.8h ++ ld1 {v6.8b}, [x1], x2 ++ sqxtun v1.8b, v1.8h ++ ld1 {v7.8b}, [x1] ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ uaddw v4.8h, v16.8h, v4.8b ++ st1 {v0.8b}, [x3], x2 ++ uaddw v0.8h, v17.8h, v5.8b ++ st1 {v1.8b}, [x3], x2 ++ uaddw v1.8h, v18.8h, v6.8b ++ st1 {v2.8b}, [x3], x2 ++ uaddw v2.8h, v19.8h, v7.8b ++ sqxtun v4.8b, v4.8h ++ sqxtun v0.8b, v0.8h ++ st1 {v3.8b}, [x3], x2 ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ st1 {v4.8b}, [x3], x2 ++ st1 {v0.8b}, [x3], x2 ++ st1 {v1.8b}, [x3], x2 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc +diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c +index 13dfd74940..161d5a972b 100644 +--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c ++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c +@@ -21,10 +21,28 @@ + #include "libavutil/attributes.h" + #include "libavutil/cpu.h" + #include "libavutil/aarch64/cpu.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + + #include "config.h" + ++void ff_vc1_inv_trans_8x8_neon(int16_t *block); ++void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++ ++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); ++ ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); ++ + void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, +@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. 
*/ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp) + { + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { ++ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon; ++ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon; ++ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon; ++ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon; ++ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon; ++ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; ++ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon; ++ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; ++ ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } + } +diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S +new file mode 100644 +index 0000000000..529c21d285 +--- /dev/null ++++ b/libavcodec/aarch64/vc1dsp_neon.S +@@ -0,0 +1,1552 @@ ++/* ++ * VC1 AArch64 NEON optimisations ++ * ++ * Copyright (c) 2022 Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/aarch64/asm.S" ++ ++// VC-1 8x8 inverse transform ++// On entry: ++// x0 -> array of 16-bit inverse transform coefficients, in column-major order ++// On exit: ++// array at x0 updated to hold transformed block; also now held in row-major order ++function ff_vc1_inv_trans_8x8_neon, export=1 ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ ld1 {v3.16b, v4.16b}, [x0], #32 ++ ld1 {v5.16b, v6.16b}, [x0], #32 ++ shl v1.8h, v1.8h, #2 // 8/2 * src[0] ++ sub x1, x0, #3*32 ++ ld1 {v16.16b, v17.16b}, [x0] ++ shl v7.8h, v2.8h, #4 // 16 * src[8] ++ shl v18.8h, v2.8h, #2 // 4 * src[8] ++ shl v19.8h, v4.8h, #4 // 16 * src[24] ++ ldr d0, .Lcoeffs_it8 ++ shl v5.8h, v5.8h, #2 // 8/2 * src[32] ++ shl v20.8h, v6.8h, #4 // 16 * src[40] ++ shl v21.8h, v6.8h, #2 // 4 * src[40] ++ shl v22.8h, v17.8h, #4 // 16 * src[56] ++ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40] ++ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16] ++ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40] ++ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56] ++ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56] ++ shl v3.8h, v3.8h, #3 // 16/2 * src[16] ++ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v1.8h, v1.8h, #1 // 12/2 * src[0] ++ ssra v5.8h, v5.8h, #1 // 12/2 * src[32] ++ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ shl v21.8h, v16.8h, #3 // 16/2 * src[48] ++ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2 ++ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2 ++ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2 ++ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2 ++ neg v3.8h, v7.8h // -t1 ++ neg v4.8h, v20.8h // +t2 ++ neg v6.8h, v19.8h // +t3 ++ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1 ++ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1 ++ neg v7.8h, v18.8h // +t4 ++ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1 ++ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1 ++ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1 ++ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1 ++ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1 ++ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1 ++ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3 ++ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3 ++ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3 ++ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t8 - t4 
+ 4) >> 3 ++ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3 ++ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3 ++ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3 ++ trn2 v17.8h, v3.8h, v4.8h ++ trn2 v18.8h, v5.8h, v6.8h ++ trn2 v19.8h, v2.8h, v1.8h ++ trn2 v20.8h, v7.8h, v16.8h ++ trn1 v21.4s, v17.4s, v18.4s ++ trn2 v17.4s, v17.4s, v18.4s ++ trn1 v18.4s, v19.4s, v20.4s ++ trn2 v19.4s, v19.4s, v20.4s ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.2d, v21.2d, v18.2d ++ trn1 v20.2d, v17.2d, v19.2d ++ trn1 v5.8h, v5.8h, v6.8h ++ trn1 v1.8h, v2.8h, v1.8h ++ trn1 v2.8h, v7.8h, v16.8h ++ trn1 v6.2d, v21.2d, v18.2d ++ trn2 v7.2d, v17.2d, v19.2d ++ shl v16.8h, v20.8h, #4 // 16 * src[24] ++ shl v17.8h, v4.8h, #4 // 16 * src[40] ++ trn1 v18.4s, v3.4s, v5.4s ++ trn1 v19.4s, v1.4s, v2.4s ++ shl v21.8h, v7.8h, #4 // 16 * src[56] ++ shl v22.8h, v6.8h, #2 // 4 * src[8] ++ shl v23.8h, v4.8h, #2 // 4 * src[40] ++ trn2 v3.4s, v3.4s, v5.4s ++ trn2 v1.4s, v1.4s, v2.4s ++ shl v2.8h, v6.8h, #4 // 16 * src[8] ++ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40] ++ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40] ++ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56] ++ trn1 v22.2d, v18.2d, v19.2d ++ trn2 v18.2d, v18.2d, v19.2d ++ trn1 v19.2d, v3.2d, v1.2d ++ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56] ++ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ shl v21.8h, v22.8h, #2 // 8/2 * src[0] ++ shl v18.8h, v18.8h, #2 // 8/2 * src[32] ++ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ shl v6.8h, v19.8h, #3 // 16/2 * src[16] ++ trn2 v1.2d, v3.2d, v1.2d ++ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ ssra v21.8h, v21.8h, #1 // 12/2 * src[0] ++ ssra v18.8h, v18.8h, #1 // 12/2 * src[32] ++ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16] ++ shl v19.8h, v1.8h, #3 // 16/2 * src[48] ++ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ neg v21.8h, v17.8h // +t2 ++ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v4.8h, v5.8h // +t3 ++ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2 ++ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2 ++ neg v24.8h, v16.8h // +t4 ++ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2 ++ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2 ++ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1 ++ neg v3.8h, v2.8h // -t1 ++ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1 ++ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1 ++ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1 ++ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1 ++ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1 ++ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1 ++ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1 ++ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7 ++ srshr v3.8h, 
v7.8h, #6 // (t6 + t2 + 64) >> 7 ++ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7 ++ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7 ++ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7 ++ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7 ++ st1 {v2.16b, v3.16b}, [x1], #32 ++ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7 ++ st1 {v4.16b, v5.16b}, [x1], #32 ++ st1 {v16.16b, v17.16b}, [x1], #32 ++ st1 {v0.16b, v1.16b}, [x1] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_neon, export=1 ++ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32 ++ mov x3, x0 ++ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ ld1 {v5.8b}, [x0], x1 ++ trn2 v6.4h, v1.4h, v3.4h ++ trn2 v7.4h, v2.4h, v4.4h ++ trn1 v1.4h, v1.4h, v3.4h ++ trn1 v2.4h, v2.4h, v4.4h ++ trn2 v3.4h, v16.4h, v18.4h ++ trn2 v4.4h, v17.4h, v19.4h ++ trn1 v16.4h, v16.4h, v18.4h ++ trn1 v17.4h, v17.4h, v19.4h ++ ld1 {v18.8b}, [x0], x1 ++ trn1 v19.2s, v6.2s, v3.2s ++ trn2 v3.2s, v6.2s, v3.2s ++ trn1 v6.2s, v7.2s, v4.2s ++ trn2 v4.2s, v7.2s, v4.2s ++ trn1 v7.2s, v1.2s, v16.2s ++ trn1 v20.2s, v2.2s, v17.2s ++ shl v21.4h, v19.4h, #4 // 16 * src[1] ++ trn2 v1.2s, v1.2s, v16.2s ++ shl v16.4h, v3.4h, #4 // 16 * src[3] ++ trn2 v2.2s, v2.2s, v17.2s ++ shl v17.4h, v6.4h, #4 // 16 * src[5] ++ ld1 {v22.8b}, [x0], x1 ++ shl v23.4h, v4.4h, #4 // 16 * src[7] ++ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2] ++ ld1 {v25.8b}, [x0] ++ shl v26.4h, v19.4h, #2 // 4 * src[1] ++ shl v27.4h, v6.4h, #2 // 4 * src[5] ++ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7] ++ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5] ++ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7] ++ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5] ++ shl v7.4h, v7.4h, #2 // 8/2 * src[0] ++ shl v20.4h, v20.4h, #2 // 8/2 * src[4] ++ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[2] ++ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5] ++ ssra v7.4h, v7.4h, #1 // 12/2 * src[0] ++ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5] ++ ssra v20.4h, v20.4h, #1 // 12/2 * src[4] ++ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7] ++ shl v3.4h, v2.4h, #3 // 16/2 * src[6] ++ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6] ++ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7] ++ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7] ++ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6] ++ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7] ++ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4] ++ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7] ++ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4] ++ neg v6.4h, v21.4h // -t1 ++ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ add v26.4h, v3.4h, 
v1.4h // t5/2 = t1/2 + t3/2 ++ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2 ++ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2 ++ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ neg v3.4h, v17.4h // +t2 ++ neg v4.4h, v16.4h // +t3 ++ neg v28.4h, v23.4h // +t4 ++ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1 ++ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1 ++ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1 ++ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1 ++ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1 ++ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1 ++ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1 ++ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1 ++ trn1 v1.2d, v7.2d, v1.2d ++ trn1 v2.2d, v20.2d, v2.2d ++ trn1 v3.2d, v24.2d, v27.2d ++ trn1 v4.2d, v19.2d, v26.2d ++ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3 ++ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3 ++ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3 ++ trn2 v6.8h, v1.8h, v2.8h ++ trn1 v1.8h, v1.8h, v2.8h ++ trn2 v2.8h, v3.8h, v4.8h ++ trn1 v3.8h, v3.8h, v4.8h ++ trn2 v4.4s, v6.4s, v2.4s ++ trn1 v7.4s, v1.4s, v3.4s ++ trn2 v1.4s, v1.4s, v3.4s ++ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24] ++ trn1 v2.4s, v6.4s, v2.4s ++ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24] ++ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0] ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16] ++ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16] ++ neg v2.8h, v3.8h // -t4/2 ++ neg v6.8h, v4.8h // -t3/2 ++ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1 ++ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1 ++ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7 ++ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7 ++ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7 ++ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v18.8b ++ uaddw v2.8h, v2.8h, v22.8b ++ uaddw v3.8h, v3.8h, v25.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_neon, export=1 ++ mov x3, #16 ++ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33 ++ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43 ++ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53 ++ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63 ++ ld1 {v4.d}[1], [x2] // 70 71 72 73 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ ld1 {v7.s}[0], [x0], x1 ++ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53 ++ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52 ++ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73 ++ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 
32 60 70 62 72 ++ ld1 {v4.s}[0], [x0], x1 ++ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73 ++ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70 ++ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71 ++ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3] ++ ld1 {v5.s}[1], [x0], x1 ++ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3] ++ ld1 {v6.s}[1], [x0], x1 ++ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72 ++ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0] ++ ld1 {v7.s}[1], [x0], x1 ++ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2] ++ ld1 {v4.s}[1], [x0] ++ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2] ++ neg v3.8h, v16.8h // -t3/2 ++ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1 ++ neg v18.8h, v17.8h // -t4/2 ++ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1 ++ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1 ++ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1 ++ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 64) >> 3 ++ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 64) >> 3 ++ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 64) >> 3 ++ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 64) >> 3 ++ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73 ++ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71 ++ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61 ++ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63 ++ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53 ++ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73 ++ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43 ++ mov d18, v3.d[1] // 50 51 52 53 ++ shl v19.4h, v3.4h, #4 // 16 * src[8] ++ mov d20, v16.d[1] // 70 71 72 73 ++ shl v21.4h, v16.4h, #4 // 16 * src[24] ++ mov d22, v17.d[1] // 40 41 42 43 ++ shl v23.4h, v3.4h, #2 // 4 * src[8] ++ shl v24.4h, v18.4h, #4 // 16 * src[40] ++ shl v25.4h, v20.4h, #4 // 16 * src[56] ++ shl v26.4h, v18.4h, #2 // 4 * src[40] ++ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63 ++ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40] ++ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56] ++ shl v17.4h, v17.4h, #2 // 8/2 * src[0] ++ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40] ++ shl v22.4h, v22.4h, #2 // 8/2 * src[32] ++ mov d23, v1.d[1] // 60 61 62 63 ++ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56] ++ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16] ++ shl v1.4h, v1.4h, #3 // 16/2 * src[16] ++ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40] ++ ssra v17.4h, v17.4h, #1 // 12/2 * src[0] ++ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40] ++ ssra v22.4h, v22.4h, #1 // 12/2 * src[32] ++ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56] ++ shl v3.4h, v23.4h, #3 // 16/2 * src[48] ++ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56] ++ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48] ++ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56] ++ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32] ++ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48] ++ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32] ++ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56] ++ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56] ++ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 
++ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56] ++ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ neg v23.4h, v24.4h // +t2 ++ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2 ++ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2 ++ neg v17.4h, v21.4h // +t3 ++ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2 ++ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2 ++ neg v16.4h, v19.4h // -t1 ++ neg v27.4h, v2.4h // +t4 ++ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1 ++ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1 ++ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1 ++ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1 ++ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1 ++ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1 ++ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1 ++ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1 ++ trn1 v0.2d, v20.2d, v0.2d ++ trn1 v2.2d, v18.2d, v22.2d ++ trn1 v3.2d, v25.2d, v3.2d ++ trn1 v1.2d, v26.2d, v1.2d ++ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7 ++ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7 ++ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v2.8h, v2.8h, v6.8b ++ uaddw v3.8h, v3.8h, v7.8b ++ uaddw v1.8h, v1.8h, v4.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v2.s}[0], [x4], x1 ++ st1 {v3.s}[0], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v2.s}[1], [x4], x1 ++ st1 {v3.s}[1], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients) ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_neon, export=1 ++ mov x3, #16 ++ ldr d0, .Lcoeffs_it4 ++ mov x4, x0 ++ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03 ++ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13 ++ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23 ++ ld1 {v4.d}[0], [x2] // 30 31 32 33 ++ ld1 {v5.s}[0], [x0], x1 ++ ld1 {v5.s}[1], [x0], x1 ++ ld1 {v6.s}[0], [x0], x1 ++ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13 ++ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12 ++ ld1 {v6.s}[1], [x0] ++ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33 ++ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32 ++ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33 ++ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30 ++ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31 ++ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32 ++ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3] ++ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2] ++ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3] ++ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3] ++ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2] ++ neg v7.4h, v3.4h // -t3/2 ++ neg v16.4h, v4.4h // -t4/2 ++ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1 ++ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1 ++ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 
64) >> 3 ++ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 64) >> 3 ++ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 64) >> 3 ++ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 64) >> 3 ++ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31 ++ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21 ++ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33 ++ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23 ++ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33 ++ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03 ++ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13 ++ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23 ++ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24] ++ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24] ++ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0] ++ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16] ++ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24] ++ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24] ++ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16] ++ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16] ++ neg v3.4h, v2.4h // -t4/2 ++ neg v7.4h, v4.4h // -t3/2 ++ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1 ++ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1 ++ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1 ++ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1 ++ trn1 v0.2d, v4.2d, v3.2d ++ trn1 v1.2d, v2.2d, v7.2d ++ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7 ++ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7 ++ uaddw v0.8h, v0.8h, v5.8b ++ uaddw v1.8h, v1.8h, v6.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x4], x1 ++ st1 {v0.s}[1], [x4], x1 ++ st1 {v1.s}[0], [x4], x1 ++ st1 {v1.s}[1], [x4] ++ ret ++endfunc ++ ++// VC-1 8x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x8_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v3.8b}, [x0], x1 ++ ld1 {v4.8b}, [x0], x1 ++ add w2, w2, #1 ++ ld1 {v5.8b}, [x0], x1 ++ asr w2, w2, #1 ++ ld1 {v6.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v7.8b}, [x0] ++ add w0, w2, #16 ++ asr w0, w0, #5 ++ dup v16.8h, w0 ++ uaddw v0.8h, v16.8h, v0.8b ++ uaddw v1.8h, v16.8h, v1.8b ++ uaddw v2.8h, v16.8h, v2.8b ++ uaddw v3.8h, v16.8h, v3.8b ++ uaddw v4.8h, v16.8h, v4.8b ++ uaddw v5.8h, v16.8h, v5.8b ++ sqxtun v0.8b, v0.8h ++ uaddw v6.8h, v16.8h, v6.8b ++ sqxtun v1.8b, v1.8h ++ uaddw v7.8h, v16.8h, v7.8b ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ sqxtun v4.8b, v4.8h ++ st1 {v0.8b}, [x3], x1 ++ sqxtun v0.8b, v5.8h ++ st1 {v1.8b}, [x3], x1 ++ sqxtun v1.8b, v6.8h ++ st1 {v2.8b}, [x3], x1 ++ sqxtun v2.8b, v7.8h ++ st1 {v3.8b}, [x3], x1 ++ st1 {v4.8b}, [x3], x1 ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 8x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_8x4_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.8b}, [x0], x1 ++ ld1 {v1.8b}, [x0], x1 ++ ld1 {v2.8b}, [x0], x1 ++ add w2, w2, w2, lsl #1 ++ ld1 {v3.8b}, [x0] ++ add w0, w2, #1 ++ asr w0, w0, #1 ++ add w0, w0, w0, lsl #4 ++ add w0, w0, #64 ++ asr 
w0, w0, #7 ++ dup v4.8h, w0 ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3], x1 ++ st1 {v2.8b}, [x3], x1 ++ st1 {v3.8b}, [x3] ++ ret ++endfunc ++ ++// VC-1 4x8 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x8_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.s}[0], [x0], x1 ++ ld1 {v1.s}[0], [x0], x1 ++ ld1 {v2.s}[0], [x0], x1 ++ add w2, w2, w2, lsl #4 ++ ld1 {v3.s}[0], [x0], x1 ++ add w2, w2, #4 ++ asr w2, w2, #3 ++ add w2, w2, w2, lsl #1 ++ ld1 {v0.s}[1], [x0], x1 ++ add w2, w2, #16 ++ asr w2, w2, #5 ++ dup v4.8h, w2 ++ ld1 {v1.s}[1], [x0], x1 ++ ld1 {v2.s}[1], [x0], x1 ++ ld1 {v3.s}[1], [x0] ++ uaddw v0.8h, v4.8h, v0.8b ++ uaddw v1.8h, v4.8h, v1.8b ++ uaddw v2.8h, v4.8h, v2.8b ++ uaddw v3.8h, v4.8h, v3.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ sqxtun v2.8b, v2.8h ++ sqxtun v3.8b, v3.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v2.s}[0], [x3], x1 ++ st1 {v3.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3], x1 ++ st1 {v2.s}[1], [x3], x1 ++ st1 {v3.s}[1], [x3] ++ ret ++endfunc ++ ++// VC-1 4x4 inverse transform, DC case ++// On entry: ++// x0 -> array of 8-bit samples, in row-major order ++// x1 = row stride for 8-bit sample array ++// x2 -> 16-bit inverse transform DC coefficient ++// On exit: ++// array at x0 updated by saturated addition of (narrowed) transformed block ++function ff_vc1_inv_trans_4x4_dc_neon, export=1 ++ ldrsh w2, [x2] ++ mov x3, x0 ++ ld1 {v0.s}[0], [x0], x1 ++ ld1 {v1.s}[0], [x0], x1 ++ ld1 {v0.s}[1], [x0], x1 ++ add w2, w2, w2, lsl #4 ++ ld1 {v1.s}[1], [x0] ++ add w0, w2, #4 ++ asr w0, w0, #3 ++ add w0, w0, w0, lsl #4 ++ add w0, w0, #64 ++ asr w0, w0, #7 ++ dup v2.8h, w0 ++ uaddw v0.8h, v2.8h, v0.8b ++ uaddw v1.8h, v2.8h, v1.8b ++ sqxtun v0.8b, v0.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3], x1 ++ st1 {v0.s}[1], [x3], x1 ++ st1 {v1.s}[1], [x3] ++ ret ++endfunc ++ ++.align 5 ++.Lcoeffs_it8: ++.quad 0x000F00090003 ++.Lcoeffs_it4: ++.quad 0x0011000B0005 ++.Lcoeffs: ++.quad 0x00050002 ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.s}[0], [x0], x1 // P5 ++ ld1 {v2.s}[0], [x3], x1 // P1 ++ ld1 {v3.s}[0], [x3], x1 // P2 ++ ld1 {v4.s}[0], [x0], x1 // P6 ++ ld1 {v5.s}[0], [x3], x1 // P3 ++ ld1 {v6.s}[0], [x0], x1 // P7 ++ ld1 {v7.s}[0], [x3] // P4 ++ ld1 {v16.s}[0], [x0] // P8 ++ ushll v17.8h, v1.8b, #1 // 2*P5 ++ dup v18.8h, w2 // pq ++ ushll v2.8h, v2.8b, #1 // 2*P1 ++ uxtl v3.8h, v3.8b // P2 ++ uxtl v4.8h, v4.8b // P6 ++ uxtl v19.8h, v5.8b // P3 ++ mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2 ++ uxtl v3.8h, v6.8b // P7 ++ mls v17.4h, v4.4h, v0.h[1] // 2*P5-5*P6 ++ ushll v5.8h, v5.8b, #1 // 2*P3 ++ uxtl v6.8h, v7.8b // P4 ++ mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v3.8h, v16.8b // P8 ++ 
mla v2.4h, v19.4h, v0.h[1] // 2*P1-5*P2+5*P3 ++ uxtl v1.8h, v1.8b // P5 ++ mls v5.4h, v6.4h, v0.h[1] // 2*P3-5*P4 ++ mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ sub v3.4h, v6.4h, v1.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ mla v5.4h, v1.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ mls v5.4h, v4.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ abs v4.4h, v3.4h ++ srshr v7.4h, v17.4h, #3 ++ srshr v2.4h, v2.4h, #3 ++ sshr v4.4h, v4.4h, #1 // clip ++ srshr v5.4h, v5.4h, #3 ++ abs v7.4h, v7.4h // a2 ++ sshr v3.4h, v3.4h, #8 // clip_sign ++ abs v2.4h, v2.4h // a1 ++ cmeq v16.4h, v4.4h, #0 // test clip == 0 ++ abs v17.4h, v5.4h // a0 ++ sshr v5.4h, v5.4h, #8 // a0_sign ++ cmhs v19.4h, v2.4h, v7.4h // test a1 >= a2 ++ cmhs v18.4h, v17.4h, v18.4h // test a0 >= pq ++ sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign ++ bsl v19.8b, v7.8b, v2.8b // a3 ++ orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v2.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v4.4h ++ tbnz w0, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v4.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v2.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v6.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.s}[0], [x3], x1 ++ st1 {v1.s}[0], [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3] ++ dup v5.8h, w2 // pq ++ trn1 v6.8b, v1.8b, v2.8b ++ trn2 v1.8b, v1.8b, v2.8b ++ trn1 v2.8b, v3.8b, v4.8b ++ trn2 v3.8b, v3.8b, v4.8b ++ trn1 v4.4h, v6.4h, v2.4h // P1, P5 ++ trn1 v7.4h, v1.4h, v3.4h // P2, P6 ++ trn2 v2.4h, v6.4h, v2.4h // P3, P7 ++ trn2 v1.4h, v1.4h, v3.4h // P4, P8 ++ ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5 ++ uxtl v6.8h, v7.8b // P2, P6 ++ uxtl v7.8h, v2.8b // P3, P7 ++ uxtl v1.8h, v1.8b // P4, P8 ++ mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6 ++ ushll v2.8h, v2.8b, #1 // 2*P3, 2*P7 ++ uxtl v4.8h, v4.8b // P1, P5 ++ mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ mov d6, v6.d[1] // P6 ++ mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ mov d4, v4.d[1] // P5 ++ mls v2.4h, v1.4h, v0.h[1] // 2*P3-5*P4 ++ mla v2.4h, v4.4h, v0.h[1] // 2*P3-5*P4+5*P5 ++ sub v7.4h, v1.4h, v4.4h // P4-P5 ++ mls v2.4h, v6.4h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ srshr v3.8h, v3.8h, #3 ++ abs v6.4h, v7.4h ++ sshr 
v7.4h, v7.4h, #8 // clip_sign ++ srshr v2.4h, v2.4h, #3 ++ abs v3.8h, v3.8h // a1, a2 ++ sshr v6.4h, v6.4h, #1 // clip ++ mov d16, v3.d[1] // a2 ++ abs v17.4h, v2.4h // a0 ++ cmeq v18.4h, v6.4h, #0 // test clip == 0 ++ sshr v2.4h, v2.4h, #8 // a0_sign ++ cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2 ++ cmhs v5.4h, v17.4h, v5.4h // test a0 >= pq ++ sub v2.4h, v7.4h, v2.4h // clip_sign - a0_sign ++ bsl v19.8b, v16.8b, v3.8b // a3 ++ orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq ++ uqsub v5.4h, v17.4h, v19.4h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.4h, v19.4h, v17.4h // test a3 >= a0 ++ mul v0.4h, v5.4h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.4h, v0.4h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ cmhs v5.4h, v0.4h, v6.4h ++ tbnz w2, #0, 1f // none of the 4 pixel pairs should be updated if this one is not filtered ++ bsl v5.8b, v6.8b, v0.8b // FFMIN(d, clip) ++ bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v4.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v1.4h, v0.4h, v2.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v3.8b, v4.8h ++ sqxtun v2.8b, v1.8h ++ st2 {v2.b, v3.b}[0], [x0], x1 ++ st2 {v2.b, v3.b}[1], [x0], x1 ++ st2 {v2.b, v3.b}[2], [x0], x1 ++ st2 {v2.b, v3.b}[3], [x0] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.8b}, [x3], x1 // P1 ++ ld1 {v4.8b}, [x3], x1 // P2 ++ ld1 {v5.8b}, [x0], x1 // P6 ++ ld1 {v6.8b}, [x3], x1 // P3 ++ ld1 {v7.8b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5 ++ ushll v3.8h, v3.8b, #1 // 2*P1 ++ ld1 {v17.8b}, [x3] // P4 ++ uxtl v4.8h, v4.8b // P2 ++ ld1 {v18.8b}, [x0] // P8 ++ uxtl v5.8h, v5.8b // P6 ++ dup v19.8h, w2 // pq ++ uxtl v20.8h, v6.8b // P3 ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v4.8h, v7.8b // P7 ++ ushll v6.8h, v6.8b, #1 // 2*P3 ++ mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v7.8h, v17.8b // P4 ++ uxtl v17.8h, v18.8b // P8 ++ mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v1.8h, v1.8b // P5 ++ mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v4.8h, v7.8h, v1.8h // P4-P5 ++ mls v6.8h, v7.8h, v0.h[1] // 2*P3-5*P4 ++ mls v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v17.8h, v4.8h ++ sshr v4.8h, v4.8h, #8 // clip_sign ++ mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v17.8h, v17.8h, #1 // clip ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v16.8h, v16.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v5.8h, v17.8h, #0 // test clip == 0 ++ srshr v3.8h, v3.8h, #3 ++ abs v16.8h, v16.8h // a2 ++ abs v3.8h, v3.8h // a1 ++ srshr v6.8h, v6.8h, #3 ++ cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2 ++ abs v20.8h, v6.8h // a0 ++ sshr v6.8h, v6.8h, #8 // a0_sign ++ bsl v18.16b, v16.16b, v3.16b 
// a3 ++ cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq ++ sub v4.8h, v4.8h, v6.8h // clip_sign - a0_sign ++ uqsub v6.8h, v20.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0 ++ orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v6.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ cmtst v2.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v2.16b, v3.16b, v2.16b ++ cmhs v3.8h, v0.8h, v17.8h ++ and w0, w0, w2 ++ bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip) ++ tbnz w0, #0, 1f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered ++ mls v7.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ mla v1.8h, v0.8h, v4.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v7.8h ++ sqxtun v1.8b, v1.8h ++ st1 {v0.8b}, [x3], x1 ++ st1 {v1.8b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #2 ++ ld1 {v3.8b}, [x3], x1 ++ ld1 {v4.8b}, [x3], x1 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3] ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ dup v4.8h, w2 // pq ++ trn1 v18.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v20.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... ++ trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn2 v17.4h, v18.4h, v20.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v5.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... 
++ trn1 v7.2s, v6.2s, v3.2s // P1 ++ trn1 v18.2s, v19.2s, v16.2s // P2 ++ trn2 v3.2s, v6.2s, v3.2s // P5 ++ trn2 v6.2s, v19.2s, v16.2s // P6 ++ trn1 v16.2s, v2.2s, v17.2s // P3 ++ trn2 v2.2s, v2.2s, v17.2s // P7 ++ ushll v7.8h, v7.8b, #1 // 2*P1 ++ trn1 v17.2s, v1.2s, v5.2s // P4 ++ ushll v19.8h, v3.8b, #1 // 2*P5 ++ trn2 v1.2s, v1.2s, v5.2s // P8 ++ uxtl v5.8h, v18.8b // P2 ++ uxtl v6.8h, v6.8b // P6 ++ uxtl v18.8h, v16.8b // P3 ++ mls v7.8h, v5.8h, v0.h[1] // 2*P1-5*P2 ++ uxtl v2.8h, v2.8b // P7 ++ ushll v5.8h, v16.8b, #1 // 2*P3 ++ mls v19.8h, v6.8h, v0.h[1] // 2*P5-5*P6 ++ uxtl v16.8h, v17.8b // P4 ++ uxtl v1.8h, v1.8b // P8 ++ mla v19.8h, v2.8h, v0.h[1] // 2*P5-5*P6+5*P7 ++ uxtl v2.8h, v3.8b // P5 ++ mla v7.8h, v18.8h, v0.h[1] // 2*P1-5*P2+5*P3 ++ sub v3.8h, v16.8h, v2.8h // P4-P5 ++ mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4 ++ mls v19.8h, v1.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8 ++ abs v1.8h, v3.8h ++ sshr v3.8h, v3.8h, #8 // clip_sign ++ mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4 ++ sshr v1.8h, v1.8h, #1 // clip ++ mla v5.8h, v2.8h, v0.h[1] // 2*P3-5*P4+5*P5 ++ srshr v17.8h, v19.8h, #3 ++ mls v5.8h, v6.8h, v0.h[0] // 2*P3-5*P4+5*P5-2*P6 ++ cmeq v6.8h, v1.8h, #0 // test clip == 0 ++ srshr v7.8h, v7.8h, #3 ++ abs v17.8h, v17.8h // a2 ++ abs v7.8h, v7.8h // a1 ++ srshr v5.8h, v5.8h, #3 ++ cmhs v18.8h, v7.8h, v17.8h // test a1 >= a2 ++ abs v19.8h, v5.8h // a0 ++ sshr v5.8h, v5.8h, #8 // a0_sign ++ bsl v18.16b, v17.16b, v7.16b // a3 ++ cmhs v4.8h, v19.8h, v4.8h // test a0 >= pq ++ sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign ++ uqsub v5.8h, v19.8h, v18.8h // a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v7.8h, v18.8h, v19.8h // test a3 >= a0 ++ orr v4.16b, v6.16b, v4.16b // test clip == 0 || a0 >= pq ++ mul v0.8h, v5.8h, v0.h[1] // a0 >= a3 ? 5*(a0-a3) : 0 ++ orr v5.16b, v4.16b, v7.16b // test clip == 0 || a0 >= pq || a3 >= a0 ++ mov w2, v5.s[1] // move to gp reg ++ ushr v0.8h, v0.8h, #3 // a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v1.8h ++ and w5, w2, w3 ++ bsl v5.16b, v1.16b, v0.16b // FFMIN(d, clip) ++ tbnz w5, #0, 2f // none of the 8 pixel pairs should be updated in this case ++ bic v0.16b, v5.16b, v4.16b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ sqxtun v1.8b, v2.8h ++ sqxtun v0.8b, v16.8h ++ tbnz w2, #0, 1f // none of the first 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f // none of the second 4 pixel pairs should be updated if so ++ st2 {v0.b, v1.b}[4], [x4], x1 ++ st2 {v0.b, v1.b}[5], [x4], x1 ++ st2 {v0.b, v1.b}[6], [x4], x1 ++ st2 {v0.b, v1.b}[7], [x4] ++2: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of lower block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ sub x3, x0, w1, sxtw #2 ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.16b}, [x0], x1 // P5 ++ movi v2.2d, #0x0000ffff00000000 ++ ld1 {v3.16b}, [x3], x1 // P1 ++ ld1 {v4.16b}, [x3], x1 // P2 ++ ld1 {v5.16b}, [x0], x1 // P6 ++ ld1 {v6.16b}, [x3], x1 // P3 ++ ld1 {v7.16b}, [x0], x1 // P7 ++ ushll v16.8h, v1.8b, #1 // 2*P5[0..7] ++ ushll v17.8h, v3.8b, #1 // 2*P1[0..7] ++ ld1 {v18.16b}, [x3] // P4 ++ uxtl v19.8h, v4.8b // P2[0..7] ++ ld1 {v20.16b}, [x0] // P8 ++ uxtl v21.8h, v5.8b // P6[0..7] ++ dup v22.8h, w2 // pq ++ ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15] ++ mls v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ ushll2 v19.8h, v1.16b, #1 // 2*P5[8..15] ++ uxtl2 v4.8h, v4.16b // P2[8..15] ++ mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ uxtl2 v5.8h, v5.16b // P6[8..15] ++ uxtl v23.8h, v6.8b // P3[0..7] ++ uxtl v24.8h, v7.8b // P7[0..7] ++ mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ ushll v4.8h, v6.8b, #1 // 2*P3[0..7] ++ uxtl v25.8h, v18.8b // P4[0..7] ++ mls v19.8h, v5.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl2 v26.8h, v6.16b // P3[8..15] ++ mla v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl2 v7.8h, v7.16b // P7[8..15] ++ ushll2 v6.8h, v6.16b, #1 // 2*P3[8..15] ++ mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl2 v18.8h, v18.16b // P4[8..15] ++ uxtl v23.8h, v20.8b // P8[0..7] ++ mls v4.8h, v25.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ uxtl v24.8h, v1.8b // P5[0..7] ++ uxtl2 v20.8h, v20.16b // P8[8..15] ++ mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl2 v1.8h, v1.16b // P5[8..15] ++ sub v26.8h, v25.8h, v24.8h // P4[0..7]-P5[0..7] ++ mla v19.8h, v7.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v7.8h, v18.8h, v1.8h // P4[8..15]-P5[8..15] ++ mls v6.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v27.8h, v26.8h ++ sshr v26.8h, v26.8h, #8 // clip_sign[0..7] ++ mls v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ abs v28.8h, v7.8h ++ sshr v27.8h, v27.8h, #1 // clip[0..7] ++ mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v7.8h, v7.8h, #8 // 
clip_sign[8..15] ++ sshr v23.8h, v28.8h, #1 // clip[8..15] ++ mla v4.8h, v24.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ cmeq v28.8h, v27.8h, #0 // test clip[0..7] == 0 ++ srshr v17.8h, v17.8h, #3 ++ mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v29.8h, v23.8h, #0 // test clip[8..15] == 0 ++ srshr v16.8h, v16.8h, #3 ++ mls v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ abs v17.8h, v17.8h // a1[0..7] ++ mla v6.8h, v1.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ srshr v3.8h, v3.8h, #3 ++ mls v4.8h, v21.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v16.8h, v16.8h // a2[0..7] ++ srshr v19.8h, v19.8h, #3 ++ mls v6.8h, v5.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7] ++ abs v3.8h, v3.8h // a1[8..15] ++ srshr v4.8h, v4.8h, #3 ++ abs v19.8h, v19.8h // a2[8..15] ++ bsl v5.16b, v16.16b, v17.16b // a3[0..7] ++ srshr v6.8h, v6.8h, #3 ++ cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8.15] ++ abs v17.8h, v4.8h // a0[0..7] ++ sshr v4.8h, v4.8h, #8 // a0_sign[0..7] ++ bsl v16.16b, v19.16b, v3.16b // a3[8..15] ++ uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ abs v19.8h, v6.8h // a0[8..15] ++ cmhs v20.8h, v17.8h, v22.8h // test a0[0..7] >= pq ++ cmhs v5.8h, v5.8h, v17.8h // test a3[0..7] >= a0[0..7] ++ sub v4.8h, v26.8h, v4.8h // clip_sign[0..7] - a0_sign[0..7] ++ sshr v6.8h, v6.8h, #8 // a0_sign[8..15] ++ mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ orr v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ cmhs v21.8h, v19.8h, v22.8h // test a0[8..15] >= pq ++ cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15] ++ mul v0.8h, v17.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ sub v6.8h, v7.8h, v6.8h // clip_sign[8..15] - a0_sign[8..15] ++ orr v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ orr v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ cmtst v17.2d, v5.2d, v2.2d // if 2nd of each group of is not filtered, then none of the others in the group should be either ++ mov w0, v5.s[1] // move to gp reg ++ cmhs v19.8h, v3.8h, v27.8h ++ ushr v0.8h, v0.8h, #3 // a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w2, v5.s[3] ++ orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v16.16b, v20.16b, v17.16b ++ bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7]) ++ cmtst v2.2d, v5.2d, v2.2d ++ cmhs v3.8h, v0.8h, v23.8h ++ mov w4, v5.s[1] ++ mov w5, v5.s[3] ++ and w0, w0, w2 ++ bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ orr v2.16b, v7.16b, v2.16b ++ bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15]) ++ mls v25.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and w2, w4, w5 ++ bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mla v24.8h, v5.8h, v4.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ and w0, w0, w2 ++ mls v18.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ sqxtun v2.8b, v25.8h ++ tbnz w0, #0, 1f // none of the 16 pixel pairs should be updated in this case ++ mla v1.8h, v0.8h, v6.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ sqxtun v0.8b, v24.8h ++ sqxtun2 v2.16b, v18.8h ++ sqxtun2 v0.16b, v1.8h ++ st1 {v2.16b}, [x3], x1 ++ st1 {v0.16b}, [x3] ++1: ret ++endfunc ++ ++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++// On entry: ++// x0 -> top-left pel of right block ++// w1 = row stride, bytes ++// w2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ sub x3, x0, #4 // where to start reading ++ sxtw x1, w1 // technically, stride is signed int ++ ldr d0, .Lcoeffs ++ ld1 {v1.8b}, [x3], x1 // P1[0], P2[0]... ++ sub x0, x0, #1 // where to start writing ++ ld1 {v2.8b}, [x3], x1 ++ add x4, x0, x1, lsl #3 ++ ld1 {v3.8b}, [x3], x1 ++ add x5, x0, x1, lsl #2 ++ ld1 {v4.8b}, [x3], x1 ++ add x6, x4, x1, lsl #2 ++ ld1 {v5.8b}, [x3], x1 ++ ld1 {v6.8b}, [x3], x1 ++ ld1 {v7.8b}, [x3], x1 ++ trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]... ++ ld1 {v17.8b}, [x3], x1 ++ trn2 v1.8b, v1.8b, v2.8b // P2[0], P2[1], P4[0]... ++ ld1 {v2.8b}, [x3], x1 ++ trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]... ++ ld1 {v19.8b}, [x3], x1 ++ trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]... ++ ld1 {v4.8b}, [x3], x1 ++ trn1 v20.8b, v5.8b, v6.8b // P1[4], P1[5], P3[4]... ++ ld1 {v21.8b}, [x3], x1 ++ trn2 v5.8b, v5.8b, v6.8b // P2[4], P2[5], P4[4]... ++ ld1 {v6.8b}, [x3], x1 ++ trn1 v22.8b, v7.8b, v17.8b // P1[6], P1[7], P3[6]... ++ ld1 {v23.8b}, [x3], x1 ++ trn2 v7.8b, v7.8b, v17.8b // P2[6], P2[7], P4[6]... ++ ld1 {v17.8b}, [x3], x1 ++ trn1 v24.8b, v2.8b, v19.8b // P1[8], P1[9], P3[8]... ++ ld1 {v25.8b}, [x3] ++ trn2 v2.8b, v2.8b, v19.8b // P2[8], P2[9], P4[8]... ++ trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]... ++ trn1 v26.8b, v4.8b, v21.8b // P1[10], P1[11], P3[10]... ++ trn2 v4.8b, v4.8b, v21.8b // P2[10], P2[11], P4[10]... ++ trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]... ++ trn1 v27.4h, v20.4h, v22.4h // P1[4], P1[5], P1[6], P1[7], P5[4]... 
++ trn1 v28.8b, v6.8b, v23.8b // P1[12], P1[13], P3[12]... ++ trn2 v6.8b, v6.8b, v23.8b // P2[12], P2[13], P4[12]... ++ trn1 v23.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]... ++ trn1 v29.4h, v24.4h, v26.4h // P1[8], P1[9], P1[10], P1[11], P5[8]... ++ trn1 v30.8b, v17.8b, v25.8b // P1[14], P1[15], P3[14]... ++ trn2 v17.8b, v17.8b, v25.8b // P2[14], P2[15], P4[14]... ++ trn1 v25.4h, v2.4h, v4.4h // P2[8], P2[9], P2[10], P2[11], P6[8]... ++ trn1 v31.2s, v19.2s, v27.2s // P1[0..7] ++ trn2 v19.2s, v19.2s, v27.2s // P5[0..7] ++ trn1 v27.2s, v21.2s, v23.2s // P2[0..7] ++ trn2 v21.2s, v21.2s, v23.2s // P6[0..7] ++ trn1 v23.4h, v28.4h, v30.4h // P1[12], P1[13], P1[14], P1[15], P5[12]... ++ trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]... ++ trn1 v18.4h, v6.4h, v17.4h // P2[12], P2[13], P2[14], P2[15], P6[12]... ++ trn2 v20.4h, v20.4h, v22.4h // P3[4], P3[5], P3[6], P3[7], P7[4]... ++ trn2 v22.4h, v24.4h, v26.4h // P3[8], P3[9], P3[10], P3[11], P7[8]... ++ trn1 v24.2s, v29.2s, v23.2s // P1[8..15] ++ trn2 v23.2s, v29.2s, v23.2s // P5[8..15] ++ trn1 v26.2s, v25.2s, v18.2s // P2[8..15] ++ trn2 v18.2s, v25.2s, v18.2s // P6[8..15] ++ trn2 v25.4h, v28.4h, v30.4h // P3[12], P3[13], P3[14], P3[15], P7[12]... ++ trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]... ++ trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]... ++ trn2 v2.4h, v2.4h, v4.4h // P4[8], P4[9], P4[10], P4[11], P8[8]... ++ trn2 v4.4h, v6.4h, v17.4h // P4[12], P4[13], P4[14], P4[15], P8[12]... ++ ushll v5.8h, v31.8b, #1 // 2*P1[0..7] ++ ushll v6.8h, v19.8b, #1 // 2*P5[0..7] ++ trn1 v7.2s, v16.2s, v20.2s // P3[0..7] ++ uxtl v17.8h, v27.8b // P2[0..7] ++ trn2 v16.2s, v16.2s, v20.2s // P7[0..7] ++ uxtl v20.8h, v21.8b // P6[0..7] ++ trn1 v21.2s, v22.2s, v25.2s // P3[8..15] ++ ushll v24.8h, v24.8b, #1 // 2*P1[8..15] ++ trn2 v22.2s, v22.2s, v25.2s // P7[8..15] ++ ushll v25.8h, v23.8b, #1 // 2*P5[8..15] ++ trn1 v27.2s, v1.2s, v3.2s // P4[0..7] ++ uxtl v26.8h, v26.8b // P2[8..15] ++ mls v5.8h, v17.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7] ++ uxtl v17.8h, v18.8b // P6[8..15] ++ mls v6.8h, v20.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7] ++ trn1 v18.2s, v2.2s, v4.2s // P4[8..15] ++ uxtl v28.8h, v7.8b // P3[0..7] ++ mls v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15] ++ uxtl v16.8h, v16.8b // P7[0..7] ++ uxtl v26.8h, v21.8b // P3[8..15] ++ mls v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15] ++ uxtl v22.8h, v22.8b // P7[8..15] ++ ushll v7.8h, v7.8b, #1 // 2*P3[0..7] ++ uxtl v27.8h, v27.8b // P4[0..7] ++ trn2 v1.2s, v1.2s, v3.2s // P8[0..7] ++ ushll v3.8h, v21.8b, #1 // 2*P3[8..15] ++ trn2 v2.2s, v2.2s, v4.2s // P8[8..15] ++ uxtl v4.8h, v18.8b // P4[8..15] ++ mla v5.8h, v28.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ uxtl v1.8h, v1.8b // P8[0..7] ++ mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ uxtl v2.8h, v2.8b // P8[8..15] ++ uxtl v16.8h, v19.8b // P5[0..7] ++ mla v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ uxtl v18.8h, v23.8b // P5[8..15] ++ dup v19.8h, w2 // pq ++ mla v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7] ++ sub v22.8h, v4.8h, v18.8h // P4[8..15]-P5[8..15] ++ mls v7.8h, v27.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7] ++ abs v23.8h, v21.8h ++ mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15] ++ abs v26.8h, v22.8h ++ sshr v21.8h, v21.8h, #8 // clip_sign[0..7] ++ mls v5.8h, v27.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ sshr v23.8h, v23.8h, #1 // clip[0..7] ++ sshr 
v26.8h, v26.8h, #1 // clip[8..15] ++ mls v6.8h, v1.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ sshr v1.8h, v22.8h, #8 // clip_sign[8..15] ++ cmeq v22.8h, v23.8h, #0 // test clip[0..7] == 0 ++ mls v24.8h, v4.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ cmeq v28.8h, v26.8h, #0 // test clip[8..15] == 0 ++ srshr v5.8h, v5.8h, #3 ++ mls v25.8h, v2.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ srshr v2.8h, v6.8h, #3 ++ mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ srshr v6.8h, v24.8h, #3 ++ mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ abs v5.8h, v5.8h // a1[0..7] ++ srshr v24.8h, v25.8h, #3 ++ mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ abs v2.8h, v2.8h // a2[0..7] ++ abs v6.8h, v6.8h // a1[8..15] ++ mls v7.8h, v20.8h, v0.h[0] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ abs v17.8h, v24.8h // a2[8..15] ++ cmhs v20.8h, v5.8h, v2.8h // test a1[0..7] >= a2[0..7] ++ srshr v3.8h, v3.8h, #3 ++ cmhs v24.8h, v6.8h, v17.8h // test a1[8..15] >= a2[8.15] ++ srshr v7.8h, v7.8h, #3 ++ bsl v20.16b, v2.16b, v5.16b // a3[0..7] ++ abs v2.8h, v3.8h // a0[8..15] ++ sshr v3.8h, v3.8h, #8 // a0_sign[8..15] ++ bsl v24.16b, v17.16b, v6.16b // a3[8..15] ++ abs v5.8h, v7.8h // a0[0..7] ++ sshr v6.8h, v7.8h, #8 // a0_sign[0..7] ++ cmhs v7.8h, v2.8h, v19.8h // test a0[8..15] >= pq ++ sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15] ++ uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v2.8h, v24.8h, v2.8h // test a3[8..15] >= a0[8..15] ++ uqsub v17.8h, v5.8h, v20.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ cmhs v19.8h, v5.8h, v19.8h // test a0[0..7] >= pq ++ orr v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq ++ sub v6.8h, v21.8h, v6.8h // clip_sign[0..7] - a0_sign[0..7] ++ mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ cmhs v5.8h, v20.8h, v5.8h // test a3[0..7] >= a0[0..7] ++ orr v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq ++ mul v0.8h, v17.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ orr v2.16b, v7.16b, v2.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ orr v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ mov w7, v2.s[1] ++ mov w8, v2.s[3] ++ ushr v0.8h, v0.8h, #3 // a0[0..7] >= a3[0..7] ? 
(5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ mov w2, v5.s[1] // move to gp reg ++ cmhs v2.8h, v3.8h, v26.8h ++ mov w3, v5.s[3] ++ cmhs v5.8h, v0.8h, v23.8h ++ bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15]) ++ and w9, w7, w8 ++ bsl v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7]) ++ and w10, w2, w3 ++ bic v0.16b, v2.16b, v7.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ and w9, w10, w9 ++ bic v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ mls v4.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ tbnz w9, #0, 4f // none of the 16 pixel pairs should be updated in this case ++ mls v27.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ sqxtun v2.8b, v4.8h ++ mla v18.8h, v0.8h, v1.8h // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ sqxtun v0.8b, v27.8h ++ sqxtun v1.8b, v16.8h ++ sqxtun v3.8b, v18.8h ++ tbnz w2, #0, 1f ++ st2 {v0.b, v1.b}[0], [x0], x1 ++ st2 {v0.b, v1.b}[1], [x0], x1 ++ st2 {v0.b, v1.b}[2], [x0], x1 ++ st2 {v0.b, v1.b}[3], [x0] ++1: tbnz w3, #0, 2f ++ st2 {v0.b, v1.b}[4], [x5], x1 ++ st2 {v0.b, v1.b}[5], [x5], x1 ++ st2 {v0.b, v1.b}[6], [x5], x1 ++ st2 {v0.b, v1.b}[7], [x5] ++2: tbnz w7, #0, 3f ++ st2 {v2.b, v3.b}[0], [x4], x1 ++ st2 {v2.b, v3.b}[1], [x4], x1 ++ st2 {v2.b, v3.b}[2], [x4], x1 ++ st2 {v2.b, v3.b}[3], [x4] ++3: tbnz w8, #0, 4f ++ st2 {v2.b, v3.b}[4], [x6], x1 ++ st2 {v2.b, v3.b}[5], [x6], x1 ++ st2 {v2.b, v3.b}[6], [x6], x1 ++ st2 {v2.b, v3.b}[7], [x6] ++4: ret ++endfunc ++ ++// Copy at most the specified number of bytes from source to destination buffer, ++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence ++// On entry: ++// x0 -> source buffer ++// w1 = max number of bytes to copy ++// x2 -> destination buffer, optimally 8-byte aligned ++// On exit: ++// w0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ // Offset by 80 to screen out cases that are too short for us to handle, ++ // and also make it easy to test for loop termination, or to determine ++ // whether we need an odd number of half-iterations of the loop. 
++        subs            w1, w1, #80
++        b.mi            90f
++
++        // Set up useful constants
++        movi            v20.4s, #3, lsl #24
++        movi            v21.4s, #3, lsl #16
++
++        tst             w1, #32
++        b.ne            1f
++
++        ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
++        ext             v25.16b, v0.16b, v1.16b, #1
++        ext             v26.16b, v0.16b, v1.16b, #2
++        ext             v27.16b, v0.16b, v1.16b, #3
++        ext             v29.16b, v1.16b, v2.16b, #1
++        ext             v30.16b, v1.16b, v2.16b, #2
++        ext             v31.16b, v1.16b, v2.16b, #3
++        bic             v24.16b, v0.16b, v20.16b
++        bic             v25.16b, v25.16b, v20.16b
++        bic             v26.16b, v26.16b, v20.16b
++        bic             v27.16b, v27.16b, v20.16b
++        bic             v28.16b, v1.16b, v20.16b
++        bic             v29.16b, v29.16b, v20.16b
++        bic             v30.16b, v30.16b, v20.16b
++        bic             v31.16b, v31.16b, v20.16b
++        eor             v24.16b, v24.16b, v21.16b
++        eor             v25.16b, v25.16b, v21.16b
++        eor             v26.16b, v26.16b, v21.16b
++        eor             v27.16b, v27.16b, v21.16b
++        eor             v28.16b, v28.16b, v21.16b
++        eor             v29.16b, v29.16b, v21.16b
++        eor             v30.16b, v30.16b, v21.16b
++        eor             v31.16b, v31.16b, v21.16b
++        cmeq            v24.4s, v24.4s, #0
++        cmeq            v25.4s, v25.4s, #0
++        cmeq            v26.4s, v26.4s, #0
++        cmeq            v27.4s, v27.4s, #0
++        add             w1, w1, #32
++        b               3f
++
++1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
++        ext             v25.16b, v3.16b, v4.16b, #1
++        ext             v26.16b, v3.16b, v4.16b, #2
++        ext             v27.16b, v3.16b, v4.16b, #3
++        ext             v29.16b, v4.16b, v5.16b, #1
++        ext             v30.16b, v4.16b, v5.16b, #2
++        ext             v31.16b, v4.16b, v5.16b, #3
++        bic             v24.16b, v3.16b, v20.16b
++        bic             v25.16b, v25.16b, v20.16b
++        bic             v26.16b, v26.16b, v20.16b
++        bic             v27.16b, v27.16b, v20.16b
++        bic             v28.16b, v4.16b, v20.16b
++        bic             v29.16b, v29.16b, v20.16b
++        bic             v30.16b, v30.16b, v20.16b
++        bic             v31.16b, v31.16b, v20.16b
++        eor             v24.16b, v24.16b, v21.16b
++        eor             v25.16b, v25.16b, v21.16b
++        eor             v26.16b, v26.16b, v21.16b
++        eor             v27.16b, v27.16b, v21.16b
++        eor             v28.16b, v28.16b, v21.16b
++        eor             v29.16b, v29.16b, v21.16b
++        eor             v30.16b, v30.16b, v21.16b
++        eor             v31.16b, v31.16b, v21.16b
++        cmeq            v24.4s, v24.4s, #0
++        cmeq            v25.4s, v25.4s, #0
++        cmeq            v26.4s, v26.4s, #0
++        cmeq            v27.4s, v27.4s, #0
++        // Drop through...
++2: mov v0.16b, v5.16b ++ ld1 {v1.16b, v2.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v0.16b, v1.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v0.16b, v1.16b, #2 ++ ext v27.16b, v0.16b, v1.16b, #3 ++ ext v29.16b, v1.16b, v2.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v1.16b, v2.16b, #2 ++ ext v31.16b, v1.16b, v2.16b, #3 ++ bic v24.16b, v0.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v1.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 90f ++ st1 {v3.16b, v4.16b}, [x2], #32 ++3: mov v3.16b, v2.16b ++ ld1 {v4.16b, v5.16b}, [x0], #32 ++ cmeq v28.4s, v28.4s, #0 ++ cmeq v29.4s, v29.4s, #0 ++ cmeq v30.4s, v30.4s, #0 ++ cmeq v31.4s, v31.4s, #0 ++ orr v24.16b, v24.16b, v25.16b ++ orr v26.16b, v26.16b, v27.16b ++ orr v28.16b, v28.16b, v29.16b ++ orr v30.16b, v30.16b, v31.16b ++ ext v25.16b, v3.16b, v4.16b, #1 ++ orr v22.16b, v24.16b, v26.16b ++ ext v26.16b, v3.16b, v4.16b, #2 ++ ext v27.16b, v3.16b, v4.16b, #3 ++ ext v29.16b, v4.16b, v5.16b, #1 ++ orr v23.16b, v28.16b, v30.16b ++ ext v30.16b, v4.16b, v5.16b, #2 ++ ext v31.16b, v4.16b, v5.16b, #3 ++ bic v24.16b, v3.16b, v20.16b ++ bic v25.16b, v25.16b, v20.16b ++ bic v26.16b, v26.16b, v20.16b ++ orr v22.16b, v22.16b, v23.16b ++ bic v27.16b, v27.16b, v20.16b ++ bic v28.16b, v4.16b, v20.16b ++ bic v29.16b, v29.16b, v20.16b ++ bic v30.16b, v30.16b, v20.16b ++ bic v31.16b, v31.16b, v20.16b ++ addv s22, v22.4s ++ eor v24.16b, v24.16b, v21.16b ++ eor v25.16b, v25.16b, v21.16b ++ eor v26.16b, v26.16b, v21.16b ++ eor v27.16b, v27.16b, v21.16b ++ eor v28.16b, v28.16b, v21.16b ++ mov w3, v22.s[0] ++ eor v29.16b, v29.16b, v21.16b ++ eor v30.16b, v30.16b, v21.16b ++ eor v31.16b, v31.16b, v21.16b ++ cmeq v24.4s, v24.4s, #0 ++ cmeq v25.4s, v25.4s, #0 ++ cmeq v26.4s, v26.4s, #0 ++ cmeq v27.4s, v27.4s, #0 ++ cbnz w3, 91f ++ st1 {v0.16b, v1.16b}, [x2], #32 ++ subs w1, w1, #64 ++ b.pl 2b ++ ++90: add w0, w1, #80 ++ ret ++ ++91: sub w1, w1, #32 ++ b 90b ++endfunc diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 2e9a3581de..d9571b437f 100644 --- a/libavcodec/allcodecs.c @@ -15303,6 +17287,887 @@ index 0000000000..af8c4c03f0 + bx lr + +endfunc +diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c +index 2cca784f5a..48cb816b70 100644 +--- a/libavcodec/arm/vc1dsp_init_neon.c ++++ b/libavcodec/arm/vc1dsp_init_neon.c +@@ -19,6 +19,7 @@ + #include + + #include "libavutil/attributes.h" ++#include "libavutil/intreadwrite.h" + #include "libavcodec/vc1dsp.h" + #include "vc1dsp.h" + +@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc + void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + void 
ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block); + ++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); ++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); ++ + void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int rnd); + +@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride, + int h, int x, int y); + ++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst); ++ ++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst) ++{ ++ /* Dealing with starting and stopping, and removing escape bytes, are ++ * comparatively less time-sensitive, so are more clearly expressed using ++ * a C wrapper around the assembly inner loop. Note that we assume a ++ * little-endian machine that supports unaligned loads. */ ++ int dsize = 0; ++ while (size >= 4) ++ { ++ int found = 0; ++ while (!found && (((uintptr_t) dst) & 7) && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ if (!found) ++ { ++ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); ++ dst += skip; ++ src += skip; ++ size -= skip; ++ dsize += skip; ++ while (!found && size >= 4) ++ { ++ found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; ++ if (!found) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ } ++ } ++ if (found) ++ { ++ *dst++ = *src++; ++ *dst++ = *src++; ++ ++src; ++ size -= 3; ++ dsize += 2; ++ } ++ } ++ while (size > 0) ++ { ++ *dst++ = *src++; ++ --size; ++ ++dsize; ++ } ++ return dsize; ++} ++ + #define FN_ASSIGN(X, Y) \ + dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \ + dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon +@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) + dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; + dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; + ++ dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; ++ dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; ++ dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; ++ dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; ++ dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; ++ dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; ++ + dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; + FN_ASSIGN(1, 0); + FN_ASSIGN(2, 0); +@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) + dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon; + dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon; + dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon; ++ ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon; + } +diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S +index 93f043bf08..8e97bc5e58 100644 +--- a/libavcodec/arm/vc1dsp_neon.S ++++ b/libavcodec/arm/vc1dsp_neon.S +@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1 + vst1.32 {d1[1]}, [r0,:32] + bx lr + endfunc ++ ++@ VC-1 
in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter4_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1[0]}, [r0], r1 @ P5 ++ vld1.32 {d2[0]}, [r3], r1 @ P1 ++ vld1.32 {d3[0]}, [r3], r1 @ P2 ++ vld1.32 {d4[0]}, [r0], r1 @ P6 ++ vld1.32 {d5[0]}, [r3], r1 @ P3 ++ vld1.32 {d6[0]}, [r0], r1 @ P7 ++ vld1.32 {d7[0]}, [r3] @ P4 ++ vld1.32 {d16[0]}, [r0] @ P8 ++ vshll.u8 q9, d1, #1 @ 2*P5 ++ vdup.16 d17, r2 @ pq ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vmovl.u8 q11, d3 @ P2 ++ vmovl.u8 q1, d4 @ P6 ++ vmovl.u8 q12, d5 @ P3 ++ vmls.i16 d20, d22, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q11, d6 @ P7 ++ vmls.i16 d18, d2, d0[1] @ 2*P5-5*P6 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmovl.u8 q3, d7 @ P4 ++ vmla.i16 d18, d22, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q11, d16 @ P8 ++ vmla.u16 d20, d24, d0[1] @ 2*P1-5*P2+5*P3 ++ vmovl.u8 q12, d1 @ P5 ++ vmls.u16 d4, d6, d0[1] @ 2*P3-5*P4 ++ vmls.u16 d18, d22, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vsub.i16 d1, d6, d24 @ P4-P5 ++ vmls.i16 d20, d6, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vmla.i16 d4, d24, d0[1] @ 2*P3-5*P4+5*P5 ++ vmls.i16 d4, d2, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vabs.s16 d2, d1 ++ vrshr.s16 d3, d18, #3 ++ vrshr.s16 d5, d20, #3 ++ vshr.s16 d2, d2, #1 @ clip ++ vrshr.s16 d4, d4, #3 ++ vabs.s16 d3, d3 @ a2 ++ vshr.s16 d1, d1, #8 @ clip_sign ++ vabs.s16 d5, d5 @ a1 ++ vceq.i16 d7, d2, #0 @ test clip == 0 ++ vabs.s16 d16, d4 @ a0 ++ vshr.s16 d4, d4, #8 @ a0_sign ++ vcge.s16 d18, d5, d3 @ test a1 >= a2 ++ vcge.s16 d17, d16, d17 @ test a0 >= pq ++ vbsl d18, d3, d5 @ a3 ++ vsub.i16 d1, d1, d4 @ clip_sign - a0_sign ++ vorr d3, d7, d17 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d4, d16, d18 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d5, d18, d16 @ test a3 >= a0 ++ vmul.i16 d0, d4, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr d4, d3, d5 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vcge.s16 d4, d0, d2 ++ tst r0, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d4, d2, d0 @ FFMIN(d, clip) ++ vbic d0, d4, d3 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 d6, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 d24, d0, d1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q12 ++ vst1.32 {d0[0]}, [r3], r1 ++ vst1.32 {d1[0]}, [r3] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter4_neon, export=1 ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3] ++ vdup.16 d1, r2 @ pq ++ vtrn.8 q1, q2 ++ vtrn.16 d2, d3 @ P1, P5, P3, P7 ++ vtrn.16 d4, d5 @ P2, P6, P4, P8 ++ vshll.u8 q3, d2, #1 @ 2*P1, 2*P5 ++ vmovl.u8 q8, d4 @ P2, P6 ++ vmovl.u8 q9, d3 @ P3, P7 ++ vmovl.u8 q2, d5 @ P4, P8 ++ vmls.i16 q3, q8, d0[1] @ 2*P1-5*P2, 2*P5-5*P6 ++ vshll.u8 q10, d3, #1 @ 2*P3, 2*P7 ++ vmovl.u8 q1, d2 @ P1, P5 ++ vmla.i16 q3, q9, d0[1] @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7 ++ vmls.i16 q3, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8 ++ vmov d2, d3 @ needs to be in an even-numbered vector for when we come to narrow it later ++ vmls.i16 d20, d4, d0[1] @ 2*P3-5*P4 ++ vmla.i16 d20, d3, d0[1] @ 2*P3-5*P4+5*P5 ++ vsub.i16 d3, d4, d2 @ P4-P5 ++ vmls.i16 d20, d17, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 d5, d3 ++ vshr.s16 d3, d3, #8 @ clip_sign ++ vrshr.s16 d16, d20, #3 ++ vabs.s16 q3, q3 @ a1, a2 ++ vshr.s16 d5, d5, #1 @ clip ++ vabs.s16 d17, d16 @ a0 ++ vceq.i16 d18, d5, #0 @ test clip == 0 ++ vshr.s16 d16, d16, #8 @ a0_sign ++ vcge.s16 d19, d6, d7 @ test a1 >= a2 ++ vcge.s16 d1, d17, d1 @ test a0 >= pq ++ vsub.i16 d16, d3, d16 @ clip_sign - a0_sign ++ vbsl d19, d7, d6 @ a3 ++ vorr d1, d18, d1 @ test clip == 0 || a0 >= pq ++ vqsub.u16 d3, d17, d19 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 d6, d19, d17 @ test a3 >= a0 @ ++ vmul.i16 d0, d3, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr d3, d1, d6 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d3[1] @ move to gp reg ++ vshr.u16 d0, d0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vcge.s16 d3, d0, d5 ++ tst r2, #1 ++ bne 1f @ none of the 4 pixel pairs should be updated if this one is not filtered ++ vbsl d3, d5, d0 @ FFMIN(d, clip) ++ vbic d0, d3, d1 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 d2, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 d4, d0, d16 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d0, q2 ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter8_neon, export=1 ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.32 {d1}, [r0], r1 @ P5 ++ vld1.32 {d2}, [r3], r1 @ P1 ++ vld1.32 {d3}, [r3], r1 @ P2 ++ vld1.32 {d4}, [r0], r1 @ P6 ++ vld1.32 {d5}, [r3], r1 @ P3 ++ vld1.32 {d6}, [r0], r1 @ P7 ++ vshll.u8 q8, d1, #1 @ 2*P5 ++ vshll.u8 q9, d2, #1 @ 2*P1 ++ vld1.32 {d7}, [r3] @ P4 ++ vmovl.u8 q1, d3 @ P2 ++ vld1.32 {d20}, [r0] @ P8 ++ vmovl.u8 q11, d4 @ P6 ++ vdup.16 q12, r2 @ pq ++ vmovl.u8 q13, d5 @ P3 ++ vmls.i16 q9, q1, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q1, d6 @ P7 ++ vshll.u8 q2, d5, #1 @ 2*P3 ++ vmls.i16 q8, q11, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q3, d7 @ P4 ++ vmovl.u8 q10, d20 @ P8 ++ vmla.i16 q8, q1, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q1, d1 @ P5 ++ vmla.i16 q9, q13, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q13, q3, q1 @ P4-P5 ++ vmls.i16 q2, q3, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q8, q10, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q10, q13 ++ vshr.s16 q13, q13, #8 @ clip_sign ++ vmls.i16 q9, q3, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q10, q10, #1 @ clip ++ vmla.i16 q2, q1, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q11, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q11, q10, #0 @ test clip == 0 ++ vrshr.s16 q9, q9, #3 ++ vabs.s16 q8, q8 @ a2 ++ vabs.s16 q9, q9 @ a1 ++ vrshr.s16 q2, q2, #3 ++ vcge.s16 q14, q9, q8 @ test a1 >= a2 ++ vabs.s16 q15, q2 @ a0 ++ vshr.s16 q2, q2, #8 @ a0_sign ++ vbsl q14, q8, q9 @ a3 ++ vcge.s16 q8, q15, q12 @ test a0 >= pq ++ vsub.i16 q2, q13, q2 @ clip_sign - a0_sign ++ vqsub.u16 q9, q15, q14 @ a0 >= a3 ? a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q12, q14, q15 @ test a3 >= a0 ++ vorr q8, q11, q8 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q9, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q9, q8, q12 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vshl.i64 q11, q9, #16 ++ vmov.32 r0, d18[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? 
(5*(a0-a3))>>3 : 0 ++ vmov.32 r2, d19[1] ++ vshr.s64 q9, q11, #48 ++ vcge.s16 q11, q0, q10 ++ vorr q8, q8, q9 ++ and r0, r0, r2 ++ vbsl q11, q10, q0 @ FFMIN(d, clip) ++ tst r0, #1 ++ bne 1f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q11, q8 @ set each d to zero if it should not be filtered ++ vmls.i16 q3, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vmla.i16 q1, q0, q2 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vqmovun.s16 d0, q3 ++ vqmovun.s16 d1, q1 ++ vst1.32 {d0}, [r3], r1 ++ vst1.32 {d1}, [r3] ++1: bx lr ++endfunc ++ ++.align 5 ++.Lcoeffs: ++.quad 0x00050002 ++ ++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter8_neon, export=1 ++ push {lr} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d4}, [r3], r1 ++ add r12, r0, r1, lsl #2 ++ vld1.32 {d3}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d7}, [r3], r1 ++ vld1.32 {d17}, [r3] ++ vtrn.8 q1, q2 @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]... ++ vdup.16 q9, r2 @ pq ++ vtrn.16 d2, d3 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vtrn.16 d4, d5 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vtrn.8 q3, q8 @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]... ++ vtrn.16 d6, d7 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[5], P3[7], P7[4]... ++ vtrn.16 d16, d17 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.32 d2, d6 @ P1, P5 ++ vtrn.32 d4, d16 @ P2, P6 ++ vtrn.32 d3, d7 @ P3, P7 ++ vtrn.32 d5, d17 @ P4, P8 ++ vshll.u8 q10, d2, #1 @ 2*P1 ++ vshll.u8 q11, d6, #1 @ 2*P5 ++ vmovl.u8 q12, d4 @ P2 ++ vmovl.u8 q13, d16 @ P6 ++ vmovl.u8 q14, d3 @ P3 ++ vmls.i16 q10, q12, d0[1] @ 2*P1-5*P2 ++ vmovl.u8 q12, d7 @ P7 ++ vshll.u8 q1, d3, #1 @ 2*P3 ++ vmls.i16 q11, q13, d0[1] @ 2*P5-5*P6 ++ vmovl.u8 q2, d5 @ P4 ++ vmovl.u8 q8, d17 @ P8 ++ vmla.i16 q11, q12, d0[1] @ 2*P5-5*P6+5*P7 ++ vmovl.u8 q3, d6 @ P5 ++ vmla.i16 q10, q14, d0[1] @ 2*P1-5*P2+5*P3 ++ vsub.i16 q12, q2, q3 @ P4-P5 ++ vmls.i16 q1, q2, d0[1] @ 2*P3-5*P4 ++ vmls.i16 q11, q8, d0[0] @ 2*P5-5*P6+5*P7-2*P8 ++ vabs.s16 q8, q12 ++ vshr.s16 q12, q12, #8 @ clip_sign ++ vmls.i16 q10, q2, d0[0] @ 2*P1-5*P2+5*P3-2*P4 ++ vshr.s16 q8, q8, #1 @ clip ++ vmla.i16 q1, q3, d0[1] @ 2*P3-5*P4+5*P5 ++ vrshr.s16 q11, q11, #3 ++ vmls.i16 q1, q13, d0[0] @ 2*P3-5*P4+5*P5-2*P6 ++ vceq.i16 q13, q8, #0 @ test clip == 0 ++ vrshr.s16 q10, q10, #3 ++ vabs.s16 q11, q11 @ a2 ++ vabs.s16 q10, q10 @ a1 ++ vrshr.s16 q1, q1, #3 ++ vcge.s16 q14, q10, q11 @ test a1 >= a2 ++ vabs.s16 q15, q1 @ a0 ++ vshr.s16 q1, q1, #8 @ a0_sign ++ vbsl q14, q11, q10 @ a3 ++ vcge.s16 q9, q15, q9 @ test a0 >= pq ++ vsub.i16 q1, q12, q1 @ clip_sign - a0_sign ++ vqsub.u16 q10, q15, q14 @ a0 >= a3 ? 
a0-a3 : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q11, q14, q15 @ test a3 >= a0 ++ vorr q9, q13, q9 @ test clip == 0 || a0 >= pq ++ vmul.i16 q0, q10, d0[1] @ a0 >= a3 ? 5*(a0-a3) : 0 ++ vorr q10, q9, q11 @ test clip == 0 || a0 >= pq || a3 >= a0 ++ vmov.32 r2, d20[1] @ move to gp reg ++ vshr.u16 q0, q0, #3 @ a0 >= a3 ? (5*(a0-a3))>>3 : 0 ++ vmov.32 r3, d21[1] ++ vcge.s16 q10, q0, q8 ++ and r14, r2, r3 ++ vbsl q10, q8, q0 @ FFMIN(d, clip) ++ tst r14, #1 ++ bne 2f @ none of the 8 pixel pairs should be updated in this case ++ vbic q0, q10, q9 @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmla.i16 q3, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5 ++ vmls.i16 q2, q0, q1 @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4 ++ vqmovun.s16 d1, q3 ++ vqmovun.s16 d0, q2 ++ tst r2, #1 ++ bne 1f @ none of the first 4 pixel pairs should be updated if so ++ vst2.8 {d0[0], d1[0]}, [r0], r1 ++ vst2.8 {d0[1], d1[1]}, [r0], r1 ++ vst2.8 {d0[2], d1[2]}, [r0], r1 ++ vst2.8 {d0[3], d1[3]}, [r0] ++1: tst r3, #1 ++ bne 2f @ none of the second 4 pixel pairs should be updated if so ++ vst2.8 {d0[4], d1[4]}, [r12], r1 ++ vst2.8 {d0[5], d1[5]}, [r12], r1 ++ vst2.8 {d0[6], d1[6]}, [r12], r1 ++ vst2.8 {d0[7], d1[7]}, [r12] ++2: pop {pc} ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of lower block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_v_loop_filter16_neon, export=1 ++ vpush {d8-d15} ++ sub r3, r0, r1, lsl #2 ++ vldr d0, .Lcoeffs ++ vld1.64 {q1}, [r0], r1 @ P5 ++ vld1.64 {q2}, [r3], r1 @ P1 ++ vld1.64 {q3}, [r3], r1 @ P2 ++ vld1.64 {q4}, [r0], r1 @ P6 ++ vld1.64 {q5}, [r3], r1 @ P3 ++ vld1.64 {q6}, [r0], r1 @ P7 ++ vshll.u8 q7, d2, #1 @ 2*P5[0..7] ++ vshll.u8 q8, d4, #1 @ 2*P1[0..7] ++ vld1.64 {q9}, [r3] @ P4 ++ vmovl.u8 q10, d6 @ P2[0..7] ++ vld1.64 {q11}, [r0] @ P8 ++ vmovl.u8 q12, d8 @ P6[0..7] ++ vdup.16 q13, r2 @ pq ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vmls.i16 q8, q10, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vshll.u8 q10, d3, #1 @ 2*P5[8..15] ++ vmovl.u8 q3, d7 @ P2[8..15] ++ vmls.i16 q7, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q4, d9 @ P6[8..15] ++ vmovl.u8 q14, d10 @ P3[0..7] ++ vmovl.u8 q15, d12 @ P7[0..7] ++ vmls.i16 q2, q3, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vshll.u8 q3, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q10, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q6, d13 @ P7[8..15] ++ vmla.i16 q8, q14, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q14, d18 @ P4[0..7] ++ vmovl.u8 q9, d19 @ P4[8..15] ++ vmla.i16 q7, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vmovl.u8 q15, d11 @ P3[8..15] ++ vshll.u8 q5, d11, #1 @ 2*P3[8..15] ++ vmls.i16 q3, q14, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmla.i16 q2, q15, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q15, d22 @ P8[0..7] ++ vmovl.u8 q11, d23 @ P8[8..15] ++ vmla.i16 q10, q6, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q6, d2 @ P5[0..7] ++ vmovl.u8 q1, d3 @ P5[8..15] ++ vmls.i16 q5, q9, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q8, q14, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q7, q15, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 q15, q14, q6 @ P4[0..7]-P5[0..7] ++ 
vmla.i16 q3, q6, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q8, q8, #3 ++ vmls.i16 q2, q9, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q7, q7, #3 ++ vmls.i16 q10, q11, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vabs.s16 q11, q15 ++ vabs.s16 q8, q8 @ a1[0..7] ++ vmla.i16 q5, q1, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vshr.s16 q15, q15, #8 @ clip_sign[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmls.i16 q3, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q7, q7 @ a2[0..7] ++ vrshr.s16 q10, q10, #3 ++ vsub.i16 q12, q9, q1 @ P4[8..15]-P5[8..15] ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vmls.i16 q5, q4, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vcge.s16 q4, q8, q7 @ test a1[0..7] >= a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vrshr.s16 q3, q3, #3 ++ vabs.s16 q10, q10 @ a2[8..15] ++ vbsl q4, q7, q8 @ a3[0..7] ++ vabs.s16 q7, q12 ++ vshr.s16 q8, q12, #8 @ clip_sign[8..15] ++ vrshr.s16 q5, q5, #3 ++ vcge.s16 q12, q2, q10 @ test a1[8..15] >= a2[8.15] ++ vshr.s16 q7, q7, #1 @ clip[8..15] ++ vbsl q12, q10, q2 @ a3[8..15] ++ vabs.s16 q2, q3 @ a0[0..7] ++ vceq.i16 q10, q11, #0 @ test clip[0..7] == 0 ++ vshr.s16 q3, q3, #8 @ a0_sign[0..7] ++ vsub.i16 q3, q15, q3 @ clip_sign[0..7] - a0_sign[0..7] ++ vcge.s16 q15, q2, q13 @ test a0[0..7] >= pq ++ vorr q10, q10, q15 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vqsub.u16 q15, q2, q4 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q4, q2 @ test a3[0..7] >= a0[0..7] ++ vabs.s16 q4, q5 @ a0[8..15] ++ vshr.s16 q5, q5, #8 @ a0_sign[8..15] ++ vmul.i16 q15, q15, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vcge.s16 q13, q4, q13 @ test a0[8..15] >= pq ++ vorr q2, q10, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vsub.i16 q5, q8, q5 @ clip_sign[8..15] - a0_sign[8..15] ++ vceq.i16 q8, q7, #0 @ test clip[8..15] == 0 ++ vshr.u16 q15, q15, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vmov.32 r0, d4[1] @ move to gp reg ++ vorr q8, q8, q13 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vqsub.u16 q13, q4, q12 @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vmov.32 r2, d5[1] ++ vcge.s16 q4, q12, q4 @ test a3[8..15] >= a0[8..15] ++ vshl.i64 q2, q2, #16 ++ vcge.s16 q12, q15, q11 ++ vmul.i16 q0, q13, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ vorr q4, q8, q4 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vshr.s64 q2, q2, #48 ++ and r0, r0, r2 ++ vbsl q12, q11, q15 @ FFMIN(d[0..7], clip[0..7]) ++ vshl.i64 q11, q4, #16 ++ vmov.32 r2, d8[1] ++ vshr.u16 q0, q0, #3 @ a0[8..15] >= a3[8..15] ? 
(5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q10, q2 ++ vmov.32 r12, d9[1] ++ vshr.s64 q4, q11, #48 ++ vcge.s16 q10, q0, q7 ++ vbic q2, q12, q2 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vorr q4, q8, q4 ++ and r2, r2, r12 ++ vbsl q10, q7, q0 @ FFMIN(d[8..15], clip[8..15]) ++ vmls.i16 q14, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7] ++ and r0, r0, r2 ++ vbic q0, q10, q4 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ tst r0, #1 ++ bne 1f @ none of the 16 pixel pairs should be updated in this case ++ vmla.i16 q6, q2, q3 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7] ++ vmls.i16 q9, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15] ++ vqmovun.s16 d4, q14 ++ vmla.i16 q1, q0, q5 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15] ++ vqmovun.s16 d0, q6 ++ vqmovun.s16 d5, q9 ++ vqmovun.s16 d1, q1 ++ vst1.64 {q2}, [r3], r1 ++ vst1.64 {q0}, [r3] ++1: vpop {d8-d15} ++ bx lr ++endfunc ++ ++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks ++@ On entry: ++@ r0 -> top-left pel of right block ++@ r1 = row stride, bytes ++@ r2 = PQUANT bitstream parameter ++function ff_vc1_h_loop_filter16_neon, export=1 ++ push {r4-r6,lr} ++ vpush {d8-d15} ++ sub r3, r0, #4 @ where to start reading ++ vldr d0, .Lcoeffs ++ vld1.32 {d2}, [r3], r1 @ P1[0], P2[0]... ++ sub r0, r0, #1 @ where to start writing ++ vld1.32 {d3}, [r3], r1 ++ add r4, r0, r1, lsl #2 ++ vld1.32 {d10}, [r3], r1 ++ vld1.32 {d11}, [r3], r1 ++ vld1.32 {d16}, [r3], r1 ++ vld1.32 {d4}, [r3], r1 ++ vld1.32 {d8}, [r3], r1 ++ vtrn.8 d2, d3 @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]... ++ vld1.32 {d14}, [r3], r1 ++ vld1.32 {d5}, [r3], r1 ++ vtrn.8 d10, d11 @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]... ++ vld1.32 {d6}, [r3], r1 ++ vld1.32 {d12}, [r3], r1 ++ vtrn.8 d16, d4 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]... ++ vld1.32 {d13}, [r3], r1 ++ vtrn.16 d2, d10 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]... ++ vld1.32 {d1}, [r3], r1 ++ vtrn.8 d8, d14 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]... ++ vld1.32 {d7}, [r3], r1 ++ vtrn.16 d3, d11 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]... ++ vld1.32 {d9}, [r3], r1 ++ vtrn.8 d5, d6 @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]... ++ vld1.32 {d15}, [r3] ++ vtrn.16 d16, d8 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]... ++ vtrn.16 d4, d14 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]... ++ vtrn.8 d12, d13 @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]... ++ vdup.16 q9, r2 @ pq ++ vtrn.8 d1, d7 @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]... ++ vtrn.32 d2, d16 @ P1[0..7], P5[0..7] ++ vtrn.16 d5, d12 @ P1[8], P1[7], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]... ++ vtrn.16 d6, d13 @ P2[8], P2[7], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]... ++ vtrn.8 d9, d15 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]... 
++ vtrn.32 d3, d4 @ P2[0..7], P6[0..7] ++ vshll.u8 q10, d2, #1 @ 2*P1[0..7] ++ vtrn.32 d10, d8 @ P3[0..7], P7[0..7] ++ vshll.u8 q11, d16, #1 @ 2*P5[0..7] ++ vtrn.32 d11, d14 @ P4[0..7], P8[0..7] ++ vtrn.16 d1, d9 @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]... ++ vtrn.16 d7, d15 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]... ++ vmovl.u8 q1, d3 @ P2[0..7] ++ vmovl.u8 q12, d4 @ P6[0..7] ++ vtrn.32 d5, d1 @ P1[8..15], P5[8..15] ++ vtrn.32 d6, d7 @ P2[8..15], P6[8..15] ++ vtrn.32 d12, d9 @ P3[8..15], P7[8..15] ++ vtrn.32 d13, d15 @ P4[8..15], P8[8..15] ++ vmls.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7] ++ vmovl.u8 q1, d10 @ P3[0..7] ++ vshll.u8 q2, d5, #1 @ 2*P1[8..15] ++ vshll.u8 q13, d1, #1 @ 2*P5[8..15] ++ vmls.i16 q11, q12, d0[1] @ 2*P5[0..7]-5*P6[0..7] ++ vmovl.u8 q14, d6 @ P2[8..15] ++ vmovl.u8 q3, d7 @ P6[8..15] ++ vmovl.u8 q15, d8 @ P7[0..7] ++ vmla.i16 q10, q1, d0[1] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7] ++ vmovl.u8 q1, d12 @ P3[8..15] ++ vmls.i16 q2, q14, d0[1] @ 2*P1[8..15]-5*P2[8..15] ++ vmovl.u8 q4, d9 @ P7[8..15] ++ vshll.u8 q14, d10, #1 @ 2*P3[0..7] ++ vmls.i16 q13, q3, d0[1] @ 2*P5[8..15]-5*P6[8..15] ++ vmovl.u8 q5, d11 @ P4[0..7] ++ vmla.i16 q11, q15, d0[1] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7] ++ vshll.u8 q15, d12, #1 @ 2*P3[8..15] ++ vmovl.u8 q6, d13 @ P4[8..15] ++ vmla.i16 q2, q1, d0[1] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15] ++ vmovl.u8 q1, d14 @ P8[0..7] ++ vmovl.u8 q7, d15 @ P8[8..15] ++ vmla.i16 q13, q4, d0[1] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15] ++ vmovl.u8 q4, d16 @ P5[0..7] ++ vmovl.u8 q8, d1 @ P5[8..15] ++ vmls.i16 q14, q5, d0[1] @ 2*P3[0..7]-5*P4[0..7] ++ vmls.i16 q15, q6, d0[1] @ 2*P3[8..15]-5*P4[8..15] ++ vmls.i16 q10, q5, d0[0] @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7] ++ vmls.i16 q11, q1, d0[0] @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7] ++ vsub.i16 q1, q5, q4 @ P4[0..7]-P5[0..7] ++ vmls.i16 q2, q6, d0[0] @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15] ++ vrshr.s16 q10, q10, #3 ++ vmls.i16 q13, q7, d0[0] @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15] ++ vsub.i16 q7, q6, q8 @ P4[8..15]-P5[8..15] ++ vrshr.s16 q11, q11, #3 ++ vmla.s16 q14, q4, d0[1] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7] ++ vrshr.s16 q2, q2, #3 ++ vmla.i16 q15, q8, d0[1] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15] ++ vabs.s16 q10, q10 @ a1[0..7] ++ vrshr.s16 q13, q13, #3 ++ vmls.i16 q15, q3, d0[0] @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15] ++ vabs.s16 q3, q11 @ a2[0..7] ++ vabs.s16 q2, q2 @ a1[8..15] ++ vmls.i16 q14, q12, d0[0] @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7] ++ vabs.s16 q11, q1 ++ vabs.s16 q12, q13 @ a2[8..15] ++ vcge.s16 q13, q10, q3 @ test a1[0..7] >= a2[0..7] ++ vshr.s16 q1, q1, #8 @ clip_sign[0..7] ++ vrshr.s16 q15, q15, #3 ++ vshr.s16 q11, q11, #1 @ clip[0..7] ++ vrshr.s16 q14, q14, #3 ++ vbsl q13, q3, q10 @ a3[0..7] ++ vcge.s16 q3, q2, q12 @ test a1[8..15] >= a2[8.15] ++ vabs.s16 q10, q15 @ a0[8..15] ++ vshr.s16 q15, q15, #8 @ a0_sign[8..15] ++ vbsl q3, q12, q2 @ a3[8..15] ++ vabs.s16 q2, q14 @ a0[0..7] ++ vabs.s16 q12, q7 ++ vshr.s16 q7, q7, #8 @ clip_sign[8..15] ++ vshr.s16 q14, q14, #8 @ a0_sign[0..7] ++ vshr.s16 q12, q12, #1 @ clip[8..15] ++ vsub.i16 q7, q7, q15 @ clip_sign[8..15] - a0_sign[8..15] ++ vqsub.u16 q15, q10, q3 @ a0[8..15] >= a3[8..15] ? 
a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q3, q3, q10 @ test a3[8..15] >= a0[8..15] ++ vcge.s16 q10, q10, q9 @ test a0[8..15] >= pq ++ vcge.s16 q9, q2, q9 @ test a0[0..7] >= pq ++ vsub.i16 q1, q1, q14 @ clip_sign[0..7] - a0_sign[0..7] ++ vqsub.u16 q14, q2, q13 @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs) ++ vcge.s16 q2, q13, q2 @ test a3[0..7] >= a0[0..7] ++ vmul.i16 q13, q15, d0[1] @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0 ++ vceq.i16 q15, q11, #0 @ test clip[0..7] == 0 ++ vmul.i16 q0, q14, d0[1] @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0 ++ vorr q9, q15, q9 @ test clip[0..7] == 0 || a0[0..7] >= pq ++ vceq.i16 q14, q12, #0 @ test clip[8..15] == 0 ++ vshr.u16 q13, q13, #3 @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0 ++ vorr q2, q9, q2 @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7] ++ vshr.u16 q0, q0, #3 @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0 ++ vorr q10, q14, q10 @ test clip[8..15] == 0 || a0[8..15] >= pq ++ vcge.s16 q14, q13, q12 ++ vmov.32 r2, d4[1] @ move to gp reg ++ vorr q3, q10, q3 @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15] ++ vmov.32 r3, d5[1] ++ vcge.s16 q2, q0, q11 ++ vbsl q14, q12, q13 @ FFMIN(d[8..15], clip[8..15]) ++ vbsl q2, q11, q0 @ FFMIN(d[0..7], clip[0..7]) ++ vmov.32 r5, d6[1] ++ vbic q0, q14, q10 @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmov.32 r6, d7[1] ++ and r12, r2, r3 ++ vbic q2, q2, q9 @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub) ++ vmls.i16 q6, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4 ++ vmls.i16 q5, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4 ++ and r14, r5, r6 ++ vmla.i16 q4, q2, q1 @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5 ++ and r12, r12, r14 ++ vqmovun.s16 d4, q6 ++ vmla.i16 q8, q0, q7 @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5 ++ tst r12, #1 ++ bne 4f @ none of the 16 pixel pairs should be updated in this case ++ vqmovun.s16 d2, q5 ++ vqmovun.s16 d3, q4 ++ vqmovun.s16 d5, q8 ++ tst r2, #1 ++ bne 1f ++ vst2.8 {d2[0], d3[0]}, [r0], r1 ++ vst2.8 {d2[1], d3[1]}, [r0], r1 ++ vst2.8 {d2[2], d3[2]}, [r0], r1 ++ vst2.8 {d2[3], d3[3]}, [r0] ++1: add r0, r4, r1, lsl #2 ++ tst r3, #1 ++ bne 2f ++ vst2.8 {d2[4], d3[4]}, [r4], r1 ++ vst2.8 {d2[5], d3[5]}, [r4], r1 ++ vst2.8 {d2[6], d3[6]}, [r4], r1 ++ vst2.8 {d2[7], d3[7]}, [r4] ++2: add r4, r0, r1, lsl #2 ++ tst r5, #1 ++ bne 3f ++ vst2.8 {d4[0], d5[0]}, [r0], r1 ++ vst2.8 {d4[1], d5[1]}, [r0], r1 ++ vst2.8 {d4[2], d5[2]}, [r0], r1 ++ vst2.8 {d4[3], d5[3]}, [r0] ++3: tst r6, #1 ++ bne 4f ++ vst2.8 {d4[4], d5[4]}, [r4], r1 ++ vst2.8 {d4[5], d5[5]}, [r4], r1 ++ vst2.8 {d4[6], d5[6]}, [r4], r1 ++ vst2.8 {d4[7], d5[7]}, [r4] ++4: vpop {d8-d15} ++ pop {r4-r6,pc} ++endfunc ++ ++@ Copy at most the specified number of bytes from source to destination buffer, ++@ 
stopping at a multiple of 16 bytes, none of which are the start of an escape sequence ++@ On entry: ++@ r0 -> source buffer ++@ r1 = max number of bytes to copy ++@ r2 -> destination buffer, optimally 8-byte aligned ++@ On exit: ++@ r0 = number of bytes not copied ++function ff_vc1_unescape_buffer_helper_neon, export=1 ++ @ Offset by 48 to screen out cases that are too short for us to handle, ++ @ and also make it easy to test for loop termination, or to determine ++ @ whether we need an odd number of half-iterations of the loop. ++ subs r1, r1, #48 ++ bmi 90f ++ ++ @ Set up useful constants ++ vmov.i32 q0, #0x3000000 ++ vmov.i32 q1, #0x30000 ++ ++ tst r1, #16 ++ bne 1f ++ ++ vld1.8 {q8, q9}, [r0]! ++ vbic q12, q8, q0 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ add r1, r1, #16 ++ b 3f ++ ++1: vld1.8 {q10, q11}, [r0]! ++ vbic q12, q10, q0 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ @ Drop through... ++2: vmov q8, q11 ++ vld1.8 {q9}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q8, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q8, q9, #1 ++ vext.8 q14, q8, q9, #2 ++ vext.8 q15, q8, q9, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 90f ++ vst1.64 {q10}, [r2]! ++3: vmov q10, q9 ++ vld1.8 {q11}, [r0]! ++ vorr q13, q12, q13 ++ vorr q15, q14, q15 ++ vbic q12, q10, q0 ++ vorr q3, q13, q15 ++ vext.8 q13, q10, q11, #1 ++ vext.8 q14, q10, q11, #2 ++ vext.8 q15, q10, q11, #3 ++ veor q12, q12, q1 ++ vorr d6, d6, d7 ++ vbic q13, q13, q0 ++ vbic q14, q14, q0 ++ vbic q15, q15, q0 ++ vceq.i32 q12, q12, #0 ++ vmov r3, r12, d6 ++ veor q13, q13, q1 ++ veor q14, q14, q1 ++ veor q15, q15, q1 ++ vceq.i32 q13, q13, #0 ++ vceq.i32 q14, q14, #0 ++ vceq.i32 q15, q15, #0 ++ orrs r3, r3, r12 ++ bne 91f ++ vst1.64 {q8}, [r2]! 
++ subs r1, r1, #32 ++ bpl 2b ++ ++90: add r0, r1, #48 ++ bx lr ++ ++91: sub r1, r1, #16 ++ b 90b ++endfunc diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 8a71c04230..53644506e5 100644 --- a/libavcodec/avcodec.h @@ -15325,6 +18190,31 @@ index 8a71c04230..53644506e5 100644 } AVHWAccel; /** +diff --git a/libavcodec/blockdsp.c b/libavcodec/blockdsp.c +index c7efe7e77b..46766244b8 100644 +--- a/libavcodec/blockdsp.c ++++ b/libavcodec/blockdsp.c +@@ -65,6 +65,8 @@ av_cold void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx) + c->fill_block_tab[0] = fill_block16_c; + c->fill_block_tab[1] = fill_block8_c; + ++ if (ARCH_AARCH64) ++ ff_blockdsp_init_aarch64(c); + if (ARCH_ALPHA) + ff_blockdsp_init_alpha(c); + if (ARCH_ARM) +diff --git a/libavcodec/blockdsp.h b/libavcodec/blockdsp.h +index 26fc2ea13b..fe539491da 100644 +--- a/libavcodec/blockdsp.h ++++ b/libavcodec/blockdsp.h +@@ -41,6 +41,7 @@ typedef struct BlockDSPContext { + + void ff_blockdsp_init(BlockDSPContext *c, AVCodecContext *avctx); + ++void ff_blockdsp_init_aarch64(BlockDSPContext *c); + void ff_blockdsp_init_alpha(BlockDSPContext *c); + void ff_blockdsp_init_arm(BlockDSPContext *c); + void ff_blockdsp_init_ppc(BlockDSPContext *c); diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h index 38d06b2842..bbf5d70560 100644 --- a/libavcodec/cabac.h @@ -46342,7 +49232,7 @@ index 0000000000..85c5b46d75 +}; + diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 4b2679eb38..6ee6ad8642 100644 +index 4b2679eb38..6ca83cc21b 100644 --- a/libavcodec/v4l2_buffers.c +++ b/libavcodec/v4l2_buffers.c @@ -21,6 +21,7 @@ @@ -47256,7 +50146,7 @@ index 4b2679eb38..6ee6ad8642 100644 if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { avbuf->buf.m.planes = avbuf->planes; avbuf->buf.length = avbuf->num_planes; -@@ -555,18 +911,47 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) +@@ -555,20 +911,51 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) avbuf->buf.length = avbuf->planes[0].length; } @@ -47300,14 +50190,18 @@ index 4b2679eb38..6ee6ad8642 100644 + err, strerror(err)); + return AVERROR(err); + } -+ + ++ // Lock not wanted - if called from buffer free then lock already obtained + qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1; + avbuf->status = V4L2BUF_IN_DRIVER; ++ pthread_cond_broadcast(&avbuf->context->cond); ++ + av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n", + avbuf->context->name, avbuf->buf.index, + avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc); - avbuf->status = V4L2BUF_IN_DRIVER; - + return 0; + } diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h index 8dbc7fc104..7d5fadcd3d 100644 --- a/libavcodec/v4l2_buffers.h @@ -47380,7 +50274,7 @@ index 8dbc7fc104..7d5fadcd3d 100644 /** * Enqueues a V4L2Buffer diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index ff1ea8e57b..d8a86e8261 100644 +index ff1ea8e57b..1aff16c1de 100644 --- a/libavcodec/v4l2_context.c +++ b/libavcodec/v4l2_context.c @@ -27,11 +27,13 @@ @@ -47625,7 +50519,7 @@ index ff1ea8e57b..d8a86e8261 100644 return 1; } -@@ -280,171 +291,274 @@ static int v4l2_stop_encode(V4L2Context *ctx) +@@ -280,171 +291,275 @@ static int v4l2_stop_encode(V4L2Context *ctx) return 0; } @@ -47639,6 +50533,7 @@ index ff1ea8e57b..d8a86e8261 100644 +// Returns: +// 0 Success +// AVERROR(EPIPE) Nothing more to read ++// AVERROR(ENOSPC) No buffers in Q to put result in +// * AVERROR(..) 
+ + static int @@ -47874,7 +50769,7 @@ index ff1ea8e57b..d8a86e8261 100644 + (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) || + (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) { + av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name); -+ return AVERROR(EAGAIN); ++ return AVERROR(ENOSPC); + } - /* the driver is ready to accept more input; instead of waiting for the capture @@ -48022,7 +50917,7 @@ index ff1ea8e57b..d8a86e8261 100644 } return NULL; -@@ -452,25 +566,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) +@@ -452,25 +567,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx) static int v4l2_release_buffers(V4L2Context* ctx) { @@ -48082,7 +50977,7 @@ index ff1ea8e57b..d8a86e8261 100644 } static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -499,6 +633,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm +@@ -499,6 +634,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) { @@ -48091,7 +50986,7 @@ index ff1ea8e57b..d8a86e8261 100644 enum AVPixelFormat pixfmt = ctx->av_pix_fmt; struct v4l2_fmtdesc fdesc; int ret; -@@ -517,6 +653,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) +@@ -517,6 +654,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) if (ret) return AVERROR(EINVAL); @@ -48105,7 +51000,7 @@ index ff1ea8e57b..d8a86e8261 100644 pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); ret = v4l2_try_raw_format(ctx, pixfmt); if (ret){ -@@ -569,18 +712,84 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) +@@ -569,18 +713,84 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) * *****************************************************************************/ @@ -48194,7 +51089,7 @@ index ff1ea8e57b..d8a86e8261 100644 } int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) -@@ -608,7 +817,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) +@@ -608,7 +818,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) return ff_v4l2_buffer_enqueue(avbuf); } @@ -48204,7 +51099,7 @@ index ff1ea8e57b..d8a86e8261 100644 { V4L2m2mContext *s = ctx_to_m2mctx(ctx); V4L2Buffer* avbuf; -@@ -616,8 +826,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -616,8 +827,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!pkt->size) { ret = v4l2_stop_decode(ctx); @@ -48215,7 +51110,7 @@ index ff1ea8e57b..d8a86e8261 100644 s->draining = 1; return 0; } -@@ -626,8 +837,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -626,8 +838,11 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) if (!avbuf) return AVERROR(EAGAIN); @@ -48229,7 +51124,7 @@ index ff1ea8e57b..d8a86e8261 100644 return ret; return ff_v4l2_buffer_enqueue(avbuf); -@@ -636,19 +850,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) +@@ -636,19 +851,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) { V4L2Buffer *avbuf; @@ -48252,7 +51147,7 @@ index ff1ea8e57b..d8a86e8261 100644 return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); 
} -@@ -656,19 +861,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) +@@ -656,19 +862,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout) int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) { V4L2Buffer *avbuf; @@ -48271,11 +51166,11 @@ index ff1ea8e57b..d8a86e8261 100644 - return AVERROR(EAGAIN); - } + if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0) -+ return rv; ++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf); } -@@ -702,78 +898,158 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) +@@ -702,78 +899,160 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe) int ff_v4l2_context_set_format(V4L2Context* ctx) { @@ -48322,6 +51217,7 @@ index ff1ea8e57b..d8a86e8261 100644 + av_buffer_unref(&ctx->frames_ref); + + ff_mutex_destroy(&ctx->lock); ++ pthread_cond_destroy(&ctx->cond); } -int ff_v4l2_context_init(V4L2Context* ctx) @@ -48424,6 +51320,7 @@ index ff1ea8e57b..d8a86e8261 100644 + } + + ff_mutex_init(&ctx->lock, NULL); ++ pthread_cond_init(&ctx->cond, NULL); + atomic_init(&ctx->q_count, 0); + + if (s->output_drm) { @@ -48466,7 +51363,7 @@ index ff1ea8e57b..d8a86e8261 100644 return ret; } diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 22a9532444..35e83c66d9 100644 +index 22a9532444..a56216e990 100644 --- a/libavcodec/v4l2_context.h +++ b/libavcodec/v4l2_context.h @@ -31,6 +31,7 @@ @@ -48498,7 +51395,7 @@ index 22a9532444..35e83c66d9 100644 /** * Readonly after init. -@@ -92,6 +100,20 @@ typedef struct V4L2Context { +@@ -92,6 +100,21 @@ typedef struct V4L2Context { */ int done; @@ -48516,18 +51413,22 @@ index 22a9532444..35e83c66d9 100644 + struct ff_weak_link_master *wl_master; + + AVMutex lock; ++ pthread_cond_t cond; } V4L2Context; /** -@@ -156,6 +178,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); +@@ -156,7 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt); * @param[in] ctx The V4L2Context to dequeue from. * @param[inout] f The AVFrame to dequeue to. * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds) + * * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error. ++ * AVERROR(ENOSPC) if no buffer availible to put ++ * the frame in */ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); -@@ -170,7 +193,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + +@@ -170,7 +196,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); * @param[in] pkt A pointer to an AVPacket. * @return 0 in case of success, a negative error otherwise. 
*/ @@ -48614,7 +51515,7 @@ index cdfd579810..010b4232d4 100644 return 0; diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index b67b216331..f1923bb26d 100644 +index b67b216331..9a20447030 100644 --- a/libavcodec/v4l2_m2m.h +++ b/libavcodec/v4l2_m2m.h @@ -30,6 +30,7 @@ @@ -48673,7 +51574,7 @@ index b67b216331..f1923bb26d 100644 /* null frame/packet received */ int draining; -@@ -66,6 +97,27 @@ typedef struct V4L2m2mContext { +@@ -66,6 +97,29 @@ typedef struct V4L2m2mContext { /* reference back to V4L2m2mPriv */ void *priv; @@ -48685,6 +51586,8 @@ index b67b216331..f1923bb26d 100644 + + /* Frame tracking */ + xlat_track_t xlat; ++ int pending_hw; ++ int pending_n; + + pts_stats_t pts_stat; + @@ -48701,7 +51604,7 @@ index b67b216331..f1923bb26d 100644 } V4L2m2mContext; typedef struct V4L2m2mPriv { -@@ -76,6 +128,7 @@ typedef struct V4L2m2mPriv { +@@ -76,6 +130,7 @@ typedef struct V4L2m2mPriv { int num_output_buffers; int num_capture_buffers; @@ -48709,7 +51612,7 @@ index b67b216331..f1923bb26d 100644 } V4L2m2mPriv; /** -@@ -129,4 +182,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); +@@ -129,4 +184,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx); */ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx); @@ -48737,7 +51640,7 @@ index b67b216331..f1923bb26d 100644 + #endif /* AVCODEC_V4L2_M2M_H */ diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index ab07c0a24a..3dd462362c 100644 +index ab07c0a24a..3bd4ff64cc 100644 --- a/libavcodec/v4l2_m2m_dec.c +++ b/libavcodec/v4l2_m2m_dec.c @@ -23,6 +23,10 @@ @@ -48907,7 +51810,7 @@ index ab07c0a24a..3dd462362c 100644 return 0; } -@@ -133,58 +164,461 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) +@@ -133,58 +164,514 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) return 0; } @@ -49004,7 +51907,8 @@ index ab07c0a24a..3dd462362c 100644 +#endif + frame->best_effort_timestamp = pts_stats_guess(ps); + frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? -+ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts); ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n", ++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n); + return 0; +} + @@ -49197,6 +52101,36 @@ index ab07c0a24a..3dd462362c 100644 return ret; } ++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx) ++{ ++ int rv = 0; ++ ++ ff_mutex_lock(&ctx->lock); ++ ++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) { ++ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) { ++ rv = AVERROR(errno); ++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv)); ++ break; ++ } ++ } ++ ++ ff_mutex_unlock(&ctx->lock); ++ return rv; ++} ++ ++// Number of frames over what xlat_pending returns that we keep *16 ++// This is a min value - if it appears to be too small the threshold should ++// adjust dynamically. 
++#define PENDING_HW_MIN (3 * 16) ++// Offset to use when setting dynamically ++// Set to %16 == 15 to avoid the threshold changing immediately as we relax ++#define PENDING_HW_OFFSET (PENDING_HW_MIN - 1) ++// Number of consecutive times we've failed to get a frame when we prefer it ++// before we increase the prefer threshold (5ms * N = max expected decode ++// time) ++#define PENDING_N_THRESHOLD 6 ++ +static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) +{ + V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; @@ -49206,7 +52140,7 @@ index ab07c0a24a..3dd462362c 100644 + + do { + const int pending = xlat_pending(&s->xlat); -+ const int prefer_dq = (pending > 5); ++ const int prefer_dq = (pending > s->pending_hw / 16); + + // Enqueue another pkt for decode if + // (a) We don't have a lot of stuff in the buffer already OR @@ -49240,6 +52174,27 @@ index ab07c0a24a..3dd462362c 100644 + // there is room in the input Q and timeout == -1 + dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); + ++ // Failure due to no buffer in Q? ++ if (dst_rv == AVERROR(ENOSPC)) { ++ // Wait & retry ++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) { ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t); ++ } ++ } ++ ++ // Adjust dynamic pending threshold ++ if (dst_rv == 0) { ++ if (--s->pending_hw < PENDING_HW_MIN) ++ s->pending_hw = PENDING_HW_MIN; ++ s->pending_n = 0; ++ } ++ else if (dst_rv == AVERROR(EAGAIN)) { ++ if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) { ++ s->pending_hw = pending * 16 + PENDING_HW_OFFSET; ++ s->pending_n = 0; ++ } ++ } ++ + if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) { + av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF"); + dst_rv = AVERROR_EOF; @@ -49388,11 +52343,12 @@ index ab07c0a24a..3dd462362c 100644 + xlat_init(&s->xlat); + pts_stats_init(&s->pts_stat, avctx, "decoder"); ++ s->pending_hw = PENDING_HW_MIN; + capture = &s->capture; output = &s->output; -@@ -192,14 +626,51 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -192,14 +679,51 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) * by the v4l2 driver; this event will trigger a full pipeline reconfig and * the proper values will be retrieved from the kernel driver. 
*/ @@ -49446,7 +52402,7 @@ index ab07c0a24a..3dd462362c 100644 s->avctx = avctx; ret = ff_v4l2_m2m_codec_init(priv); -@@ -208,12 +679,68 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) +@@ -208,12 +732,68 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) return ret; } @@ -49517,7 +52473,7 @@ index ab07c0a24a..3dd462362c 100644 } #define OFFSET(x) offsetof(V4L2m2mPriv, x) -@@ -222,10 +749,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) +@@ -222,10 +802,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) static const AVOption options[] = { V4L_M2M_DEFAULT_OPTS, { "num_capture_buffers", "Number of buffers in the capture context", @@ -49535,7 +52491,7 @@ index ab07c0a24a..3dd462362c 100644 #define M2MDEC_CLASS(NAME) \ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ .class_name = #NAME "_v4l2m2m_decoder", \ -@@ -246,9 +779,15 @@ static const AVOption options[] = { +@@ -246,9 +832,15 @@ static const AVOption options[] = { .init = v4l2_decode_init, \ .receive_frame = v4l2_receive_frame, \ .close = v4l2_decode_close, \ @@ -54292,6 +57248,114 @@ index 0000000000..bee4c50fac +extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2); + +#endif +diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c +index ea93e11588..a9e0c6323e 100644 +--- a/libavcodec/vc1dec.c ++++ b/libavcodec/vc1dec.c +@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) + size = next - start - 4; + if (size <= 0) + continue; +- buf2_size = vc1_unescape_buffer(start + 4, size, buf2); ++ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&gb, buf2, buf2_size * 8); + switch (AV_RB32(start)) { + case VC1_CODE_SEQHDR: +@@ -678,7 +678,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + case VC1_CODE_FRAME: + if (avctx->hwaccel) + buf_start = start; +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + break; + case VC1_CODE_FIELD: { + int buf_size3; +@@ -695,8 +695,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = avctx->coded_height + 31 >> 5; +@@ -707,7 +707,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + break; + } + case VC1_CODE_ENTRYPOINT: /* it should be before frame data */ +- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2); + init_get_bits(&s->gb, buf2, buf_size2 * 8); + ff_vc1_decode_entry_point(avctx, v, &s->gb); + break; +@@ -724,8 +724,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(start + 4, size, +- slices[n_slices].buf); ++ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, ++ slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9); +@@ -759,7 +759,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + ret = AVERROR(ENOMEM); + goto err; + } +- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); ++ buf_size3 = 
v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf); + init_get_bits(&slices[n_slices].gb, slices[n_slices].buf, + buf_size3 << 3); + slices[n_slices].mby_start = s->mb_height + 1 >> 1; +@@ -768,9 +768,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data, + n_slices1 = n_slices - 1; + n_slices++; + } +- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2); + } else { +- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2); ++ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2); + } + init_get_bits(&s->gb, buf2, buf_size2*8); + } else +diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c +index c25a6f3adf..10182786b3 100644 +--- a/libavcodec/vc1dsp.c ++++ b/libavcodec/vc1dsp.c +@@ -32,6 +32,7 @@ + #include "rnd_avg.h" + #include "vc1dsp.h" + #include "startcode.h" ++#include "vc1_common.h" + + /* Apply overlap transform to horizontal edge */ + static void vc1_v_overlap_c(uint8_t *src, int stride) +@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp) + #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ + + dsp->startcode_find_candidate = ff_startcode_find_candidate_c; ++ dsp->vc1_unescape_buffer = vc1_unescape_buffer; + + if (ARCH_AARCH64) + ff_vc1dsp_init_aarch64(dsp); +diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h +index 75db62b1b4..e192b431be 100644 +--- a/libavcodec/vc1dsp.h ++++ b/libavcodec/vc1dsp.h +@@ -80,6 +80,9 @@ typedef struct VC1DSPContext { + * one or more further zero bytes and a one byte. + */ + int (*startcode_find_candidate)(const uint8_t *buf, int size); ++ ++ /* Copy a buffer, removing startcode emulation escape bytes as we go */ ++ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst); + } VC1DSPContext; + + void ff_vc1dsp_init(VC1DSPContext* c); diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c new file mode 100644 index 0000000000..f234a985b9