From 54ea2ebba8c301f6bff7edb9527a8c2116c6d6e3 Mon Sep 17 00:00:00 2001
From: Matthias Reichl
Date: Tue, 7 Jul 2020 13:51:07 +0200
Subject: [PATCH 01/10] ffmpeg: drop support for legacy RPi MMAL acceleration

Signed-off-by: Matthias Reichl
---
 packages/multimedia/ffmpeg/package.mk | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index a783541ed8..edd8b4141f 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -71,14 +71,6 @@ else
   PKG_FFMPEG_DEBUG="--disable-debug --enable-stripping"
 fi
 
-if [ "${KODIPLAYER_DRIVER}" = "bcm2835-driver" ]; then
-  PKG_DEPENDS_TARGET+=" bcm2835-driver"
-  PKG_NEED_UNPACK+=" $(get_pkg_directory bcm2835-driver)"
-  PKG_PATCH_DIRS+=" rpi-hevc"
-  PKG_FFMPEG_LIBS="-lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm"
-  PKG_FFMPEG_RPI="--enable-rpi --enable-mmal"
-fi
-
 if target_has_feature neon; then
   PKG_FFMPEG_FPU="--enable-neon"
 else

From d5d98b3f7b41aa42a8e506b9632b390eb18ea49c Mon Sep 17 00:00:00 2001
From: Matthias Reichl
Date: Tue, 7 Jul 2020 13:17:25 +0200
Subject: [PATCH 02/10] ffmpeg: drop all local patches

Signed-off-by: Matthias Reichl
---
 packages/multimedia/ffmpeg/package.mk         |     4 -
 ...format-to-fix-an-issue-with-MMAL-ren.patch |    47 -
 ...l-unsupported-GMC-with-more-than-one.patch |    48 -
 ...g-99.1004-added_upstream_mvc_patches.patch |   269 -
 ...1d-fix-multithreaded-av1-sw-decoding.patch |    45 -
 .../ffmpeg-99.1010-yuv2rgb-logspam.patch      |    13 -
 ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 44880 ------
 ...0001-avutil-add-av_buffer_pool_flush.patch |    51 -
 ...002-Add-common-V4L2-request-API-code.patch |  1139 -
 ...3-Add-V4L2-request-API-mpeg2-hwaccel.patch |   244 -
 ...04-Add-V4L2-request-API-h264-hwaccel.patch |   547 -
 ...05-Add-V4L2-request-API-hevc-hwaccel.patch |   636 -
 ...006-Add-V4L2-request-API-vp8-hwaccel.patch |   282 -
 ...ate-linux-headers-for-V4L2-request-A.patch |   759 -
 ...ontext_drm-do-not-require-drm-device.patch |    26 -
 ...5.0009-avcodec-h264-parse-idr_pic_id.patch |    51 -
 ...se-ref_pic_marking_size_in_bits-and-.patch |    88 -
 ...gs-for-reference-usage-and-field-pic.patch |    57 -
 ...-WIP-v4l2-request-rolling-timestamps.patch |    69 -
 ...vc-Set-SPS-control-at-initialization.patch |   121 -
 .../v4l2-rpi/0000-revert-le-patches.patch     |    78 -
 .../v4l2-rpi/0001-popcornmix-kodi-gbm.patch   | 54411 ----------------
 ...2m-fix-indentation-and-add-M2MDEC_CL.patch |    73 -
 ...-v4l2m2m-output-AVDRMFrameDescriptor.patch |   511 -
 ...libavcodec-v4l2m2m-adjust-formatting.patch |   106 -
 ...2m-fix-error-handling-during-buffer-.patch |    84 -
 ...libavcodec-v4l2m2m-depends-on-libdrm.patch |    38 -
 ...2m-set-format_modifier-to-DRM_FORMAT.patch |    30 -
 ...2m-only-mmap-the-buffer-when-it-is-o.patch |    50 -
 ...2m-allow-using-software-pixel-format.patch |    42 -
 ...dec-v4l2m2m-fix-decoder-capabilities.patch |    26 -
 ...bavcodec-v4l2m2m-implement-hwcontext.patch |   101 -
 ...1-libavcodec-v4l2m2m-implement-flush.patch |    50 -
 ...0012-libavcodec-v4l2m2m-aspect-ratio.patch |    63 -
 .../0013-libavcodec-v4l2m2m-save-pkt.patch    |    62 -
 ...4l2m2m-only-use-a-few-output-buffers.patch |    25 -
 36 files changed, 105126 deletions(-)
 delete mode 100644 packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-yuv2rgb-logspam.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/rpi-hevc/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0001-avutil-add-av_buffer_pool_flush.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0002-Add-common-V4L2-request-API-code.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0003-Add-V4L2-request-API-mpeg2-hwaccel.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0004-Add-V4L2-request-API-h264-hwaccel.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0005-Add-V4L2-request-API-hevc-hwaccel.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0006-Add-V4L2-request-API-vp8-hwaccel.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0007-Add-and-use-private-linux-headers-for-V4L2-request-A.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0008-hwcontext_drm-do-not-require-drm-device.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0009-avcodec-h264-parse-idr_pic_id.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0010-avcodec-h264-parse-ref_pic_marking_size_in_bits-and-.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0011-HACK-add-dpb-flags-for-reference-usage-and-field-pic.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0012-WIP-v4l2-request-rolling-timestamps.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0013-v4l2-request-hevc-Set-SPS-control-at-initialization.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-rpi/0000-revert-le-patches.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2-rpi/0001-popcornmix-kodi-gbm.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0001-libavcodec-v4l2m2m-fix-indentation-and-add-M2MDEC_CL.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0002-libavcodec-v4l2m2m-output-AVDRMFrameDescriptor.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0003-libavcodec-v4l2m2m-adjust-formatting.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0004-libavcodec-v4l2m2m-fix-error-handling-during-buffer-.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0005-libavcodec-v4l2m2m-depends-on-libdrm.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0006-libavcodec-v4l2m2m-set-format_modifier-to-DRM_FORMAT.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0007-libavcodec-v4l2m2m-only-mmap-the-buffer-when-it-is-o.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0008-libavcodec-v4l2m2m-allow-using-software-pixel-format.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0009-libavcodec-v4l2m2m-fix-decoder-capabilities.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0010-libavcodec-v4l2m2m-implement-hwcontext.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0011-libavcodec-v4l2m2m-implement-flush.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0012-libavcodec-v4l2m2m-aspect-ratio.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0013-libavcodec-v4l2m2m-save-pkt.patch
 delete mode 100644 packages/multimedia/ffmpeg/patches/v4l2/0014-libavcodec-v4l2m2m-only-use-a-few-output-buffers.patch

diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index edd8b4141f..80127527c0 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -24,7 +24,6 @@ if [ "${V4L2_SUPPORT}" = "yes" ]; then
   PKG_FFMPEG_V4L2="--enable-v4l2_m2m --enable-libdrm"
 
   if [ "${PROJECT}" = "RPi" ]; then
-    PKG_PATCH_DIRS+=" v4l2-rpi"
     PKG_FFMPEG_RPI="--disable-rpi --disable-mmal"
     if [ "${DEVICE}" = "RPi4" ]; then
       PKG_DEPENDS_TARGET+=" systemd"
@@ -35,8 +34,6 @@ if [ "${V4L2_SUPPORT}" = "yes" ]; then
                        --disable-hwaccel=mpeg2_v4l2request \
                        --disable-hwaccel=vp8_v4l2request"
     fi
-  else
-    PKG_PATCH_DIRS+=" v4l2"
   fi
 else
   PKG_FFMPEG_V4L2="--disable-v4l2_m2m"
@@ -61,7 +58,6 @@ fi
 if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" ]; then
   PKG_DEPENDS_TARGET+=" libdrm systemd" # systemd is needed for libudev
   PKG_NEED_UNPACK+=" $(get_pkg_directory libdrm) $(get_pkg_directory systemd)"
-  PKG_PATCH_DIRS+=" v4l2-request-api"
   PKG_FFMPEG_V4L2_REQUEST="--enable-v4l2-request --enable-libudev --enable-libdrm"
 fi

diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
deleted file mode 100644
index 6e99dbce4a..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
+++ /dev/null
@@ -1,47 +0,0 @@
-From 85417a4ba42360248b4999e458a6e05c1c2f9b17 Mon Sep 17 00:00:00 2001
-From: popcornmix
-Date: Sat, 4 Mar 2017 19:24:02 +0000
-Subject: [PATCH] ffmpeg: Call get_format to fix an issue with MMAL rendering
-
----
- libavcodec/dvdec.c | 6 ++++++
- libavcodec/rv34.c  | 6 +++++-
- 2 files changed, 11 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
-index 89864f2edc..b5f9224d72 100644
---- a/libavcodec/dvdec.c
-+++ b/libavcodec/dvdec.c
-@@ -197,6 +197,12 @@ static av_cold int dvvideo_decode_init(AVCodecContext *avctx)
-     s->idct_put[0] = idsp.idct_put;
-     s->idct_put[1] = ff_simple_idct248_put;
- 
-+    static const enum AVPixelFormat pix_fmts[] = {
-+        AV_PIX_FMT_YUV420P,
-+        AV_PIX_FMT_NONE
-+    };
-+    avctx->pix_fmt = ff_get_format(avctx, pix_fmts);
-+
-     return ff_dvvideo_init(avctx);
- }
- 
-diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
-index d171e6e1bd..f6f0aa1f74 100644
---- a/libavcodec/rv34.c
-+++ b/libavcodec/rv34.c
-@@ -1499,7 +1499,11 @@ av_cold int ff_rv34_decode_init(AVCodecContext *avctx)
-     ff_mpv_decode_init(s, avctx);
-     s->out_format = FMT_H263;
- 
--    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-+    static const enum AVPixelFormat pix_fmts[] = {
-+        AV_PIX_FMT_YUV420P,
-+        AV_PIX_FMT_NONE
-+    };
-+    avctx->pix_fmt = ff_get_format(avctx, pix_fmts);
-     avctx->has_b_frames = 1;
-     s->low_delay = 0;
- 
---
-2.20.1
-
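The patch removed above existed so that the DV and RV30/40 decoders, which otherwise hard-code AV_PIX_FMT_YUV420P, would run the caller's get_format callback and give Kodi's MMAL render path a chance to hook in. For reference, a minimal caller-side callback of the kind that relied on this behaviour (illustrative sketch only, not code from the patch; pick_format is a made-up name):

    #include <libavcodec/avcodec.h>

    /* Walk the AV_PIX_FMT_NONE-terminated list the decoder offers and
     * take the first format we can render. */
    static enum AVPixelFormat pick_format(AVCodecContext *avctx,
                                          const enum AVPixelFormat *fmts)
    {
        for (const enum AVPixelFormat *p = fmts; *p != AV_PIX_FMT_NONE; p++)
            if (*p == AV_PIX_FMT_YUV420P) /* all dvdec/rv34 ever offer */
                return *p;
        return AV_PIX_FMT_NONE; /* nothing usable: open fails */
    }
    /* installed before avcodec_open2(): avctx->get_format = pick_format; */
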
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
deleted file mode 100644
index 8b6565f2c1..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
+++ /dev/null
@@ -1,48 +0,0 @@
-From 848de6c1923820f1de49ed7875d6c8877d0c321c Mon Sep 17 00:00:00 2001
-From: popcornmix
-Date: Fri, 5 Jun 2015 22:48:33 +0100
-Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp
- point
-
----
- libavcodec/avcodec.h       | 1 +
- libavcodec/mpeg4videodec.c | 4 ++++
- 2 files changed, 5 insertions(+)
-
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index a36b675fba..880284d5bd 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -2612,6 +2612,7 @@ typedef struct AVCodecContext {
- #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
- #define FF_BUG_TRUNCATED       16384
- #define FF_BUG_IEDGE           32768
-+#define FF_BUG_GMC_UNSUPPORTED (1<<30)
- 
- /**
-  * strictly follow the standard (MPEG-4, ...).
-diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
-index 055afabc7e..fa208660c8 100644
---- a/libavcodec/mpeg4videodec.c
-+++ b/libavcodec/mpeg4videodec.c
-@@ -2662,6 +2662,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
- 
-         if (ctx->divx_version >= 0)
-             s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
-+
-+        if (ctx->num_sprite_warping_points > 1)
-+            s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
-     }
- 
-     if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-@@ -2686,6 +2689,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
-            s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
-            ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
- 
-+    avctx->workaround_bugs = s->workaround_bugs;
-     if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
-         s->codec_id == AV_CODEC_ID_MPEG4 &&
-         avctx->idct_algo == FF_IDCT_AUTO) {
---
-2.20.1
-
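The FF_BUG_GMC_UNSUPPORTED bit deleted here only ever existed in this patched tree; the one change visible to callers was copying s->workaround_bugs back into avctx->workaround_bugs after decoding the headers. A player built against the patched headers could then react roughly like this (hedged sketch; fall_back_to_software() is a hypothetical helper, not an ffmpeg API):

    if (avctx->workaround_bugs & FF_BUG_GMC_UNSUPPORTED) {
        /* More than one sprite warp point: the MMAL hardware decoder
         * cannot decode this stream, so reopen with a software decoder. */
        fall_back_to_software(avctx);
    }
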
.name = "h264_mvc", -+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"), -+ .props = AV_CODEC_PROP_LOSSY, -+ }, - - /* various PCM "codecs" */ - { -diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c -index eaf0d68d32..fc1e152420 100644 ---- a/libavcodec/profiles.c -+++ b/libavcodec/profiles.c -@@ -71,6 +71,7 @@ const AVProfile ff_h264_profiles[] = { - { FF_PROFILE_H264_CAVLC_444, "CAVLC 4:4:4" }, - { FF_PROFILE_H264_MULTIVIEW_HIGH, "Multiview High" }, - { FF_PROFILE_H264_STEREO_HIGH, "Stereo High" }, -+ { FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH, "Multiview High Depth" }, - { FF_PROFILE_UNKNOWN }, - }; - -diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c -index 0415ceea02..264bf9718b 100644 ---- a/libavformat/mpegts.c -+++ b/libavformat/mpegts.c -@@ -798,7 +798,7 @@ static const StreamType ISO_types[] = { - #endif - { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, - { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC }, -- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, -+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC }, - { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 }, - { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, - { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, --- -2.20.1 - - -From 8d479b1b5395f97a8e5ee0eddab6680941edfb5b Mon Sep 17 00:00:00 2001 -From: Hendrik Leppkes -Date: Sat, 9 Jan 2016 16:34:40 +0100 -Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs - ---- - libavcodec/h264_parser.c | 34 ++++++++++++++++++++++++++++++---- - libavcodec/parsers.c | 1 + - 2 files changed, 31 insertions(+), 4 deletions(-) - -diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index 5f9a9c46ef..991e8e82f3 100644 ---- a/libavcodec/h264_parser.c -+++ b/libavcodec/h264_parser.c -@@ -62,6 +62,7 @@ typedef struct H264ParseContext { - int parse_last_mb; - int64_t reference_dts; - int last_frame_num, last_picture_structure; -+ int is_mvc; - } H264ParseContext; - - -@@ -109,14 +110,18 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, - } else if (state <= 5) { - int nalu_type = buf[i] & 0x1F; - if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS || -- nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) { -+ nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD || -+ nalu_type == H264_NAL_SUB_SPS) { - if (pc->frame_start_found) { - i++; - goto found; - } - } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA || -- nalu_type == H264_NAL_IDR_SLICE) { -+ nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_EXTEN_SLICE)) { - state += 8; -+ -+ if (nalu_type == H264_NAL_EXTEN_SLICE) -+ i += 3; // skip mvc extension - continue; - } - state = 7; -@@ -604,7 +609,8 @@ static int h264_parse(AVCodecParserContext *s, - } - } - -- parse_nal_units(s, avctx, buf, buf_size); -+ if (!p->is_mvc) -+ parse_nal_units(s, avctx, buf, buf_size); - - if (avctx->framerate.num) - avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1})); -@@ -661,7 +667,7 @@ static int h264_split(AVCodecContext *avctx, - if ((state & 0xFFFFFF00) != 0x100) - break; - nalu_type = state & 0x1F; -- if (nalu_type == H264_NAL_SPS) { -+ if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SUB_SPS) { - has_sps = 1; - } else if (nalu_type == H264_NAL_PPS) - has_pps = 1; -@@ -713,3 +719,23 @@ AVCodecParser ff_h264_parser = { - .parser_close = h264_close, - .split = h264_split, - }; -+ -+static av_cold int init_mvc(AVCodecParserContext *s) -+{ -+ H264ParseContext *p = s->priv_data; -+ int ret = init(s); -+ if 
(ret < 0) -+ return ret; -+ -+ p->is_mvc = 1; -+ return 0; -+} -+ -+AVCodecParser ff_h264_mvc_parser = { -+ .codec_ids = { AV_CODEC_ID_H264_MVC }, -+ .priv_data_size = sizeof(H264ParseContext), -+ .parser_init = init_mvc, -+ .parser_parse = h264_parse, -+ .parser_close = h264_close, -+ .split = h264_split, -+}; -diff --git a/libavcodec/parsers.c b/libavcodec/parsers.c -index 33a71de8a0..80b269b324 100644 ---- a/libavcodec/parsers.c -+++ b/libavcodec/parsers.c -@@ -47,6 +47,7 @@ extern AVCodecParser ff_gsm_parser; - extern AVCodecParser ff_h261_parser; - extern AVCodecParser ff_h263_parser; - extern AVCodecParser ff_h264_parser; -+extern AVCodecParser ff_h264_mvc_parser; - extern AVCodecParser ff_hevc_parser; - extern AVCodecParser ff_mjpeg_parser; - extern AVCodecParser ff_mlp_parser; --- -2.20.1 - - -From e2cad00490c9cd339b9266e8b9fe5d86afe2abe1 Mon Sep 17 00:00:00 2001 -From: Hendrik Leppkes -Date: Tue, 28 Nov 2017 16:12:12 +0000 -Subject: [PATCH 3/4] h264_parser: force grabing a new timestamp until a frame - start was found - ---- - libavcodec/h264_parser.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index 991e8e82f3..f573bd8629 100644 ---- a/libavcodec/h264_parser.c -+++ b/libavcodec/h264_parser.c -@@ -597,6 +597,9 @@ static int h264_parse(AVCodecParserContext *s, - } else { - next = h264_find_frame_end(p, buf, buf_size, avctx); - -+ if (next == END_NOT_FOUND && pc->frame_start_found == 0) -+ s->fetch_timestamp = 1; -+ - if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) { - *poutbuf = NULL; - *poutbuf_size = 0; --- -2.20.1 - - -From fc2954e5eba1791443016cdd17fcea280f464db5 Mon Sep 17 00:00:00 2001 -From: popcornmix -Date: Mon, 28 May 2018 13:35:36 +0100 -Subject: [PATCH 4/4] fixup - ---- - libavcodec/extract_extradata_bsf.c | 8 +++++--- - 1 file changed, 5 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/extract_extradata_bsf.c b/libavcodec/extract_extradata_bsf.c -index 85cf615ffa..b5f1657528 100644 ---- a/libavcodec/extract_extradata_bsf.c -+++ b/libavcodec/extract_extradata_bsf.c -@@ -138,7 +138,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt, - HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS, - }; - static const int extradata_nal_types_h264[] = { -- H264_NAL_SPS, H264_NAL_PPS, -+ H264_NAL_SPS, H264_NAL_SUB_SPS, H264_NAL_PPS, - }; - - ExtractExtradataContext *s = ctx->priv_data; -@@ -169,7 +169,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt, - if (nal->type == HEVC_NAL_SPS) has_sps = 1; - if (nal->type == HEVC_NAL_VPS) has_vps = 1; - } else { -- if (nal->type == H264_NAL_SPS) has_sps = 1; -+ if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SUB_SPS) has_sps = 1; - } - } else if (s->remove) { - filtered_size += nal->raw_size + 3; -@@ -178,7 +178,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt, - - if (extradata_size && - ((ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) || -- (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) { -+ ((ctx->par_in->codec_id == AV_CODEC_ID_H264 || ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC) && has_sps))) { - AVBufferRef *filtered_buf = NULL; - uint8_t *extradata, *filtered_data; - -@@ -334,6 +334,7 @@ static const struct { - { AV_CODEC_ID_AVS2, extract_extradata_mpeg4 }, - { AV_CODEC_ID_CAVS, extract_extradata_mpeg4 }, - { AV_CODEC_ID_H264, extract_extradata_h2645 }, -+ { AV_CODEC_ID_H264_MVC, extract_extradata_h2645 }, - { AV_CODEC_ID_HEVC, extract_extradata_h2645 }, - { 
AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12 }, - { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12 }, -@@ -401,6 +402,7 @@ static const enum AVCodecID codec_ids[] = { - AV_CODEC_ID_AVS2, - AV_CODEC_ID_CAVS, - AV_CODEC_ID_H264, -+ AV_CODEC_ID_H264_MVC, - AV_CODEC_ID_HEVC, - AV_CODEC_ID_MPEG1VIDEO, - AV_CODEC_ID_MPEG2VIDEO, --- -2.20.1 - diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch deleted file mode 100644 index 1060f54c50..0000000000 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch +++ /dev/null @@ -1,45 +0,0 @@ -commit 578b5ee8c0fe7d9ef09ef91ffcafc916f1d7d97b -Author: Lukas Rusak -Date: Wed Apr 10 13:39:21 2019 -0700 - - libavcodec/libdav1d: add libdav1d_get_format method to call ff_get_format - - This will allow applications to properly init the decoder in - cases where a hardware decoder is tried first and and software - decoder is tried after by calling the get_format callback. - - Even though there is no hardware pixel formats available - we still need to return the software pixel format. - - Tested with Kodi by checking if multithreaded software - decoding is properly activated. - -diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c -index 12c63245f8..1bbb83eda3 100644 ---- a/libavcodec/libdav1d.c -+++ b/libavcodec/libdav1d.c -@@ -53,6 +53,16 @@ static const enum AVPixelFormat pix_fmt_rgb[3] = { - AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, - }; - -+static enum AVPixelFormat libdav1d_get_format(AVCodecContext *avctx, const Dav1dPicture *p) -+{ -+ enum AVPixelFormat pix_fmts[2], *fmt = pix_fmts; -+ -+ *fmt++ = pix_fmt[p->p.layout][p->seq_hdr->hbd]; -+ *fmt = AV_PIX_FMT_NONE; -+ -+ return ff_get_format(avctx, pix_fmts); -+} -+ - static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl) - { - AVCodecContext *c = opaque; -@@ -229,6 +239,7 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame) - c->profile = p->seq_hdr->profile; - c->level = ((p->seq_hdr->operating_points[0].major_level - 2) << 2) - | p->seq_hdr->operating_points[0].minor_level; -+ frame->format = c->pix_fmt = libdav1d_get_format(c, p); - frame->width = p->p.w; - frame->height = p->p.h; - if (c->width != p->p.w || c->height != p->p.h) { diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-yuv2rgb-logspam.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-yuv2rgb-logspam.patch deleted file mode 100644 index 2895d7ac03..0000000000 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-yuv2rgb-logspam.patch +++ /dev/null @@ -1,13 +0,0 @@ ---- a/libswscale/yuv2rgb.c 2018-07-22 10:00:00.000000000 +0100 -+++ b/libswscale/yuv2rgb.c 2018-08-20 11:55:46.391543992 +0100 -@@ -687,10 +687,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsConte - if (t) - return t; - -- av_log(c, AV_LOG_WARNING, -- "No accelerated colorspace conversion found from %s to %s.\n", -- av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat)); -- - switch (c->dstFormat) { - case AV_PIX_FMT_BGR48BE: - case AV_PIX_FMT_BGR48LE: diff --git a/packages/multimedia/ffmpeg/patches/rpi-hevc/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/rpi-hevc/ffmpeg-99.1003-pfcd_hevc_optimisations.patch deleted file mode 100644 index ab41e33f2c..0000000000 --- a/packages/multimedia/ffmpeg/patches/rpi-hevc/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ 
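The dav1d hunk above shows the general shape of how a software-only decoder still routes through ff_get_format(): build a NONE-terminated candidate list containing just the software format and let the user callback see it. Reduced to its core (sketch under the assumption that pix_fmt[][] resolves layout and bit depth exactly as in the hunk):

    /* One software format plus the terminator -- enough for the user's
     * get_format callback to fire and (re)configure threading. */
    enum AVPixelFormat fmts[2] = {
        pix_fmt[p->p.layout][p->seq_hdr->hbd],
        AV_PIX_FMT_NONE
    };
    frame->format = c->pix_fmt = ff_get_format(c, fmts);
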
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-yuv2rgb-logspam.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-yuv2rgb-logspam.patch
deleted file mode 100644
index 2895d7ac03..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-yuv2rgb-logspam.patch
+++ /dev/null
@@ -1,13 +0,0 @@
---- a/libswscale/yuv2rgb.c	2018-07-22 10:00:00.000000000 +0100
-+++ b/libswscale/yuv2rgb.c	2018-08-20 11:55:46.391543992 +0100
-@@ -687,10 +687,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsConte
-     if (t)
-         return t;
- 
--    av_log(c, AV_LOG_WARNING,
--           "No accelerated colorspace conversion found from %s to %s.\n",
--           av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
--
-     switch (c->dstFormat) {
-     case AV_PIX_FMT_BGR48BE:
-     case AV_PIX_FMT_BGR48LE:

diff --git a/packages/multimedia/ffmpeg/patches/rpi-hevc/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/rpi-hevc/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
deleted file mode 100644
index ab41e33f2c..0000000000
--- a/packages/multimedia/ffmpeg/patches/rpi-hevc/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
+++ /dev/null
@@ -1,44880 +0,0 @@
-From 27e3340bc0e488abde669f91dfae5d5bf535ad02 Mon Sep 17 00:00:00 2001
-From: popcornmix
-Date: Fri, 31 Jan 2020 11:27:22 +0000
-Subject: [PATCH] pi: hevc acclerated support by pfcd/jc
-
----
- .gitignore                                    |     2 +
- configure                                     |     7 +
- fftools/ffmpeg.c                              |   291 +-
- fftools/ffmpeg_filter.c                       |     4 +-
- fftools/ffmpeg_opt.c                          |     8 +
- libavcodec/Makefile                           |    37 +
- libavcodec/allcodecs.c                        |    36 +
- libavcodec/arm/Makefile                       |    16 +
- libavcodec/arm/cabac.h                        |   262 +-
- libavcodec/arm/rpi_hevc_cabac.h               |   605 ++
- libavcodec/arm/rpi_hevc_idct_fn_neon.S        |   161 +
- libavcodec/arm/rpi_hevc_misc_neon.S           |   238 +
- libavcodec/arm/rpi_hevc_misc_neon.h           |   438 ++
- libavcodec/arm/rpi_hevc_mv_arm.h              |    64 +
- libavcodec/arm/rpi_hevcdsp_arm.h              |    26 +
- libavcodec/arm/rpi_hevcdsp_deblock_neon.S     |  1633 +++++
- libavcodec/arm/rpi_hevcdsp_idct_neon.S        |   183 +
- libavcodec/arm/rpi_hevcdsp_init_arm.c         |    32 +
- libavcodec/arm/rpi_hevcdsp_init_neon.c        |   467 ++
- libavcodec/arm/rpi_hevcdsp_res16_neon.S       |   591 ++
- libavcodec/arm/rpi_hevcdsp_res8_neon.S        |   712 ++
- libavcodec/arm/rpi_hevcdsp_sao_neon.S         |  2245 ++++++
- libavcodec/arm/rpi_hevcpred_arm.h             |    28 +
- libavcodec/arm/rpi_hevcpred_init_arm.c        |    35 +
- libavcodec/arm/rpi_hevcpred_init_neon.c       |   210 +
- .../arm/rpi_hevcpred_intra_angular_neon.S     |  2975 ++++++++
- libavcodec/arm/rpi_hevcpred_intra_dc_neon.S   |   695 ++
- .../arm/rpi_hevcpred_intra_filter_neon.S      |   872 +++
- libavcodec/arm/rpi_hevcpred_intra_hv_neon.S   |   911 +++
- .../arm/rpi_hevcpred_intra_planar_neon.S      |  1034 +++
- libavcodec/avcodec.h                          |    19 +-
- libavcodec/cabac.h                            |     9 +-
- libavcodec/mmaldec.c                          |     4 +
- libavcodec/raw.c                              |     4 +
- libavcodec/rawenc.c                           |    62 +
- libavcodec/rpi_hevc_cabac.c                   |  2255 ++++++
- libavcodec/rpi_hevc_cabac_fns.h               |   191 +
- libavcodec/rpi_hevc_data.c                    |    75 +
- libavcodec/rpi_hevc_data.h                    |    31 +
- libavcodec/rpi_hevc_filter.c                  |  1206 ++++
- libavcodec/rpi_hevc_mv.h                      |    71 +
- libavcodec/rpi_hevc_mvs.c                     |   486 ++
- libavcodec/rpi_hevc_parse.c                   |   142 +
- libavcodec/rpi_hevc_parse.h                   |    36 +
- libavcodec/rpi_hevc_ps.c                      |  1936 ++++++
- libavcodec/rpi_hevc_ps.h                      |   447 ++
- libavcodec/rpi_hevc_refs.c                    |   485 ++
- libavcodec/rpi_hevc_sei.c                     |   368 +
- libavcodec/rpi_hevc_sei.h                     |   135 +
- libavcodec/rpi_hevc_shader.c                  |  1537 +++++
- libavcodec/rpi_hevc_shader.h                  |    63 +
- libavcodec/rpi_hevc_shader.qasm               |  1821 +++++
- libavcodec/rpi_hevc_shader_cmd.h              |   128 +
- libavcodec/rpi_hevc_shader_template.c         |    61 +
- libavcodec/rpi_hevc_shader_template.h         |    22 +
- libavcodec/rpi_hevc_shader_template_fn.h      |   475 ++
- libavcodec/rpi_hevc_transform.s               |   444 ++
- libavcodec/rpi_hevc_transform10.h             |    94 +
- libavcodec/rpi_hevc_transform8.h              |    94 +
- libavcodec/rpi_hevcdec.c                      |  6016 +++++++++++++++++
- libavcodec/rpi_hevcdec.h                      |  1087 +++
- libavcodec/rpi_hevcdsp.c                      |   450 ++
- libavcodec/rpi_hevcdsp.h                      |   177 +
- libavcodec/rpi_hevcdsp_template.c             |  2278 +++++++
- libavcodec/rpi_hevcpred.c                     |   166 +
- libavcodec/rpi_hevcpred.h                     |   121 +
- libavcodec/rpi_hevcpred_template.c            |  1522 +++++
- libavcodec/rpi_mailbox.c                      |   107 +
- libavcodec/rpi_mailbox.h                      |    55 +
- libavcodec/rpi_qpu.c                          |   957 +++
- libavcodec/rpi_qpu.h                          |   229 +
- libavcodec/rpi_zc.c                           |   741 ++
- libavcodec/rpi_zc.h                           |   105 +
- libavfilter/Makefile                          |     1 +
- libavfilter/allfilters.c                      |     1 +
- libavfilter/avfiltergraph.c                   |    86 +-
- libavfilter/buffersrc.c                       |     2 +-
- libavfilter/vf_unsand.c                       |   232 +
- libavformat/utils.c                           |    65 +-
- libavutil/Makefile                            |     1 +
- libavutil/arm/Makefile                        |     1 +
- libavutil/arm/rpi_sand_neon.S                 |    40 +
- libavutil/buffer.c                            |     6 +
- libavutil/buffer.h                            |     3 +
- libavutil/frame.c                             |    11 +
- libavutil/frame.h                             |    10 +
- libavutil/pixdesc.c                           |    24 +
- libavutil/pixfmt.h                            |     4 +
- libavutil/rpi_sand_fn_pw.h                    |   182 +
- libavutil/rpi_sand_fns.c                      |   151 +
- libavutil/rpi_sand_fns.h                      |   136 +
- pi-util/BUILD.txt                             |    25 +
- pi-util/conf_h265.2016.csv                    |   195 +
- pi-util/conf_h265.2016_HEVC_v1.csv            |   147 +
- pi-util/conf_h265.csv                         |   144 +
- pi-util/conf_pi1.sh                           |    30 +
- pi-util/conf_pi2.sh                           |    32 +
- pi-util/ffconf.py                             |   175 +
- pi-util/ffperf.py                             |   125 +
- pi-util/make_array.py                         |    23 +
- pi-util/perfcmp.py                            |   101 +
- pi-util/qem.sh                                |     9 +
- pi-util/v3dusage.py                           |   128 +
- 103 files changed, 43525 insertions(+), 95 deletions(-)
- create mode 100644 libavcodec/arm/rpi_hevc_cabac.h
- create mode 100644 libavcodec/arm/rpi_hevc_idct_fn_neon.S
- create mode 100644 libavcodec/arm/rpi_hevc_misc_neon.S
- create mode 100644 libavcodec/arm/rpi_hevc_misc_neon.h
- create mode 100644 libavcodec/arm/rpi_hevc_mv_arm.h
- create mode 100644 libavcodec/arm/rpi_hevcdsp_arm.h
- create mode 100644 libavcodec/arm/rpi_hevcdsp_deblock_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcdsp_idct_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcdsp_init_arm.c
- create mode 100644 libavcodec/arm/rpi_hevcdsp_init_neon.c
- create mode 100644 libavcodec/arm/rpi_hevcdsp_res16_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcdsp_res8_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcdsp_sao_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcpred_arm.h
- create mode 100644 libavcodec/arm/rpi_hevcpred_init_arm.c
- create mode 100644 libavcodec/arm/rpi_hevcpred_init_neon.c
- create mode 100644 libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
- create mode 100644 libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
- create mode 100644 libavcodec/rpi_hevc_cabac.c
- create mode 100644 libavcodec/rpi_hevc_cabac_fns.h
- create mode 100644 libavcodec/rpi_hevc_data.c
- create mode 100644 libavcodec/rpi_hevc_data.h
- create mode 100644 libavcodec/rpi_hevc_filter.c
- create mode 100644 libavcodec/rpi_hevc_mv.h
- create mode 100644 libavcodec/rpi_hevc_mvs.c
- create mode 100644 libavcodec/rpi_hevc_parse.c
- create mode 100644 libavcodec/rpi_hevc_parse.h
- create mode 100644 libavcodec/rpi_hevc_ps.c
- create mode 100644 libavcodec/rpi_hevc_ps.h
- create mode 100644 libavcodec/rpi_hevc_refs.c
- create mode 100644 libavcodec/rpi_hevc_sei.c
- create mode 100644 libavcodec/rpi_hevc_sei.h
- create mode 100644 libavcodec/rpi_hevc_shader.c
- create mode 100644 libavcodec/rpi_hevc_shader.h
- create mode 100644 libavcodec/rpi_hevc_shader.qasm
- create mode 100644 libavcodec/rpi_hevc_shader_cmd.h
- create mode 100644 libavcodec/rpi_hevc_shader_template.c
- create mode 100644 libavcodec/rpi_hevc_shader_template.h
- create mode 100644 libavcodec/rpi_hevc_shader_template_fn.h
- create mode 100644 libavcodec/rpi_hevc_transform.s
- create mode 100644 libavcodec/rpi_hevc_transform10.h
- create mode 100644 libavcodec/rpi_hevc_transform8.h
- create mode 100644 libavcodec/rpi_hevcdec.c
- create mode 100644 libavcodec/rpi_hevcdec.h
- create mode 100644 libavcodec/rpi_hevcdsp.c
- create mode 100644 libavcodec/rpi_hevcdsp.h
- create mode 100644 libavcodec/rpi_hevcdsp_template.c
- create mode 100644 libavcodec/rpi_hevcpred.c
- create mode 100644 libavcodec/rpi_hevcpred.h
- create mode 100644 libavcodec/rpi_hevcpred_template.c
- create mode 100644 libavcodec/rpi_mailbox.c
- create mode 100644 libavcodec/rpi_mailbox.h
- create mode 100644 libavcodec/rpi_qpu.c
- create mode 100644 libavcodec/rpi_qpu.h
- create mode 100644 libavcodec/rpi_zc.c
- create mode 100644 libavcodec/rpi_zc.h
- create mode 100644 libavfilter/vf_unsand.c
- create mode 100644 libavutil/arm/rpi_sand_neon.S
- create mode 100644 libavutil/rpi_sand_fn_pw.h
- create mode 100644 libavutil/rpi_sand_fns.c
- create mode 100644 libavutil/rpi_sand_fns.h
- create mode 100644 pi-util/BUILD.txt
- create mode 100644 pi-util/conf_h265.2016.csv
- create mode 100644 pi-util/conf_h265.2016_HEVC_v1.csv
- create mode 100644 pi-util/conf_h265.csv
- create mode 100755 pi-util/conf_pi1.sh
- create mode 100755 pi-util/conf_pi2.sh
- create mode 100755 pi-util/ffconf.py
- create mode 100755 pi-util/ffperf.py
- create mode 100755 pi-util/make_array.py
- create mode 100644 pi-util/perfcmp.py
- create mode 100755 pi-util/qem.sh
- create mode 100755 pi-util/v3dusage.py
-
-diff --git a/.gitignore b/.gitignore
-index 2450ee8fc5..4bcc3ae643 100644
---- a/.gitignore
-+++ b/.gitignore
-@@ -1,6 +1,7 @@
- *.a
- *.o
- *.o.*
-+*.bin
- *.d
- *.def
- *.dll
-@@ -26,6 +27,7 @@
- .\#*
- /.config
- /.version
-+/build/
- /ffmpeg
- /ffplay
- /ffprobe
-diff --git a/configure b/configure
-index 34c2adb4a4..531c7e754f 100755
---- a/configure
-+++ b/configure
-@@ -331,6 +331,7 @@ External library support:
-   --enable-libmfx          enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
-   --enable-libnpp          enable Nvidia Performance Primitives-based code [no]
-   --enable-mmal            enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
-+  --enable-rpi             enable other rpi specific stuff [no]
-   --disable-nvdec          disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
-   --disable-nvenc          disable Nvidia video encoding code [autodetect]
-   --enable-omx             enable OpenMAX IL code [no]
-@@ -1866,6 +1867,7 @@ FEATURE_LIST="
-     gray
-     hardcoded_tables
-     omx_rpi
-+    rpi
-     runtime_cpudetect
-     safe_bitstream_reader
-     shared
-@@ -2390,6 +2392,7 @@ CONFIG_EXTRA="
-     rtpdec
-     rtpenc_chain
-     rv34dsp
-+    sand
-     scene_sad
-     sinewin
-     snappy
-@@ -2715,6 +2718,8 @@ hap_decoder_select="snappy texturedsp"
- hap_encoder_deps="libsnappy"
- hap_encoder_select="texturedspenc"
- hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp"
-+hevc_rpi_decoder_deps="rpi"
-+hevc_rpi_decoder_select="hevc_decoder sand"
- huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
- huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
- hymt_decoder_select="huffyuv_decoder"
-@@ -3543,6 +3548,8 @@ tonemap_filter_deps="const_nan"
- tonemap_opencl_filter_deps="opencl const_nan"
- transpose_opencl_filter_deps="opencl"
- transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
-+unsand_filter_deps="rpi"
-+unsand_filter_select="sand"
- unsharp_opencl_filter_deps="opencl"
- uspp_filter_deps="gpl avcodec"
- vaguedenoiser_filter_deps="gpl"
-diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
-index 01f04103cf..80dc56d0c6 100644
---- a/fftools/ffmpeg.c
-+++ b/fftools/ffmpeg.c
-@@ -24,6 +24,12 @@
-  */
- 
- #include "config.h"
-+
-+#if CONFIG_RPI
-+#define RPI_DISPLAY
-+#define RPI_DISPLAY_ALL 0
-+#endif
-+
- #include
- #include
- #include
-@@ -70,6 +76,25 @@
- # include "libavfilter/buffersrc.h"
- # include "libavfilter/buffersink.h"
- 
-+#ifdef RPI_DISPLAY
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#pragma GCC diagnostic pop
-+#include "libavcodec/rpi_qpu.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "libavcodec/rpi_zc.h"
-+#endif
-+
- #if HAVE_SYS_RESOURCE_H
- #include
- #include
-@@ -168,6 +193,241 @@ static int restore_tty;
- static void free_input_threads(void);
- #endif
- 
-+#ifdef RPI_DISPLAY
-+
-+#define NUM_BUFFERS 4
-+
-+
-+typedef struct rpi_display_env_s
-+{
-+    MMAL_COMPONENT_T* display;
-+    MMAL_COMPONENT_T* isp;
-+    MMAL_PORT_T * port_in;  // Input port of either isp or display depending on pipe setup
-+    MMAL_CONNECTION_T * conn;
-+
-+    MMAL_POOL_T *rpi_pool;
-+    volatile int rpi_display_count;
-+    enum AVPixelFormat avfmt;
-+} rpi_display_env_t;
-+
-+static rpi_display_env_t * rpi_display_env = NULL;
-+
-+
-+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port)
-+{
-+    MMAL_POOL_T* pool;
-+    mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE);  // Does this mark that the buffer contains a vc_handle?  Would have expected a vc_image?
-+    pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);
-+    assert(pool);
-+
-+    return pool;
-+}
-+
-+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
-+    rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata;
-+    av_rpi_zc_unref(buffer->user_data);
-+    atomic_fetch_add(&de->rpi_display_count, -1);
-+    mmal_buffer_header_release(buffer);
-+}
-+
-+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
-+    mmal_buffer_header_release(buffer);
-+}
-+
-+#define DISPLAY_PORT_DEPTH 4
-+
-+static rpi_display_env_t *
-+display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h)
-+{
-+    MMAL_STATUS_T err;
-+    MMAL_DISPLAYREGION_T region =
-+    {
-+        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
-+        .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
-+        .layer = 2,
-+        .fullscreen = 0,
-+        .dest_rect = {x, y, w, h}
-+    };
-+#if RPI_ZC_SAND_8_IN_10_BUF
-+    const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt;
-+#else
-+    const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt;
-+#endif
-+    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h);
-+    rpi_display_env_t * de;
-+    int isp_req = (fmt == AV_PIX_FMT_SAND64_10);
-+
-+    bcm_host_init();  // Needs to be done by someone...
-+
-+    if ((de = av_mallocz(sizeof(*de))) == NULL) {
-+        return NULL;
-+    }
-+
-+    mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display);
-+    av_assert0(de->display);
-+    de->port_in = de->display->input[0];
-+
-+    if (isp_req)
-+    {
-+        mmal_component_create("vc.ril.isp", &de->isp);
-+        de->port_in = de->isp->input[0];
-+    }
-+
-+    mmal_port_parameter_set(de->display->input[0], &region.hdr);
-+
-+    {
-+        MMAL_PORT_T * const port = de->port_in;
-+        MMAL_ES_FORMAT_T* const format = port->format;
-+        port->userdata = (struct MMAL_PORT_USERDATA_T *)de;
-+        port->buffer_num = DISPLAY_PORT_DEPTH;
-+        format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 :
-+            fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 :
-+                MMAL_ENCODING_I420;
-+        format->es->video.width = geo.stride_y;
-+        format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ?
-+            (h + 15) & ~15 : geo.height_y;  // Magic
-+        format->es->video.crop.x = 0;
-+        format->es->video.crop.y = 0;
-+        format->es->video.crop.width = w;
-+        format->es->video.crop.height = h;
-+        mmal_port_format_commit(port);
-+    }
-+
-+    de->rpi_pool = display_alloc_pool(de->port_in);
-+    mmal_port_enable(de->port_in,display_cb_input);
-+
-+    if (isp_req) {
-+        MMAL_PORT_T * const port_out = de->isp->output[0];
-+        mmal_log_dump_port(de->port_in);
-+        mmal_format_copy(port_out->format, de->port_in->format);
-+        if (fmt == AV_PIX_FMT_SAND64_10) {
-+            if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS ||
-+                (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS)
-+            {
-+                av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n");
-+            }
-+            else
-+                av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n");
-+
-+        }
-+        port_out->format->encoding = MMAL_ENCODING_I420;
-+        mmal_log_dump_port(port_out);
-+        if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS)
-+        {
-+            av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n");
-+            goto fail;
-+        }
-+        if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) {
-+            av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n");
-+            goto fail;
-+        }
-+        if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) {
-+            av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n");
-+            goto fail;
-+        }
-+        mmal_port_enable(de->isp->control,display_cb_control);
-+        mmal_component_enable(de->isp);
-+    }
-+
-+    mmal_component_enable(de->display);
-+    mmal_port_enable(de->display->control,display_cb_control);
-+    de->avfmt = fmt;
-+
-+    printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt);
-+
-+    return de;
-+
-+fail:
-+    // **** Free stuff
-+    return NULL;
-+}
-+
-+static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
-+{
-+    MMAL_BUFFER_HEADER_T* buf;
-+
-+    if (de == NULL)
-+        return;
-+
-+    if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
-+        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
-+        return;
-+    }
-+
-+    buf = mmal_queue_get(de->rpi_pool->queue);
-+    if (!buf) {
-+        // Running too fast so drop the frame
-+        printf("Q alloc failure\n");
-+        return;
-+    }
-+    assert(buf);
-+    buf->cmd = 0;
-+    buf->offset = 0;  // Offset to valid data
-+    buf->flags = 0;
-+    {
-+        const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1);
-+        if (fr_buf == NULL) {
-+            mmal_buffer_header_release(buf);
-+            return;
-+        }
-+
-+        buf->user_data = fr_buf;
-+        buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf);  // Cast our handle to a pointer for mmal
-+        buf->offset = av_rpi_zc_offset(fr_buf);
-+        buf->length = av_rpi_zc_length(fr_buf);
-+        buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
-+        atomic_fetch_add(&de->rpi_display_count, 1);
-+    }
-+#if RPI_DISPLAY_ALL
-+    while (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
-+        usleep(5000);
-+    }
-+#endif
-+
-+    if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
-+    {
-+        av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count);
-+        display_cb_input(de->port_in, buf);
-+    }
-+}
-+
-+static void display_exit(rpi_display_env_t ** const pde)
-+{
-+    rpi_display_env_t * const de = *pde;
-+    *pde = NULL;
-+
-+    if (de != NULL) {
-+//        sleep(120);
-+
-+        if (de->port_in != NULL) {
-+            mmal_port_disable(de->port_in);
-+        }
-+
-+        // The above disable should kick out all buffers - check that
-+        if (atomic_load(&de->rpi_display_count) != 0) {
-+            av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count));
-+        }
-+
-+        if (de->conn != NULL) {
-+            mmal_connection_destroy(de->conn);
-+        }
-+        if (de->rpi_pool != NULL) {
-+            mmal_port_pool_destroy(de->display->input[0], de->rpi_pool);
-+        }
-+        if (de->isp != NULL) {
-+            mmal_component_destroy(de->isp);
-+        }
-+        if (de->display != NULL) {
-+            mmal_component_destroy(de->display);
-+        }
-+
-+        av_free(de);
-+    }
-+}
-+
-+#endif
-+
-+
- /* sub2video hack:
-    Convert subtitles to video with alpha to insert them in filter graphs.
-    This is a temporary solution until libavfilter gets real subtitles support.
-@@ -589,6 +849,11 @@ static void ffmpeg_cleanup(int ret)
-         avformat_close_input(&input_files[i]->ctx);
-         av_freep(&input_files[i]);
-     }
-+
-+#ifdef RPI_DISPLAY
-+    display_exit(&rpi_display_env);
-+#endif
-+
-     for (i = 0; i < nb_input_streams; i++) {
-         InputStream *ist = input_streams[i];
- 
-@@ -600,7 +865,9 @@ static void ffmpeg_cleanup(int ret)
-         av_freep(&ist->filters);
-         av_freep(&ist->hwaccel_device);
-         av_freep(&ist->dts_buffer);
--
-+#ifdef RPI_DISPLAY
-+        av_rpi_zc_uninit(ist->dec_ctx);
-+#endif
-         avcodec_free_context(&ist->dec_ctx);
- 
-         av_freep(&input_streams[i]);
-@@ -631,6 +898,7 @@ static void ffmpeg_cleanup(int ret)
-     }
-     term_exit();
-     ffmpeg_exited = 1;
-+
- }
- 
- void remove_avoptions(AVDictionary **a, AVDictionary *b)
-@@ -1070,6 +1338,17 @@ static void do_video_out(OutputFile *of,
-     if (ost->source_index >= 0)
-         ist = input_streams[ost->source_index];
- 
-+#ifdef RPI_DISPLAY
-+    if (next_picture && ist != NULL)
-+    {
-+        if (rpi_display_env == NULL)
-+            rpi_display_env = display_init(next_picture->format, 0, 0,
-+                                           next_picture->width - next_picture->crop_right,
-+                                           next_picture->height - next_picture->crop_bottom);
-+        display_frame(ist->dec_ctx, rpi_display_env, next_picture);
-+    }
-+#endif
-+
-     frame_rate = av_buffersink_get_frame_rate(filter);
-     if (frame_rate.num > 0 && frame_rate.den > 0)
-         duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base));
-@@ -2141,8 +2420,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame)
-                        ifilter->channel_layout != frame->channel_layout;
-         break;
-     case AVMEDIA_TYPE_VIDEO:
--        need_reinit |= ifilter->width  != frame->width ||
--                       ifilter->height != frame->height;
-+        need_reinit |= ifilter->width  != av_frame_cropped_width(frame) ||
-+                       ifilter->height != av_frame_cropped_height(frame);
-         break;
-     }
- 
-@@ -2906,6 +3185,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
-         ist->dec_ctx->opaque                = ist;
-         ist->dec_ctx->get_format            = get_format;
-         ist->dec_ctx->get_buffer2           = get_buffer;
-+
-+#ifdef RPI_DISPLAY
-+        // Overrides the above get_buffer2
-+        av_rpi_zc_init(ist->dec_ctx);
-+#endif
-+
-         ist->dec_ctx->thread_safe_callbacks = 1;
- 
-         av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
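av_frame_cropped_width()/av_frame_cropped_height(), used in the two hunks above, are helpers this patchset provides; assuming they simply mirror the public AVFrame crop fields, they amount to the following (sketch, not the patch's own code):

    #include <libavutil/frame.h>

    /* Visible picture size after applying the container/codec crop. */
    static int cropped_width(const AVFrame *f)
    {
        return f->width - (int)(f->crop_left + f->crop_right);
    }
    static int cropped_height(const AVFrame *f)
    {
        return f->height - (int)(f->crop_top + f->crop_bottom);
    }
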
-diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
-index 72838de1e2..6922cedc5d 100644
---- a/fftools/ffmpeg_filter.c
-+++ b/fftools/ffmpeg_filter.c
-@@ -1188,8 +1188,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame)
- 
-     ifilter->format = frame->format;
- 
--    ifilter->width               = frame->width;
--    ifilter->height              = frame->height;
-+    ifilter->width               = av_frame_cropped_width(frame);
-+    ifilter->height              = av_frame_cropped_height(frame);
-     ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
- 
-     ifilter->sample_rate         = frame->sample_rate;
-diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
-index f5ca18aa64..483bb86b54 100644
---- a/fftools/ffmpeg_opt.c
-+++ b/fftools/ffmpeg_opt.c
-@@ -698,11 +698,19 @@ static AVCodec *choose_decoder(OptionsContext *o, AVFormatContext *s, AVStream *
- 
-     MATCH_PER_STREAM_OPT(codec_names, str, codec_name, s, st);
-     if (codec_name) {
-+        if (strcmp("hevc_rpi", codec_name) == 0) {
-+            return avcodec_find_decoder_by_id_and_fmt(AV_CODEC_ID_HEVC, st->codecpar->format);
-+        }
-         AVCodec *codec = find_codec_or_die(codec_name, st->codecpar->codec_type, 0);
-         st->codecpar->codec_id = codec->id;
-         return codec;
-     } else
-+    {
-+        if (st->codecpar->codec_id == AV_CODEC_ID_HEVC) {
-+            return avcodec_find_decoder_by_id_and_fmt(st->codecpar->codec_id, st->codecpar->format);
-+        }
-         return avcodec_find_decoder(st->codecpar->codec_id);
-+    }
- }
- 
- /* Add all the streams from the given input file to the global
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index 3cd73fbcc6..aa15d411cd 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -6,6 +6,7 @@ HEADERS = ac3_parser.h \
-           avcodec.h \
-           avdct.h \
-           avfft.h \
-+          rpi_zc.h \
-           d3d11va.h \
-           dirac.h \
-           dv_profile.h \
-@@ -132,6 +133,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o
- OBJS-$(CONFIG_QSVENC) += qsvenc.o
- OBJS-$(CONFIG_RANGECODER) += rangecoder.o
- OBJS-$(CONFIG_RDFT) += rdft.o
-+OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o
- OBJS-$(CONFIG_RV34DSP) += rv34dsp.o
- OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o
- OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o
-@@ -368,6 +370,13 @@ OBJS-$(CONFIG_HCOM_DECODER) += hcom.o
- OBJS-$(CONFIG_HEVC_DECODER) += hevcdec.o hevc_mvs.o \
-                                hevc_cabac.o hevc_refs.o hevcpred.o \
-                                hevcdsp.o hevc_filter.o hevc_data.o
-+OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \
-+                                   rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \
-+                                   rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \
-+                                   rpi_hevc_shader.o rpi_hevc_shader_template.o \
-+                                   rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \
-+                                   rpi_hevc_sei.o rpi_hevc_data.o
-+OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuvid.o
- OBJS-$(CONFIG_HEVC_AMF_ENCODER) += amfenc_hevc.o
- OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuviddec.o
- OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o
-@@ -1223,3 +1232,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
- $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
- $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
- endif
-+
-+ifdef CONFIG_HEVC_RPI_DECODER
-+QASM_PY := ../local/bin/qasm.py
-+VASMVIDCORE := ../local/bin/vasmvidcore_std
-+
-+ifneq ("$(wildcard $(QASM_PY))","")
-+$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
-+	$(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
-+
-+$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
-+	$(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
-+endif
-+
-+ifneq ("$(wildcard $(VASMVIDCORE))","")
-+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
-+	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
-+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
-+	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
-+
-+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
-+	python pi-util/make_array.py $<
-+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
-+	python pi-util/make_array.py $<
-+endif
-+
-+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
-+$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h
-+endif
-diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
-index d2f9a39ce5..cc87600a2a 100644
---- a/libavcodec/allcodecs.c
-+++ b/libavcodec/allcodecs.c
-@@ -145,6 +145,7 @@ extern AVCodec ff_h264_qsv_decoder;
- extern AVCodec ff_h264_rkmpp_decoder;
- extern AVCodec ff_hap_encoder;
- extern AVCodec ff_hap_decoder;
-+extern AVCodec ff_hevc_rpi_decoder;
- extern AVCodec ff_hevc_decoder;
- extern AVCodec ff_hevc_qsv_decoder;
- extern AVCodec ff_hevc_rkmpp_decoder;
-@@ -861,6 +862,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
-     }
- }
- 
-+static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
-+{
-+    const enum AVPixelFormat *pf = p->pix_fmts;
-+
-+    // Assume good if we lack info
-+    if (pf == NULL)
-+        return 1;
-+    if (fmt == AV_PIX_FMT_NONE)
-+        return 0;
-+
-+    for (; *pf != AV_PIX_FMT_NONE; ++pf) {
-+        if (*pf == fmt)
-+            return 1;
-+    }
-+    return 0;
-+}
-+
-+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
-+{
-+    const AVCodec *p, *experimental = NULL;
-+    void *i = 0;
-+
-+    id= remap_deprecated_codec_id(id);
-+    while ((p = av_codec_iterate(&i))) {
-+        if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
-+            if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
-+                experimental = p;
-+            } else
-+                return (AVCodec *)p;
-+        }
-+        p = p->next;
-+    }
-+    return (AVCodec *)experimental;
-+}
-+
- static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
- {
-     const AVCodec *p, *experimental = NULL;
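avcodec_find_decoder_by_id_and_fmt(), added in the hunk above, is what the ffmpeg_opt.c change earlier in this patch calls from choose_decoder(); a typical call site looks like this (usage sketch, only valid in this patched tree):

    /* Prefer a decoder that actually supports the stream's pixel format
     * (e.g. hevc_rpi for sand formats), then fall back to the normal lookup. */
    AVCodec *dec = avcodec_find_decoder_by_id_and_fmt(st->codecpar->codec_id,
                                                      st->codecpar->format);
    if (!dec)
        dec = avcodec_find_decoder(st->codecpar->codec_id);
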
-diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-index e656011c3c..f8801dfab6 100644
---- a/libavcodec/arm/Makefile
-+++ b/libavcodec/arm/Makefile
-@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
-                               arm/sbrdsp_init_arm.o
- OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
- OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
-+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \
-+                                   arm/rpi_hevcpred_init_arm.o
- OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
- OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
- OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
-@@ -136,10 +138,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
- NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
- NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
- NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
-+                                    arm/hevcdsp_idct_neon.o \
-                                     arm/hevcdsp_deblock_neon.o \
-                                     arm/hevcdsp_idct_neon.o \
-                                     arm/hevcdsp_qpel_neon.o \
-                                     arm/hevcdsp_sao_neon.o
-+NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \
-+                                        arm/rpi_hevc_misc_neon.o \
-+                                        arm/rpi_hevcdsp_deblock_neon.o \
-+                                        arm/rpi_hevcdsp_idct_neon.o \
-+                                        arm/rpi_hevcdsp_res8_neon.o \
-+                                        arm/rpi_hevcdsp_res16_neon.o \
-+                                        arm/rpi_hevcdsp_sao_neon.o \
-+                                        arm/rpi_hevcpred_init_neon.o \
-+                                        arm/rpi_hevcpred_intra_angular_neon.o \
-+                                        arm/rpi_hevcpred_intra_dc_neon.o \
-+                                        arm/rpi_hevcpred_intra_filter_neon.o \
-+                                        arm/rpi_hevcpred_intra_hv_neon.o \
-+                                        arm/rpi_hevcpred_intra_planar_neon.o
- NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
- NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
-                                     arm/rv40dsp_neon.o
-diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
-index fdbf86b45e..4755f20e2e 100644
---- a/libavcodec/arm/cabac.h
-+++ b/libavcodec/arm/cabac.h
-@@ -26,83 +26,209 @@
- #include "libavutil/internal.h"
- #include "libavcodec/cabac.h"
- 
-+
- #define get_cabac_inline get_cabac_inline_arm
- static av_always_inline int get_cabac_inline_arm(CABACContext *c,
--                                                 uint8_t *const state)
-+                                                 uint8_t *state)
- {
--    int bit;
--    void *reg_b, *reg_c, *tmp;
-+    const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
-+    int bit, ptr, low, tmp1, tmp2;
-+    __asm__ volatile (
-+        "ldr %[bit], [%[c], %[range_off]] \n\t"
-+        "ldrb %[ptr], [%[state]] \n\t"
-+        "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t"
-+        "and %[tmp2], %[bit], #0xc0 \n\t"
-+        "add %[tmp1], %[tmp1], %[ptr] \n\t"
-+        "ldr %[low], [%[c], %[low_off]] \n\t"
-+        "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t"
-+        "sub %[bit], %[bit], %[tmp2] \n\t"
-+        "mov %[tmp1], %[bit] \n\t"
-+        "cmp %[low], %[bit], lsl #17 \n\t"
-+        "itt ge \n\t"
-+        "movge %[tmp1], %[tmp2] \n\t"
-+        "mvnge %[ptr], %[ptr] \n\t"
-+        "clz %[tmp2], %[tmp1] \n\t"
-+        "it ge \n\t"
-+        "subge %[low], %[low], %[bit], lsl #17 \n\t"
-+        "sub %[tmp2], %[tmp2], #23 \n\t"
-+        "and %[bit], %[ptr], #1 \n\t"
-+        "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"
-+        "lsl %[low], %[low], %[tmp2] \n\t"
-+        "lsls %[ptr], %[low], #16 \n\t"
-+        "bne 1f \n\t"
-+        "ldr %[ptr], [%[c], %[ptr_off]] \n\t"
-+        "lsl %[tmp2], %[tmp1], %[tmp2] \n\t"
-+#if UNCHECKED_BITSTREAM_READER
-+        "strb %[mlps_tables], [%[state]] \n\t"
-+        "rbit %[state], %[low] \n\t"
-+        "ldrh %[tmp1], [%[ptr]], #2 \n\t"
-+#else
-+        "ldr %[tmp1], [%[c], %[end_off]] \n\t"
-+        "strb %[mlps_tables], [%[state]] \n\t"
-+        "rbit %[state], %[low] \n\t"
-+        "cmp %[tmp1], %[ptr] \n\t"
-+#if CONFIG_THUMB
-+        "it cs \n\t"
-+        "ldrhcs %[tmp1], [%[ptr]], #2 \n\t"
-+#else
-+        "ldrcsh %[tmp1], [%[ptr]], #2 \n\t"
-+#endif
-+#endif
-+        "clz %[state], %[state] \n\t"
-+        "movw %[mlps_tables], #0xffff \n\t"
-+        "sub %[state], %[state], #16 \n\t"
-+        "str %[tmp2], [%[c], %[range_off]] \n\t"
-+        "rev %[tmp1], %[tmp1] \n\t"
-+        "str %[ptr], [%[c], %[ptr_off]] \n\t"
-+        "lsr %[tmp1], %[tmp1], #15 \n\t"
-+        "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t"
-+#if CONFIG_THUMB
-+        "lsl %[tmp1], %[tmp1], %[state] \n\t"
-+        "add %[low], %[low], %[tmp1] \n\t"
-+#else
-+        "add %[low], %[low], %[tmp1], lsl %[state] \n\t"
-+#endif
-+        "str %[low], [%[c], %[low_off]] \n\t"
-+        "b 2f \n\t"
-+        "1: \n\t"
-+        "strb %[mlps_tables], [%[state]] \n\t"
-+        "lsl %[tmp1], %[tmp1], %[tmp2] \n\t"
-+        "str %[low], [%[c], %[low_off]] \n\t"
-+        "str %[tmp1], [%[c], %[range_off]] \n\t"
-+        "2: \n\t"
-+        : // Outputs
-+            [state]"+r"(state),
-+            [mlps_tables]"+r"(mlps_tables),
-+            [bit]"=&r"(bit),
-+            [ptr]"=&r"(ptr),
-+            [low]"=&r"(low),
-+            [tmp1]"=&r"(tmp1),
-+            [tmp2]"=&r"(tmp2)
-+        : // Inputs
-+            [c]"r"(c),
-+            [low_off]"J"(offsetof(CABACContext, low)),
-+            [range_off]"J"(offsetof(CABACContext, range)),
-+            [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+            [end_off]"J"(offsetof(CABACContext, bytestream_end)),
-+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+        : // Clobbers
-+            "cc", "memory"
-+    );
-+    return bit;
-+}
- 
--    __asm__ volatile(
--        "ldrb %[bit] , [%[state]] \n\t"
--        "add %[r_b] , %[tables] , %[lps_off] \n\t"
--        "mov %[tmp] , %[range] \n\t"
--        "and %[range] , %[range] , #0xC0 \n\t"
--        "add %[r_b] , %[r_b] , %[bit] \n\t"
--        "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t"
--        "add %[r_b] , %[tables] , %[norm_off] \n\t"
--        "sub %[r_c] , %[tmp] , %[range] \n\t"
--        "lsl %[tmp] , %[r_c] , #17 \n\t"
--        "cmp %[tmp] , %[low] \n\t"
--        "it gt \n\t"
--        "movgt %[range] , %[r_c] \n\t"
--        "itt cc \n\t"
--        "mvncc %[bit] , %[bit] \n\t"
--        "subcc %[low] , %[low] , %[tmp] \n\t"
--        "add %[r_c] , %[tables] , %[mlps_off] \n\t"
--        "ldrb %[tmp] , [%[r_b], %[range]] \n\t"
--        "ldrb %[r_b] , [%[r_c], %[bit]] \n\t"
--        "lsl %[low] , %[low] , %[tmp] \n\t"
--        "lsl %[range] , %[range] , %[tmp] \n\t"
--        "uxth %[r_c] , %[low] \n\t"
--        "strb %[r_b] , [%[state]] \n\t"
--        "tst %[r_c] , %[r_c] \n\t"
--        "bne 2f \n\t"
--        "ldr %[r_c] , [%[c], %[byte]] \n\t"
-+#define get_cabac_bypass get_cabac_bypass_arm
-+static inline int get_cabac_bypass_arm(CABACContext * const c)
-+{
-+    uint32_t low = c->low, range, ptr, tmp;
-+    int rv;
-+    __asm volatile (
-+        "ldr %[range] , [%[c], %[range_off]] \n\t"
-+        "mov %[rv] , #0 \n\t"
-+        "ldr %[ptr] , [%[c], %[ptr_off]] \n\t"
-+        "lsl %[low] , #1 \n\t"
-+#if !UNCHECKED_BITSTREAM_READER
-+        "ldr %[tmp] , [%[c], %[end_off]] \n\t"
-+#endif
-+        "cmp %[low] , %[range], lsl #17 \n\t"
-+        "itt cs \n\t"
-+        "subcs %[low] , %[low], %[range], lsl #17 \n\t"
-+        "movcs %[rv] , #1 \n\t"
- #if UNCHECKED_BITSTREAM_READER
--        "ldrh %[tmp] , [%[r_c]] \n\t"
--        "add %[r_c] , %[r_c] , #2 \n\t"
--        "str %[r_c] , [%[c], %[byte]] \n\t"
-+        "ldrh %[tmp] , [%[ptr]], #2 \n\t"
- #else
--        "ldr %[r_b] , [%[c], %[end]] \n\t"
--        "ldrh %[tmp] , [%[r_c]] \n\t"
--        "cmp %[r_c] , %[r_b] \n\t"
--        "itt lt \n\t"
--        "addlt %[r_c] , %[r_c] , #2 \n\t"
--        "strlt %[r_c] , [%[c], %[byte]] \n\t"
-+        "cmp %[tmp] , %[ptr] \n\t"
-+#if CONFIG_THUMB
-+        "it cs \n\t"
-+        "ldrhcs %[tmp] , [%[ptr]], #2 \n\t"
-+#else
-+        "ldrcsh %[tmp] , [%[ptr]], #2 \n\t"
-+#endif
- #endif
--        "sub %[r_c] , %[low] , #1 \n\t"
--        "add %[r_b] , %[tables] , %[norm_off] \n\t"
--        "eor %[r_c] , %[low] , %[r_c] \n\t"
--        "rev %[tmp] , %[tmp] \n\t"
--        "lsr %[r_c] , %[r_c] , #15 \n\t"
--        "lsr %[tmp] , %[tmp] , #15 \n\t"
--        "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t"
--        "movw %[r_b] , #0xFFFF \n\t"
--        "sub %[tmp] , %[tmp] , %[r_b] \n\t"
--        "rsb %[r_c] , %[r_c] , #7 \n\t"
--        "lsl %[tmp] , %[tmp] , %[r_c] \n\t"
--        "add %[low] , %[low] , %[tmp] \n\t"
--        "2: \n\t"
--        : [bit]"=&r"(bit),
--          [low]"+&r"(c->low),
--          [range]"+&r"(c->range),
--          [r_b]"=&r"(reg_b),
--          [r_c]"=&r"(reg_c),
--          [tmp]"=&r"(tmp)
--        : [c]"r"(c),
--          [state]"r"(state),
--          [tables]"r"(ff_h264_cabac_tables),
--          [byte]"M"(offsetof(CABACContext, bytestream)),
--          [end]"M"(offsetof(CABACContext, bytestream_end)),
--          [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
--          [lps_off]"I"(H264_LPS_RANGE_OFFSET),
--          [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
--        : "memory", "cc"
--    );
-+        "lsls %[range] , %[low], #16 \n\t"
-+        "bne 1f \n\t"
- 
--    return bit & 1;
-+        "str %[ptr] , [%[c], %[ptr_off]] \n\t"
-+        "rev %[tmp] , %[tmp] \n\t"
-+        "add %[low] , %[low], %[tmp], lsr #15 \n\t"
-+        "movw %[tmp] , 0xFFFF \n\t"
-+        "sub %[low] , %[tmp] \n\t"
-+        "1: \n\t"
-+        "str %[low] , [%[c], %[low_off]] \n\t"
-+        : // Outputs
-+            [rv]"=&r"(rv),
-+            [low]"+r"(low),
-+            [range]"=&r"(range),
-+            [ptr]"=&r"(ptr),
-+            [tmp]"=&r"(tmp)
-+        : // Inputs
-+            [c]"r"(c),
-+            [low_off]"J"(offsetof(CABACContext, low)),
-+            [range_off]"J"(offsetof(CABACContext, range)),
-+            [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+            [end_off]"J"(offsetof(CABACContext, bytestream_end))
-+        : // Clobbers
-+            "memory", "cc"
-+    );
-+    return rv;
- }
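The bypass routine above is a hand-scheduled version of standard CABAC bypass decoding; in portable C the control flow it implements is roughly the following (reference sketch modelled on libavcodec's generic get_cabac_bypass() with CABAC_BITS == 16; refill() stands for the internal two-byte refill step):

    c->low += c->low;              /* "lsl %[low], #1"              */
    if (c->low < c->range << 17) { /* "cmp %[low], %[range], lsl #17" */
        bit = 0;
    } else {
        c->low -= c->range << 17;
        bit = 1;
    }
    if (!(c->low & 0xFFFF))        /* "lsls %[range], %[low], #16"  */
        refill(c);                 /* read two more bytestream bytes */
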
\n\t" -+ "it cc \n\t" -+ "rsbcc %[rv] , %[rv], #0 \n\t" -+#if UNCHECKED_BITSTREAM_READER -+ "ldrh %[tmp] , [%[ptr]], #2 \n\t" -+#else -+ "cmp %[tmp] , %[ptr] \n\t" -+#if CONFIG_THUMB -+ "it cs \n\t" -+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" -+#else -+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" -+#endif -+#endif -+ "lsls %[range] , %[low], #16 \n\t" -+ "bne 1f \n\t" -+ -+ "str %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "add %[low] , %[low], %[tmp], lsr #15 \n\t" -+ "movw %[tmp] , 0xFFFF \n\t" -+ "sub %[low] , %[tmp] \n\t" -+ "1: \n\t" -+ "str %[low] , [%[c], %[low_off]] \n\t" -+ : // Outputs -+ [rv]"+r"(rv), -+ [low]"+r"(low), -+ [range]"=&r"(range), -+ [ptr]"=&r"(ptr), -+ [tmp]"=&r"(tmp) -+ : // Inputs -+ [c]"r"(c), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [end_off]"J"(offsetof(CABACContext, bytestream_end)) -+ : // Clobbers -+ "memory", "cc" -+ ); -+ return rv; -+} -+ - #endif /* HAVE_ARMV6T2_INLINE */ - - #endif /* AVCODEC_ARM_CABAC_H */ -diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h -new file mode 100644 -index 0000000000..c7df9f1e5a ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_cabac.h -@@ -0,0 +1,605 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVC_CABAC_H
-+#define AVCODEC_ARM_HEVC_CABAC_H
-+
-+#include "config.h"
-+#if HAVE_ARMV6T2_INLINE
-+
-+#define hevc_mem_bits32 hevc_mem_bits32_arm
-+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
-+{
-+    unsigned int n;
-+    __asm__ (
-+        "rev        %[n], %[x]                     \n\t"
-+        : [n]"=r"(n)
-+        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
-+        :
-+        );
-+    return n << (bits & 7);
-+}
-+
-+
-+// ---------------------------------------------------------------------------
-+//
-+// Helper fns - little bits of code where ARM has an instruction that the
-+// compiler doesn't know about / use
-+
-+#define trans_scale_sat trans_scale_sat_arm
-+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-+{
-+    int rv;
-+    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
-+
-+    __asm__ (
-+        "ssat       %[rv], #16, %[t], ASR #1       \n\t"
-+        : [rv]"=r"(rv)
-+        : [t]"r"(t)
-+        :
-+        );
-+    return rv;
-+}
-+
-+#define update_rice update_rice_arm
-+static inline void update_rice_arm(uint8_t * const stat_coeff,
-+                                   const unsigned int last_coeff_abs_level_remaining,
-+                                   const unsigned int c_rice_param)
-+{
-+    int t = last_coeff_abs_level_remaining << 1;
-+    __asm__ (
-+        "lsrs       %[t], %[t], %[shift]           \n\t"
-+
-+        "it         eq                             \n\t"
-+        "subeq      %[stat], %[stat], #1           \n\t"
-+        "cmp        %[t], #6                       \n\t"
-+        "adc        %[stat], %[stat], #0           \n\t"
-+        "usat       %[stat], #8, %[stat]           \n\t"
-+        : [stat]"+r"(*stat_coeff),
-+             [t]"+r"(t)
-+        : [shift]"r"(c_rice_param)
-+        : "cc"
-+    );
-+}
-+
-+// ---------------------------------------------------------------------------
-+//
-+// CABAC get loops
-+//
-+// Where the loop is simple enough we can normally do 10-30% better than the
-+// compiler
-+
-+// Get the residual greater than 1 bits
-+
-+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
-+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
-+                                                       uint8_t * const state0)
-+{
-+    unsigned int i, reg_b, st, tmp, bit, rv;
-+    __asm__ (
-+        "mov        %[i]     , #0                          \n\t"
-+        "mov        %[rv]    , #0                          \n\t"
-+        "1:                                                \n\t"
-+        "add        %[i]     , %[i]          , #1          \n\t"
-+        "cmp        %[rv]    , #0                          \n\t"
-+        "ite        eq                                     \n\t"
-+        "usateq     %[st]    , #2            , %[i]        \n\t"
-+        "movne      %[st]    , #0                          \n\t"
-+        "sub        %[r_b]   , %[mlps_tables], %[lps_off]  \n\t"
-+        "and        %[tmp]   , %[range]      , #0xC0       \n\t"
-+
-+        "ldrb       %[bit]   , [%[state0], %[st]]          \n\t"
-+        "add        %[r_b]   , %[r_b]        , %[bit]      \n\t"
-+        "ldrb       %[tmp]   , [%[r_b], %[tmp], lsl #1]    \n\t"
-+        "sub        %[range] , %[range]      , %[tmp]      \n\t"
-+
-+        "cmp        %[low]   , %[range], lsl #17           \n\t"
-+        "ittt       ge                                     \n\t"
-+        "subge      %[low]   , %[low]        , %[range], lsl #17 \n\t"
-+        "movge      %[range] , %[tmp]                      \n\t"
-+        "mvnge      %[bit]   , %[bit]                      \n\t"
-+
-+        "clz        %[tmp]   , %[range]                    \n\t"
-+        "sub        %[tmp]   , #23                         \n\t"
-+        "ldrb       %[r_b]   , [%[mlps_tables], %[bit]]    \n\t"
-+        "and        %[bit]   , %[bit]        , #1          \n\t"
-+        "strb       %[r_b]   , [%[state0], %[st]]          \n\t"
-+        "lsl        %[low]   , %[low]        , %[tmp]      \n\t"
-+        "orr        %[rv]    , %[bit]        , %[rv], lsl #1 \n\t"
-+        "lsl        %[range] , %[range]      , %[tmp]      \n\t"
-+
-+// There is a small speed gain from combining both conditions, using a single
-+// branch and then working out what that meant later
-+        "lsls       %[tmp]   , %[low]        , #16         \n\t"
-+        "it         ne                                     \n\t"
-+        "cmpne      %[n]     , %[i]                        \n\t"
-+        "bne        1b                                     \n\t"
-+
-+// If reload is not required then we must have run out of flags to decode
-+        "tst        %[tmp]   , %[tmp]                      \n\t"
-+        "bne        2f                                     \n\t"
-+
-+// Do reload
-+        "ldrh       %[tmp]   , [%[bptr]]     , #2          \n\t"
-+        "rbit       %[bit]   , %[low]                      \n\t"
-+        "movw       %[r_b]   , #0xFFFF                     \n\t"
-+        "clz        %[bit]   , %[bit]                      \n\t"
-+        "rev        %[tmp]   , %[tmp]                      \n\t"
-+        "sub        %[bit]   , %[bit]        , #16         \n\t"
-+        "cmp        %[n]     , %[i]                        \n\t"
-+        "rsb        %[tmp]   , %[r_b]        , %[tmp], lsr #15 \n\t"
-+
-+#if CONFIG_THUMB
-+        "lsl        %[tmp]   , %[tmp]        , %[bit]      \n\t"
-+        "add        %[low]   , %[low]        , %[tmp]      \n\t"
-+#else
-+        "add        %[low]   , %[low]        , %[tmp], lsl %[bit] \n\t"
-+#endif
-+
-+        "bne        1b                                     \n\t"
-+        "2:                                                \n\t"
-+        :  [bit]"=&r"(bit),
-+           [low]"+r"(c->low),
-+         [range]"+r"(c->range),
-+           [r_b]"=&r"(reg_b),
-+          [bptr]"+r"(c->bytestream),
-+             [i]"=&r"(i),
-+           [tmp]"=&r"(tmp),
-+            [st]"=&r"(st),
-+            [rv]"=&r"(rv)
-+        : [state0]"r"(state0),
-+               [n]"r"(n),
-+    [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+        [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+        : "memory", "cc"
-+    );
-+    return rv;
-+}
-+
-+
-+// n must be > 0 on entry
-+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
-+                                                          unsigned int n,
-+                                                          const uint8_t * ctx_map,
-+                                                          uint8_t * p)
-+{
-+    unsigned int reg_b, tmp, st, bit;
-+    __asm__ (
-+// Get bin from map
-+#if CONFIG_THUMB
-+        "add        %[ctx_map] , %[n]                      \n\t"
-+        "ldrb       %[st]      , [%[ctx_map]]              \n\t"
-+#else
-+        "ldrb       %[st]      , [%[ctx_map], %[n]]!       \n\t"
-+#endif
-+        "1:                                                \n\t"
-+
-+// Load state & ranges
-+        "ldrb       %[bit]   , [%[state0], %[st]]          \n\t"
-+        "and        %[tmp]   , %[range]      , #0xC0       \n\t"
-+        "sub        %[r_b]   , %[mlps_tables], %[lps_off]  \n\t"
-+        "add        %[r_b]   , %[r_b]        , %[tmp], lsl #1 \n\t"
-+        "ldrb       %[tmp]   , [%[r_b], %[bit]]            \n\t"
-+        "sub        %[range] , %[range]      , %[tmp]      \n\t"
-+
-+        "cmp        %[low]   , %[range], lsl #17           \n\t"
-+        "ittt       ge                                     \n\t"
-+        "mvnge      %[bit]   , %[bit]                      \n\t"
-+        "subge      %[low]   , %[low]        , %[range], lsl #17 \n\t"
-+        "movge      %[range] , %[tmp]                      \n\t"
-+
-+// Renorm
-+        "clz        %[tmp]   , %[range]                    \n\t"
-+        "ldrb       %[r_b]   , [%[mlps_tables], %[bit]]    \n\t"
-+        "sub        %[tmp]   , #23                         \n\t"
-+        "strb       %[r_b]   , [%[state0], %[st]]          \n\t"
-+        "tst        %[bit]   , #1                          \n\t"
-+        "ldrb       %[st]    , [%[ctx_map], #-1]!
\n\t" -+ "lsl %[low] , %[low] , %[tmp] \n\t" -+// GCC asm seems to need strbne written differently for thumb and arm -+#if CONFIG_THUMB -+ "it ne \n\t" -+ "strbne %[n] , [%[idx]] , #1 \n\t" -+#else -+ "strneb %[n] , [%[idx]] , #1 \n\t" -+#endif -+ -+// There is a small speed gain from combining both conditions, using a single -+// branch and then working out what that meant later -+ "subs %[n] , %[n] , #1 \n\t" -+ "lsl %[range] , %[range] , %[tmp] \n\t" -+#if CONFIG_THUMB -+ "itt ne \n\t" -+ "lslsne %[tmp] , %[low] , #16 \n\t" -+#else -+ "lslnes %[tmp] , %[low] , #16 \n\t" -+#endif -+ "bne 1b \n\t" -+ -+// If we have bits left then n must be 0 so give up now -+ "lsls %[tmp] , %[low] , #16 \n\t" -+ "bne 2f \n\t" -+ -+// Do reload -+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" -+ "rbit %[bit] , %[low] \n\t" -+ "movw %[r_b] , #0xFFFF \n\t" -+ "clz %[bit] , %[bit] \n\t" -+ "cmp %[n] , #0 \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "sub %[bit] , %[bit] , #16 \n\t" -+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" -+ -+#if CONFIG_THUMB -+ "lsl %[tmp] , %[tmp] , %[bit] \n\t" -+ "add %[low] , %[low] , %[tmp] \n\t" -+#else -+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" -+#endif -+ -+// Check to see if we still have more to do -+ "bne 1b \n\t" -+ "2: \n\t" -+ : [bit]"=&r"(bit), -+ [low]"+r"(c->low), -+ [range]"+r"(c->range), -+ [r_b]"=&r"(reg_b), -+ [bptr]"+r"(c->bytestream), -+ [idx]"+r"(p), -+ [n]"+r"(n), -+ [tmp]"=&r"(tmp), -+ [st]"=&r"(st), -+ [ctx_map]"+r"(ctx_map) -+ : [state0]"r"(state0), -+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), -+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) -+ : "memory", "cc" -+ ); -+ -+ return p; -+} -+ -+// --------------------------------------------------------------------------- -+// -+// CABAC_BY22 functions -+ -+ -+#define get_cabac_by22_start get_cabac_by22_start_arm -+static inline void get_cabac_by22_start_arm(CABACContext * const c) -+{ -+ const uint8_t *ptr = c->bytestream; -+ register uint32_t low __asm__("r1"), range __asm__("r2"); -+ uint32_t m, range8, bits; -+#if !USE_BY22_DIV -+ uintptr_t inv; -+#endif -+ -+ av_assert2(offsetof (CABACContext, low) == 0); -+ av_assert2(offsetof (CABACContext, range) == 4); -+ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); -+ __asm__ volatile ( -+ "ldmia %[c], {%[low], %[range]} \n\t" -+ : // Outputs -+ [low]"=r"(low), -+ [range]"=r"(range) -+ : // Inputs -+ [c]"r"(c) -+ : // Clobbers -+ ); -+#if !USE_BY22_DIV -+ inv = (uintptr_t)cabac_by22_inv_range; -+#endif -+ __asm__ volatile ( -+ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" -+#if !USE_BY22_DIV -+ "uxtb %[range8], %[range] \n\t" -+#endif -+ "rbit %[bits], %[low] \n\t" -+ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t" -+ "clz %[bits], %[bits] \n\t" -+ "str %[ptr], [%[c], %[ptr_off]] \n\t" -+ "rev %[m], %[m] \n\t" -+ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" -+ "eor %[m], %[m], #0x80000000 \n\t" -+#if !USE_BY22_DIV -+ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" -+ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" -+ "str %[range], [%[c], %[bits_off]] \n\t" -+#else -+ "strh %[bits], [%[c], %[bits_off]] \n\t" -+#endif -+#if CONFIG_THUMB -+ "lsr %[m], %[ptr] \n\t" -+ "eor %[range], %[low], %[m] \n\t" -+#else -+ "eor %[range], %[low], %[m], lsr %[ptr] \n\t" -+#endif -+ : // Outputs -+ [ptr]"+&r"(ptr), -+ [low]"+&r"(low), -+ [range]"+&r"(range), -+#if !USE_BY22_DIV -+ [inv]"+&r"(inv), -+#endif -+ [m]"=&r"(m), -+ [range8]"=&r"(range8), -+ 
[bits]"=&r"(bits) -+ : // Inputs -+ [c]"r"(c), -+ [bits_off]"J"(offsetof (CABACContext, by22.bits)), -+ [ptr_off]"J"(offsetof (CABACContext, bytestream)) -+ : // Clobbers -+ "memory" -+ ); -+ c->low = range; -+#if !USE_BY22_DIV -+ c->range = inv; -+#endif -+} -+ -+#define get_cabac_by22_peek get_cabac_by22_peek_arm -+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) -+{ -+ uint32_t rv = c->low &~ 1, tmp; -+ __asm__ ( -+ "cmp %[inv] , #0 \n\t" -+ "it ne \n\t" -+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" -+ : // Outputs -+ [rv]"+r"(rv), -+ [tmp]"=r"(tmp) -+ : // Inputs -+ [inv]"r"(c->range) -+ : // Clobbers -+ "cc" -+ ); -+ return rv << 1; -+} -+ -+#define get_cabac_by22_flush get_cabac_by22_flush_arm -+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) -+{ -+ uint32_t bits, ptr, tmp1, tmp2; -+ __asm__ volatile ( -+ "ldrh %[bits], [%[cc], %[bits_off]] \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "rsb %[tmp1], %[n], #32 \n\t" -+ "add %[bits], %[bits], %[n] \n\t" -+ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" -+ "lsr %[tmp1], %[val], %[tmp1] \n\t" -+ "ldr %[val], [%[cc], %[low_off]] \n\t" -+#if CONFIG_THUMB -+ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t" -+ "ldr %[ptr], [%[ptr]] \n\t" -+#else -+ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" -+#endif -+ "mul %[tmp1], %[tmp2], %[tmp1] \n\t" -+ "and %[tmp2], %[bits], #7 \n\t" -+ "strh %[bits], [%[cc], %[bits_off]] \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[tmp1], %[tmp1], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[val], %[n] \n\t" -+ "sub %[val], %[tmp1] \n\t" -+#else -+ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" -+#endif -+ "lsl %[ptr], %[ptr], %[tmp2] \n\t" -+ "orr %[val], %[val], %[ptr], lsr #9 \n\t" -+ "str %[val], [%[cc], %[low_off]] \n\t" -+ : // Outputs -+ [val]"+r"(val), -+ [bits]"=&r"(bits), -+ [ptr]"=&r"(ptr), -+ [tmp1]"=&r"(tmp1), -+ [tmp2]"=&r"(tmp2) -+ : // Inputs -+ [cc]"r"(c), -+ [n]"r"(n), -+ [bits_off]"J"(offsetof(CABACContext, by22.bits)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [range_off]"J"(offsetof(CABACContext, by22.range)), -+ [low_off]"J"(offsetof(CABACContext, low)) -+ : // Clobbers -+ "memory" -+ ); -+} -+ -+#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm -+static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) -+{ -+ uint32_t last_coeff_abs_level_remaining; -+ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; -+ __asm__ volatile ( -+ "ldr %[remain], [%[cc], %[low_off]] \n\t" -+ "ldr %[prefix], [%[cc], %[range_off]] \n\t" -+ "bic %[remain], %[remain], #1 \n\t" -+ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "cmp %[prefix], #0 \n\t" -+ "it ne \n\t" -+ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" -+ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" -+ "lsl %[remain], %[remain], #1 \n\t" -+ "mvn %[prefix], %[remain] \n\t" -+ "clz %[prefix], %[prefix] \n\t" -+ "rsbs %[n1], %[prefix], #2 \n\t" -+ "bcc 1f \n\t" -+ "adc %[n1], %[rice], %[prefix] \n\t" -+ "add %[tmp2], %[tmp2], %[n1] \n\t" -+ "rsb %[n2], %[n1], #32 \n\t" -+ "and %[tmp1], %[tmp2], #7 \n\t" -+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "lsr %[tmp2], %[tmp2], #3 \n\t" -+ "lsr %[n2], %[remain], %[n2] \n\t" -+ "mul %[n2], %[range], %[n2] \n\t" -+ "ldr %[range], [%[cc], %[low_off]] \n\t" -+ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" -+ "rsb %[tmp2], %[rice], #31 \n\t" -+ "lsl %[remain], %[remain], %[prefix] \n\t" 
-+ "lsl %[n2], %[n2], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[range], %[n1] \n\t" -+ "sub %[range], %[n2] \n\t" -+#else -+ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t" -+#endif -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[n2], %[prefix], %[rice] \n\t" -+#if CONFIG_THUMB -+ "lsr %[remain], %[tmp2] \n\t" -+ "add %[remain], %[n2] \n\t" -+#else -+ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" -+#endif -+ "b 3f \n\t" -+ "1: \n\t" -+ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" -+ "cmp %[n2], %[peek_bits_plus_2] \n\t" -+ "bhi 2f \n\t" -+ "sub %[n1], %[n2], #2 \n\t" -+ "add %[tmp2], %[tmp2], %[n1] \n\t" -+ "rsb %[n2], %[n1], #32 \n\t" -+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "lsr %[tmp1], %[tmp2], #3 \n\t" -+ "lsr %[n2], %[remain], %[n2] \n\t" -+ "mul %[n2], %[range], %[n2] \n\t" -+ "rsb %[range], %[rice], #34 \n\t" -+ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t" -+ "and %[tmp1], %[tmp2], #7 \n\t" -+ "lsl %[remain], %[remain], %[prefix] \n\t" -+ "ldr %[tmp2], [%[cc], %[low_off]] \n\t" -+ "rsb %[prefix], %[prefix], %[range] \n\t" -+ "orr %[remain], %[remain], #0x80000000 \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[n2], %[n2], #23 \n\t" -+ "mov %[range], #2 \n\t" -+#if CONFIG_THUMB -+ "lsl %[tmp2], %[n1] \n\t" -+ "sub %[tmp2], %[n2] \n\t" -+#else -+ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t" -+#endif -+ "lsl %[ptr], %[ptr], %[tmp1] \n\t" -+ "lsl %[rice], %[range], %[rice] \n\t" -+ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" -+#if CONFIG_THUMB -+ "lsr %[remain], %[prefix] \n\t" -+ "add %[remain], %[rice] \n\t" -+#else -+ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" -+#endif -+ "b 4f \n\t" -+ "2: \n\t" -+ "add %[n1], %[tmp2], %[prefix] \n\t" -+#if CONFIG_THUMB -+ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t" -+ "ldr %[tmp2], [%[tmp2]] \n\t" -+#else -+ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t" -+#endif -+ "rsb %[tmp1], %[prefix], #32 \n\t" -+ "push {%[rice]} \n\t" -+ "and %[rice], %[n1], #7 \n\t" -+ "lsr %[tmp1], %[remain], %[tmp1] \n\t" -+ "ldr %[ptr], [%[cc], %[low_off]] \n\t" -+ "mul %[remain], %[range], %[tmp1] \n\t" -+ "rev %[tmp2], %[tmp2] \n\t" -+ "rsb %[n2], %[prefix], %[n2] \n\t" -+ "ldr %[tmp1], [%[cc], %[range_off]] \n\t" -+ "lsl %[rice], %[tmp2], %[rice] \n\t" -+ "sub %[tmp2], %[n2], #2 \n\t" -+ "lsl %[remain], %[remain], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[ptr], %[prefix] \n\t" -+ "rsb %[remain], %[ptr] \n\t" -+#else -+ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t" -+#endif -+ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" -+ "add %[prefix], %[n1], %[tmp2] \n\t" -+ "bic %[n1], %[remain], #1 \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "cmp %[tmp1], #0 \n\t" -+ "rsb %[rice], %[tmp2], #32 \n\t" -+ "it ne \n\t" -+ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" -+ "and %[tmp1], %[prefix], #7 \n\t" -+#if CONFIG_THUMB -+ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t" -+ "ldr %[ptr], [%[ptr]] \n\t" -+#else -+ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t" -+#endif -+ "lsl %[n1], %[n1], #1 \n\t" -+ "lsr %[rice], %[n1], %[rice] \n\t" -+ "rsb %[n2], %[n2], #34 \n\t" -+ "mul %[range], %[range], %[rice] \n\t" -+ "pop {%[rice]} \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "orr %[n1], %[n1], #0x80000000 \n\t" -+ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" -+ "mov %[prefix], #2 \n\t" -+ "lsl %[range], %[range], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[remain], %[tmp2] \n\t" -+ "rsb %[range], %[remain] \n\t" -+#else -+ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t" -+#endif -+ "lsl %[remain], %[prefix], %[rice] \n\t" -+#if CONFIG_THUMB -+ "lsr %[n1], %[n2] \n\t" -+ 
"add %[remain], %[n1] \n\t" -+#else -+ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" -+#endif -+ "3: \n\t" -+ "lsl %[ptr], %[ptr], %[tmp1] \n\t" -+ "orr %[range], %[range], %[ptr], lsr #9 \n\t" -+ "4: \n\t" -+ "str %[range], [%[cc], %[low_off]] \n\t" -+ : // Outputs -+ [remain]"=&r"(last_coeff_abs_level_remaining), -+ [rice]"+r"(rice_param), -+ [prefix]"=&r"(prefix), -+ [n1]"=&r"(n1), -+ [range]"=&r"(range), -+ [n2]"=&r"(n2), -+ [ptr]"=&r"(ptr), -+ [tmp1]"=&r"(tmp1), -+ [tmp2]"=&r"(tmp2) -+ : // Inputs -+ [cc]"r"(c), -+ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)), -+ [by22_range_off]"J"(offsetof(CABACContext, by22.range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ return last_coeff_abs_level_remaining; -+} -+ -+#endif /* HAVE_ARMV6T2_INLINE */ -+ -+#endif /* AVCODEC_ARM_HEVC_CABAC_H */ -diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S -new file mode 100644 -index 0000000000..0211e447a8 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S -@@ -0,0 +1,161 @@ -+@ Included multiple times from hevc_idct_neon.S -+@ Macros defined there -+ -+#define DC_SHIFT (15 - BIT_DEPTH) -+#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) -+#define TRN_SHIFT (20 - BIT_DEPTH) -+ -+function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r1, #DC_ADD -+ asr r1, #DC_SHIFT -+ vdup.16 q0, r1 -+ vdup.16 q1, r1 -+ vst1.16 {q0, q1}, [r0] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+ vst1.16 {q8, q9}, [r0], r3 -+ vst1.16 {q8, q9}, [r2], r3 -+ vst1.16 {q8, q9}, [r0] -+ vst1.16 {q8, q9}, [r2] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ mov ip, #16*16 -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+1: vst1.16 {q8, q9}, [r0], r3 -+ subs ip, ip, #32 -+ vst1.16 {q8, q9}, [r2], r3 -+ bhi 1b -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ mov ip, #32*32 -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+1: vst1.16 {q8, q9}, [r0], r3 -+ subs ip, ip, #32 -+ vst1.16 {q8, q9}, [r2], r3 -+ bhi 1b -+ bx lr -+endfunc -+ -+ -+function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 -+ vldr.i32 s0, =0x00240053 // 36 and 83 -+ vld1.16 {q14, q15}, [r0 :256] // coeffs -+ -+ tr4_shift #7 -+ -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q14, q15 -+ -+ tr4_shift #TRN_SHIFT -+ -+ vst4.16 {q14, q15}, [r0 :256] -+ bx lr -+ -+ .ltorg -+endfunc -+ -+ -+ -+function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 -+ vmov.i32 d0, #0x4a // 74 -+ vld1.16 {q14, q15}, [r0 :256] // coeffs -+ vmov.i32 d1, #0x1d // 29 -+ vmov.i32 d2, #0x37 // 55 -+ -+ tr4_luma_shift #7 -+ -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q14, q15 -+ -+ tr4_luma_shift #TRN_SHIFT -+ -+ vst4.16 {q14, q15}, [r0 :256] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 -+ add r2, r0, #16 -+ adr r3, tr4f -+ vpush {d8-d15} -+ vld1.16 {d0, d1}, [r3] -+ mov r3, #32 -+ -+ tr8_vert d16, d17, 
d18, d19, d24, d25, d26, d27, q8, q9, \ -+ "sub r0, r0, #128-8", \ -+ "sub r2, r2, #128-8", \ -+ "cmp r1, #4" -+ ble 2f -+ -+ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \ -+ "sub r0, r0, #128+8", \ -+ "sub r2, r2, #128+8+16-32", \ -+ "mov r3, #64" -+ -+ vzip.16 d16, d17 -+ vzip.16 d18, d19 -+ -+ vzip.16 d20, d21 -+ vzip.16 d22, d23 -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q10, q11 -+ vzip.32 q14, q15 -+1: -+ vzip.16 d24, d25 -+ vzip.16 d26, d27 -+ vzip.32 q8, q9 -+ vzip.32 q12, q13 -+ -+ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT -+ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT -+ -+ vpop {d8-d15} -+ bx lr -+ -+2: vmov.i64 q10, #0 -+ sub r0, r0, #8 -+ vmov.i64 q11, #0 -+ sub r2, r2, #8+16-32 -+ vmov.i64 q14, #0 -+ mov r3, #64 -+ vmov.i64 q15, #0 -+ -+ vzip.16 d16, d17 -+ vzip.16 d18, d19 -+ -+ b 1b -+ -+endfunc -+ -+#undef DC_SHIFT -+#undef DC_ADD -+#undef TRN_SHIFT -+ -diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S -new file mode 100644 -index 0000000000..200eac416e ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_misc_neon.S -@@ -0,0 +1,238 @@ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ rpi_zap_coeff_vals_neon( -+@ uint16_t * buf, [r0] -+@ unsigned int log_n_m2) [r1] -+ -+function rpi_zap_coeff_vals_neon, export=1 -+ mov ip, #1 -+ vmov.i64 q0, #0 -+ teq r1, #0 -+ vmov.i64 q1, #0 -+ beq 2f -+ -+ lsl ip, r1 @ 2, 4 or 8 -+ add r2, r0, #32 -+ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero -+ mov r3, #64 -+1: vst1.8 {q0,q1}, [r0:256], r3 -+ subs ip, #2 -+ vst1.8 {q0,q1}, [r2:256], r3 -+ bne 1b -+ bx lr -+ -+2: vst1.8 {q0,q1}, [r0:256] -+ bx lr -+endfunc -+ -+@ PIC jump tables are more expensive than absolute for A32 code -+.set jent_pic, CONFIG_PIC || CONFIG_THUMB -+ -+@ Jump table entry - if in neon mode the bottom bit must be set -+@ ? There is probably a real asm instruction to do this but I haven't found it -+.macro jent lab -+.if jent_pic -+T .short ((0 + \lab) - (0 + 98b)) / 2 -+A .short (0 + \lab) - (4 + 98b) -+.else -+T .word 1 + \lab -+A .word \lab -+.endif -+.endm -+ -+.set expected_next, 0 -+ -+.macro cpy_compound val, p1, p2, drop_thru=0 -+.if \p1 + \p2 != \val -+.error "Bad addition! 
\p1 + \p2 != \val"
-+.endif
-+.if expected_next != 0 && expected_next != \val
-+.error "Drop thru failure"
-+.endif
-+\val\():
-+        push       {r0-r3}
-+        bl         100\p1\()b
-+        pop        {r0-r3}
-+        add        r0, #\p1
-+        add        r2, #\p1
-+.if \drop_thru == 0
-+        b          \p2\()b
-+.set expected_next, 0
-+.else
-+.set expected_next, \p2
-+.endif
-+.endm
-+
-+@ ff_hevc_cpy_blks8x4_neon(
-+@       dst             [r0]
-+@       dst_stride      [r1]
-+@       src             [r2]
-+@       src_stride      [r3]
-+@       width           [sp, #0] (bytes)
-+@       height)         [sp, #4]
-+@
-+@ Power of 2 widths are directly coded, all others are done in stripes
-+@ We expect the vast majority of calls to be power of 2
-+@
-+@ Currently has min width of 8, but we could make that 4 without issue
-+@ Min height is 4
-+
-+function ff_hevc_rpi_cpy_blks8x4_neon, export=1
-+        ldr        r12, [sp, #0]
-+        push       {r11, lr}
-+.if jent_pic
-+A       adr        lr, 98f - 2
-+.else
-+A       adr        lr, 98f - 4
-+.endif
-+        lsr        r12, #3
-+        ldr        r11, [sp, #(8 + 4)]
-+.if jent_pic
-+A       lsl        r12, #1
-+A       ldrsh      lr, [lr, r12]
-+A       add        pc, lr
-+T       tbh        [pc, r12, lsl #1]
-+.else
-+        @ A32 only, Thumb is always PIC
-+        ldr        pc, [lr, r12, lsl #2]
-+.endif
-+
-+98:
-+T       .short     0 @ unused
-+        jent       8f
-+        jent       16f
-+        jent       24f
-+        jent       32f
-+        jent       40f
-+        jent       48f
-+        jent       56f
-+        jent       64f
-+        jent       72f
-+        jent       80f
-+        jent       88f
-+        jent       96f
-+        jent       104f
-+        jent       112f
-+        jent       120f
-+        jent       128f
-+
-+1008:
-+        push       {r11, lr}
-+8:
-+        add        lr, r2, r3
-+        lsl        r3, #1
-+        add        r12, r0, r1
-+        lsl        r1, #1
-+1:
-+        vld1.32    {d0 }, [r2], r3
-+        vld1.32    {d1 }, [lr], r3
-+        vld1.32    {d2 }, [r2], r3
-+        vld1.32    {d3 }, [lr], r3
-+        subs       r11, #4
-+        vst1.32    {d0 }, [r0], r1
-+        vst1.32    {d1 }, [r12], r1
-+        vst1.32    {d2 }, [r0], r1
-+        vst1.32    {d3 }, [r12], r1
-+        bgt        1b
-+        pop        {r11, pc}
-+
-+10016:
-+        push       {r11, lr}
-+16:
-+        add        lr, r2, r3
-+        lsl        r3, #1
-+        add        r12, r0, r1
-+        lsl        r1, #1
-+1:
-+        vld1.32    {q0 }, [r2], r3
-+        vld1.32    {q1 }, [lr], r3
-+        vld1.32    {q2 }, [r2], r3
-+        vld1.32    {q3 }, [lr], r3
-+        subs       r11, #4
-+        vst1.32    {q0 }, [r0], r1
-+        vst1.32    {q1 }, [r12], r1
-+        vst1.32    {q2 }, [r0], r1
-+        vst1.32    {q3 }, [r12], r1
-+        bgt        1b
-+        pop        {r11, pc}
-+
-+10032:
-+        push       {r11, lr}
-+32:
-+        add        lr, r2, r3
-+        lsl        r3, #1
-+        add        r12, r0, r1
-+        lsl        r1, #1
-+1:
-+        vld1.32    {q8,  q9 }, [r2], r3
-+        vld1.32    {q10, q11}, [lr], r3
-+        vld1.32    {q12, q13}, [r2], r3
-+        vld1.32    {q14, q15}, [lr], r3
-+        subs       r11, #4
-+        vst1.32    {q8,  q9 }, [r0], r1
-+        vst1.32    {q10, q11}, [r12], r1
-+        vst1.32    {q12, q13}, [r0], r1
-+        vst1.32    {q14, q15}, [r12], r1
-+        bgt        1b
-+        pop        {r11, pc}
-+
-+10064:
-+        push       {r11, lr}
-+64:
-+        add        lr, r2, #32
-+        add        r12, r0, #32
-+1:
-+        vld1.32    {q8,  q9 }, [r2], r3
-+        vld1.32    {q10, q11}, [lr], r3
-+        vld1.32    {q12, q13}, [r2], r3
-+        vld1.32    {q14, q15}, [lr], r3
-+        subs       r11, #2
-+        vst1.32    {q8,  q9 }, [r0], r1
-+        vst1.32    {q10, q11}, [r12], r1
-+        vst1.32    {q12, q13}, [r0], r1
-+        vst1.32    {q14, q15}, [r12], r1
-+        bgt        1b
-+        pop        {r11, pc}
-+
-+128:
-+        push       {r4, r5}
-+        @ We could do this with fewer registers if we jump around but I
-+        @ have a primitive urge to load sequentially
-+        mov        r4, #64
-+        add        lr, r2, #32
-+        add        r12, r0, #32
-+        sub        r3, r4
-+        sub        r1, r4
-+1:
-+        vld1.32    {q8,  q9 }, [r2], r4
-+        vld1.32    {q10, q11}, [lr], r4
-+        vld1.32    {q12, q13}, [r2], r3
-+        vld1.32    {q14, q15}, [lr], r3
-+        subs       r11, #1
-+        vst1.32    {q8,  q9 }, [r0], r4
-+        vst1.32    {q10, q11}, [r12], r4
-+        vst1.32    {q12, q13}, [r0], r1
-+        vst1.32    {q14, q15}, [r12], r1
-+        bgt        1b
-+        pop        {r4, r5, r11, pc}
-+
-+@ Use drop_thru where we can
-+cpy_compound 104, 64, 40, 1
-+cpy_compound 40, 32, 8
-+
-+cpy_compound 112, 64, 48, 1
-+cpy_compound 48, 32, 16
-+
-+cpy_compound 120, 64, 56, 1
-+cpy_compound
56, 32, 24, 1 -+cpy_compound 24, 16, 8 -+ -+cpy_compound 72, 64, 8 -+cpy_compound 80, 64, 16 -+cpy_compound 88, 64, 24 -+cpy_compound 96, 64, 32 -+ -+ -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h -new file mode 100644 -index 0000000000..9d21f6a882 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_misc_neon.h -@@ -0,0 +1,438 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H -+#define AVCODEC_ARM_RPI_HEVC_MISC_H -+ -+#include "config.h" -+#if HAVE_NEON_INLINE && !CONFIG_THUMB -+ -+static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_src) -+{ -+ const uint8_t *src2 = src + stride_src; -+ stride_src <<= 1; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {q0}, [%[dst]]! \n\t" -+ "beq 3f \n\t" -+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {q1}, [%[dst]]! \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.32 {q0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.32 {q1}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.16 d0, d1 \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d0}, [%[dst]]! 
\n\t" -+ "beq 3f \n\t" -+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.16 d2, d3 \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d2}, [%[dst]]! \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vzip.16 d0, d1 \n\t" -+ "vst1.16 {d0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vzip.16 d2, d3 \n\t" -+ "vst1.16 {d2}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #8 \n\t" -+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.8 d0, d1 \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d0}, [%[dst]]! \n\t" -+ "beq 3f \n\t" -+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.8 d2, d3 \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d2}, [%[dst]]! \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vzip.8 d0, d1 \n\t" -+ "vst1.8 {d0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vzip.8 d2, d3 \n\t" -+ "vst1.8 {d2}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst) -+{ -+ uint8_t *dst2 = dst + stride_dst; -+ stride_dst <<= 1; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "subs %[height], #4 \n\t" -+ "vld1.32 {q0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.32 {q1}, [%[src]]! \n\t" -+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.32 {q0}, [%[src]]! 
\n\t" -+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d1[0]}, [%[dst]] \n\t" -+ "vst1.32 {d1[1]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d3[0]}, [%[dst]] \n\t" -+ "vst1.32 {d3[1]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "subs %[height], #4 \n\t" -+ "vld1.16 {d0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.16 {d2}, [%[src]]! \n\t" -+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.16 {d0}, [%[src]]! \n\t" -+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d0[2]}, [%[dst]] \n\t" -+ "vst1.16 {d0[3]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d2[2]}, [%[dst]] \n\t" -+ "vst1.16 {d2[3]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "subs %[height], #8 \n\t" -+ "vld1.8 {d0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.8 {d2}, [%[src]]! \n\t" -+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.8 {d0}, [%[src]]! 
\n\t" -+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[6]}, [%[dst]] \n\t" -+ "vst1.8 {d0[7]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[6]}, [%[dst]] \n\t" -+ "vst1.8 {d2[7]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ int x, y; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "ldr %[x], [%[src]], %[stride_src] \n\t" -+ "ldr %[y], [%[src]], %[stride_src] \n\t" -+ "str %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldr %[x], [%[src]], %[stride_src] \n\t" -+ "str %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldr %[y], [%[src]], %[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "str %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "str %[y], [%[dst]] \n\t" -+ : // Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "ldrh %[x], [%[src]], %[stride_src] \n\t" -+ "ldrh %[y], [%[src]], %[stride_src] \n\t" -+ "strh %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldrh %[x], [%[src]], %[stride_src] \n\t" -+ "strh %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldrh %[y], [%[src]], %[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "strh %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "strh %[y], [%[dst]] \n\t" -+ : // Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "ldrb %[x], [%[src]], %[stride_src] \n\t" -+ "ldrb %[y], [%[src]], %[stride_src] \n\t" -+ "strb %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldrb %[x], [%[src]], %[stride_src] \n\t" -+ "strb %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldrb %[y], [%[src]], %[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "strb %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "strb %[y], [%[dst]] \n\t" -+ : 
// Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon -+static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ if (stride_dst == 1 << pixel_shift) -+ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src); -+ else if (stride_src == 1 << pixel_shift) -+ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); -+ else -+ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src); -+} -+ -+#endif /* HAVE_NEON_INLINE */ -+ -+#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */ -diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h -new file mode 100644 -index 0000000000..c73de55a48 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_mv_arm.h -@@ -0,0 +1,64 @@ -+#ifndef AVCODEC_ARM_RPI_HEVC_MV_H -+#define AVCODEC_ARM_RPI_HEVC_MV_H -+ -+#if HAVE_ARMV6T2_INLINE -+static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) -+{ -+ MvXY r; -+ __asm__ ( -+ "sadd16 %[r], %[a], %[b] \n\t" -+ : [r]"=r"(r) -+ : [a]"r"(a), -+ [b]"r"(b) -+ : -+ ); -+ return r; -+} -+#define mvxy_add mvxy_add_arm -+#endif -+ -+#if HAVE_ARMV6T2_INLINE -+#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV)) -+static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) -+{ -+ int t; -+ __asm__ ( -+ "ssat %[td], #8, %[td] \n\t" -+ "ssat %[tb], #8, %[tb] \n\t" -+ "eor %[t], %[td], %[td], asr #31 \n\t" -+ "adds %[t], %[t], %[td], lsr #31 \n\t" -+ "asr %[t], #1 \n\t" -+ "add %[t], #0x4000 \n\t" -+ "it ne \n\t" -+ "sdivne %[t], %[t], %[td] \n\t" -+ "mov %[td], #32 \n\t" -+ "smlabb %[td], %[t], %[tb], %[td] \n\t" -+ "ssat %[td], #13, %[td], asr #6 \n\t" -+ "mov %[tb], #127 \n\t" -+ "smlatb %[t], %[xy], %[td], %[tb] \n\t" -+ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" -+// This takes the sign of x & y for rounding at the "wrong" point -+// (i.e. after adding 127) but for the range of values (-1,-127) -+// where it does the wrong thing you get the right answer (0) anyway -+ "add %[t], %[t], %[t], lsr #31 \n\t" -+ "add %[xy], %[tb], %[tb], lsr #31 \n\t" -+ "ssat %[t], #16, %[t], asr #8 \n\t" -+ "ssat %[xy], #16, %[xy], asr #8 \n\t" -+ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" -+ : -+ [t]"=&r"(t), -+ [xy]"+r"(xy), -+ [td]"+r"(td), -+ [tb]"+r"(tb) -+ : -+ : -+ "cc" -+ ); -+ return xy; -+} -+#define mv_scale_xy mv_scale_xy_arm -+#endif -+#endif -+ -+#endif // AVCODEC_ARM_RPI_HEVC_MV_H -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h -new file mode 100644 -index 0000000000..62b9326532 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_arm.h -@@ -0,0 +1,26 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
-+#define AVCODEC_ARM_HEVCDSP_ARM_H
-+
-+#include "libavcodec/rpi_hevcdsp.h"
-+
-+void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth);
-+
-+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
-diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
-new file mode 100644
-index 0000000000..18a76a4112
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
-@@ -0,0 +1,1633 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
-+        vsubl.u8   q0, \Q0a, \P0a
-+        vsubl.u8   q1, \P1a, \Q1a
-+        vdup.16    d4, r2
-+        \I1
-+        vshl.i16   q0, #2
-+        \I2
-+        vadd.i16   q0, q1
-+        \I3
-+        vmovl.u8   q2, d4
-+        \I4
-+        vneg.s16   q1, q2
-+        \I5
-+        vrshr.s16  q0, #3
-+        \I6
-+        \I7
-+        \I8
-+        vmin.s16   q0, q2
-+        vmovl.u8   q2, \Q0a
-+        vmax.s16   q0, q1
-+        vaddw.u8   q1, q0, \P0a
-+        vsub.i16   q0, q2, q0
-+        vqmovun.s16 \P0a, q1
-+        vqmovun.s16 \Q0a, q0
-+.endm
-+
-+
-+.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
-+        vsubl.u8   q0, \Q0a, \P0a      @ q0a - p0a
-+        lsr        r12, r2, #16
-+        vsubl.u8   q1, \Q0b, \P0b      @ q0b - p0b
-+        vsubl.u8   q2, \P1a, \Q1a      @ p1a - q1a
-+        vsubl.u8   q3, \P1b, \Q1b      @ p1b - q1b
-+        vshl.i16   q0, #2              @ (q0a - p0a) * 4
-+        vshl.i16   q1, #2              @ (q0b - p0b) * 4
-+        vadd.i16   q0, q2              @ ((q0a - p0a) * 4) + p1a - q1a
-+        vadd.i16   q1, q3              @ ((q0b - p0b) * 4) + p1b - q1b
-+        vdup.16    d4, r2              @ tc0a, tc0b
-+        vdup.16    d6, r12             @ tc1a, tc1b
-+        vrshr.s16  q0, #3              @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
-+        \I1
-+        vrshr.s16  q1, #3              @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
-+        \I2
-+        vmovl.u8   q2, d4              @ tc0a, tc0b
-+        \I3
-+        vmovl.u8   q3, d6              @ tc1a, tc1b
-+        \I4
-+        vmin.s16   q0, q2
-+        \I5
-+        vneg.s16   q2, q2              @ -tc0a, -tc0b
-+        \I6
-+        vmin.s16   q1, q3
-+        \I7
-+        vneg.s16   q3, q3              @ -tc1a, -tc1b
-+        vmax.s16   q0, q2              @ delta0a
-+        vmovl.u8   q2, \Q0a
-+        vmax.s16   q1, q3              @ delta0b
-+        vaddw.u8   q3, q0, \P0a        @ p0a + delta0a
-+        vsub.i16   q0, q2, q0          @ q0a - delta0a
-+        vmovl.u8   q2, \Q0b
-+        vsub.i16   q2, q1              @ q0b - delta0b
-+        vaddw.u8   q1, \P0b            @ p0b + delta0b
-+        vqmovun.s16 \Q0a, q0
-+        vqmovun.s16 \P0a, q3
-+        vqmovun.s16 \Q0b, q2
-+        vqmovun.s16 \P0b, q1
-+.endm
-+
-+
-+@ Preserves r12
-+@ Clobbers r2
-+@ P0a et al all contain UVUVUVUV
-+@ r2 (tc4) contains
-+@ [0..7]   tc U a
-+@ [8..15]  tc V a
-+
-+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
-+        vsub.i16   q0, \Q0a,
\P0a -+ vsub.i16 q1, \P1a, \Q1a -+ vdup.16 d4, r2 -+ \I1 -+ vshl.i16 q0, #2 -+ \I2 -+ vadd.i16 q0, q1 -+ \I3 -+ vshll.u8 q2, d4, #\bit_depth - 8 -+ \I4 -+ vneg.s16 q1, q2 -+ \I5 -+ vrshr.s16 q0, #3 -+ \I6 -+ \I7 -+ \I8 -+ vmin.s16 q0, q2 -+ vmov.i16 q2, #0 -+ vmax.s16 q0, q1 -+ vadd.i16 \P0a, q0 -+ vsub.i16 \Q0a, q0 -+ vmov.i16 q1, #(1 << \bit_depth) - 1 -+ vmax.s16 \P0a, q2 -+ vmax.s16 \Q0a, q2 -+ vmin.s16 \P0a, q1 -+ vmin.s16 \Q0a, q1 -+.endm -+ -+@ Clobbers r2, r12 -+@ P0a et al all contain UVUVUVUV -+@ r2 (tc4) contains -+@ [0..7] tc U a -+@ [8..15] tc V a -+@ [16..23] tc U b -+@ [24..31] tc V b -+ -+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7 -+ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a -+ lsr r12, r2, #16 -+ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b -+ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a -+ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b -+ vshl.i16 q0, #2 @ (q0a - p0a) * 4 -+ vshl.i16 q1, #2 @ (q0b - p0b) * 4 -+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a -+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b -+ vdup.16 d4, r2 @ tc0a, tc0b -+ vdup.16 d6, r12 @ tc1a, tc1b -+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 -+ \I1 -+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 -+ \I2 -+ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b -+ \I3 -+ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b -+ \I4 -+ vmin.s16 q0, q2 -+ \I5 -+ vneg.s16 q2, q2 @ -tc0a, -tc0b -+ \I6 -+ vmin.s16 q1, q3 -+ \I7 -+ vneg.s16 q3, q3 @ -tc1a, -tc1b -+ vmax.s16 q0, q2 @ delta0a -+ vadd.i16 \P0a, q0 @ p0a + delta0a -+ vsub.i16 \Q0a, q0 @ q0a - delta0a -+ vmax.s16 q1, q3 @ delta0b -+ vadd.i16 \P0b, q1 @ p0b + delta0b -+ vsub.i16 \Q0b, q1 @ q0b - delta0b -+ vmov.i16 q2, #0 -+ vmov.i16 q3, #(1 << \bit_depth) - 1 -+ vmax.s16 \P0a, q2 -+ vmax.s16 \Q0a, q2 -+ vmax.s16 \P0b, q2 -+ vmax.s16 \Q0b, q2 -+ vmin.s16 \P0a, q3 -+ vmin.s16 \Q0a, q3 -+ vmin.s16 \P0b, q3 -+ vmin.s16 \Q0b, q3 -+.endm -+ -+ -+ -+@ uint8_t *_no_p, [sp+0] -+@ uint8_t *_no_q) [sp+4] -+ -+.macro hevc_loop_filter_luma_start -+ ldr r12, [r3] -+ ldr r3, [r3, #4] -+ orrs r3, r12, r3, lsl #16 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldrd r4, r5, [sp, #32] @ &_no_p -+ ldrb r4, [r4] -+ ldrb r5, [r5] -+ movs r10, r4 -+ it ne -+ movne r10, #1 -+ cmp r5, #0 -+ it ne -+ orrne r10, #2 -+.endm -+ -+@ Input: -+@ r2 beta (raw: needs shift for bitdepth > 8) -+@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) -+@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) -+@ -+@ Input & output -+@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3) -+@ 16-bit: q8-q15 -+@ -+@ r1 -r1 -+@ r10 b1->C, b0->N (r10 junk) -+@ -+@ Junks: -+@ r5, r6, r7, r8, r9 -+ -+.macro m_filter_luma bit_depth, Q11, Q15 -+.if \bit_depth == 8 -+ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 -+ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1 -+ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 -+ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 -+ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 -+ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 -+.endif -+ vadd.i16 q0, q9, \Q11 @ P2 + P0 -+.if \bit_depth > 8 -+ lsl r3, r3, #(\bit_depth - 8) -+.endif -+ vadd.i16 q1, q14, q12 @ Q2 + Q0 -+.if \bit_depth > 8 -+ lsl r2, r2, #(\bit_depth - 8) -+.endif -+ vsub.i16 q0, q10 @ P2 - P1 + P0 -+ lsr r5, r3, #16 -+ vsub.i16 q1, q13 @ Q2 - Q1 + Q0 -+.if \bit_depth == 8 -+ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... 
P3 -+ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3 -+.endif -+ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) -+ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) -+ vmov.i64 q2, #0xffffffff0000 -+ vbic q0, q2 @ only dp0(') and dp3(') -+ vbic q1, q2 @ only dq0(') and dq3(') -+ vsra.u64 q0, #16 -+ vsra.u64 q1, #16 -+ vdup.16 q3, r2 @ beta -+ vdup.16 d14, r3 @ tC[0] -+ vdup.16 d15, r5 @ tC[1] -+ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0) -+ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 -+ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 -+ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 -+ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) -+ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0) -+ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 -+ vshl.s16 q6, q7, #2 @ tC[] * 4 -+ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 -+ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) -+ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) -+ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 -+ cmp r7, #0 -+ beq .Lbypasswrite -+ -+ vcgt.s16 q5, q6, q5 @ if < tc25 -+ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) -+ vand q4, q5 -+ vbic d8, d4 -+ vbic d9, d4 -+ vshr.s16 q3, #2 @ beta_2 = beta >> 2 -+ vsra.u64 q4, #16 -+ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 -+ vshl.i16 q7, #1 @ tc2 = tC[] << 1 -+ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc -+ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half -+ vand d6, d8 @ && beta_2 tests, prime in ms half -+ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 -+ vneg.s16 q6, q7 @ -tc2 -+ vmovn.i32 d8, q3 -+ vshrn.i32 d6, q3, #16 -+ vand d6, d8 -+ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 -+ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) -+ vadd.i16 q0, \Q11, q12 @ p0 + q0 -+ ands r9, r7, r8 -+ beq 1f -+ -+ vadd.i16 q2, q0, q10 @ p1 + p0 + q0 -+ vadd.i16 q3, q0, q13 @ p0 + q0 + q1 -+ lsr r3, r9, #16 -+ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) -+ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) -+ vadd.i16 q0, q8, q9 @ p3 + p2 -+ vadd.i16 q5, \Q15, q14 @ q2 + q3 -+ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 -+ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 -+ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 -+ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 -+ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping) -+ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) -+ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) -+ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) -+ vrshr.s16 q0, #3 @ scale, with rounding -+ vrshr.s16 q5, #3 -+ vrshr.s16 q1, #2 -+ vrshr.s16 q4, #2 -+ vrshr.s16 q2, #3 -+ vrshr.s16 q3, #3 -+ vsub.i16 q0, q9 @ find difference -+ vsub.i16 q5, q14 -+ vsub.i16 q1, q10 -+ vsub.i16 q4, q13 -+ vsub.i16 q2, \Q11 -+ vsub.i16 q3, q12 -+ vmax.s16 q0, q6 @ clip difference to -tc2 .. 
tc2 -+ vmax.s16 q5, q6 -+ vmax.s16 q1, q6 -+ vmax.s16 q4, q6 -+ vmax.s16 q2, q6 -+ vmax.s16 q3, q6 -+ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure -+ vdup.16 d13, r3 -+ vmin.s16 q0, q7 -+ vmin.s16 q5, q7 -+ vmin.s16 q1, q7 -+ vmin.s16 q4, q7 -+ vmin.s16 q2, q7 -+ vmin.s16 q3, q7 -+ vadd.i16 q0, q9 @ apply difference -+ vadd.i16 q5, q14 -+ vadd.i16 q1, q10 -+ vadd.i16 q4, q13 -+ vadd.i16 q2, \Q11 -+ vadd.i16 q3, q12 -+ vbit q9, q0, q6 @ apply filtered values according to mask -+ vbit q14, q5, q6 -+ vbit q10, q1, q6 -+ vbit q13, q4, q6 -+ vbit \Q11, q2, q6 -+ vbit q12, q3, q6 -+ vneg.s16 q6, q7 @ restore -tc2 -+ -+1: -+ bics r9, r7, r8 -+ beq 2f -+ -+ vsub.i16 q0, q12, \Q11 @ q0 - p0 -+ vsub.i16 q1, q13, q10 @ q1 - p1 -+ lsr r3, r9, #16 -+ vshl.i16 q2, q0, #3 -+ lsr r7, r5, #16 -+ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) -+ lsr r8, r6, #16 -+ vshl.i16 q2, q1, #1 -+ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) -+ vshr.s16 q6, #1 @ -tc = -tc2 >> 1 -+ vsub.i16 q5, q3, q4 -+ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 -+ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 -+ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 -+ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 -+ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 -+ vmax.s16 q6, q5 @ -+ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 -+ vdup.16 q0, r2 @ beta -+ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] -+ vshr.s16 q4, #1 @ tc_2 = tc >> 1 -+ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1 -+ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 -+ vshr.s16 q2, q0, #1 @ beta >> 1 -+ vadd.i16 q2, q0 @ beta + (beta >> 1) -+ vneg.s16 q0, q4 @ -tc_2 -+ vabs.s16 q5, q5 @ abs(original delta0) -+ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 -+ vmax.s16 q1, q0 -+ vmax.s16 q3, q0 -+ vshl.s16 q0, q7, #2 @ 8 * tc -+ vadd.i16 q7, q0 @ 10 * tc -+ vdup.16 d0, r9 -+ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering -+ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2) -+ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 + delta0) >> 1, -tc_2, tc_2) -+ vdup.16 d8, r5 @ dp0 + dp3 -+ vdup.16 d9, r7 @ dp0' + dp3' -+ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) -+ vdup.16 d10, r6 @ dq0 + dq3 -+ vdup.16 d11, r8 @ dq0' + dq3' -+ vand q7, q0 @ AND block and line masks -+ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) -+ vadd.i16 q0, q1, q10 @ p1 + deltap1 -+ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. 
if (nd_q > 1) -+ vadd.i16 q3, q3, q13 @ q1 + deltaq1 -+ vadd.i16 q1, \Q11, q6 @ p0 + delta0 -+ vsub.i16 q2, q12, q6 @ q0 - delta0 -+ vand q4, q7 @ AND nd_p test with block/line masks -+ vand q5, q7 @ AND nd_q test with block/line masks -+ vbit q10, q0, q4 -+ vbit \Q11, q1, q7 -+ vbit q12, q2, q7 -+ vbit q13, q3, q5 -+ -+2: -+.if \bit_depth == 8 -+ vmovn.i16 d16, q8 -+ vmovn.i16 d23, \Q15 -+ neg r1, r1 -+ vqmovun.s16 d17, q9 -+ vqmovun.s16 d18, q10 -+ vqmovun.s16 d19, \Q11 -+ lsls r10, #31 -+ vqmovun.s16 d20, q12 -+ vqmovun.s16 d21, q13 -+ vqmovun.s16 d22, q14 -+.else -+ vmov.i16 q0, #0 -+ vmov.i16 q1, #(1 << \bit_depth - 1) -+ @ q8 & q15 should be unaltered and so don't require clipping -+ neg r1, r1 -+ vmax.s16 q9, q0 -+ vmax.s16 q10, q0 -+ vmax.s16 q11, q0 -+ vmax.s16 q12, q0 -+ vmax.s16 q13, q0 -+ vmax.s16 q14, q0 -+ lsls r10, #31 -+ vmin.s16 q9, q1 -+ vmin.s16 q10, q1 -+ vmin.s16 q11, q1 -+ vmin.s16 q12, q1 -+ vmin.s16 q13, q1 -+ vmin.s16 q14, q1 -+.endif -+ bx lr -+.endm -+ -+function hevc_loop_filter_luma_body -+ m_filter_luma 8, q15, q11 -+endfunc -+ -+@ void ff_hevc_rpi_v_loop_filter_luma_neon_8( -+@ uint8_t *_pix, [r0] -+@ ptrdiff_t _stride, [r1] -+@ int _beta, [r2] -+@ int *_tc, [r3] -+@ uint8_t *_no_p, [sp+0] -+@ uint8_t *_no_q) [sp+4] -+ -+function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1 -+ hevc_loop_filter_luma_start -+ -+ sub r4, r0, #4 -+ b .Lv_loop_luma_common -+endfunc -+ -+@ void ff_hevc_rpi_v_loop_filter2_luma_neon( -+@ uint8_t * pix_r, [r0] -+@ ptrdiff_t _stride, [r1] -+@ int _beta, [r2] -+@ int tc2, [r3] -+@ int no_f, [sp+0] -+@ uint8_t * pix_l) [sp+4] -+ -+function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r4, [sp, #36] -+ ldr r10, [sp, #32] -+ -+.Lv_loop_luma_common: -+ vpush {d8-d15} -+ -+ @ It's slightly faster to do unlaned loads and transpose in the -+ @ 8-bit case, even though it needs more instructions, because -+ @ VLD4.8 is a really slow way to read from memory. 
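-+        @ r4 points 4 bytes left of the vertical edge (p3..p0 of each row)
-+        @ and r0 at the edge itself (q0..q3).  The 32-bit lane loads below
-+        @ gather 8 rows a side; the VUZP/VSWP sequence then transposes them
-+        @ so that d16-d23 each hold one pixel column (p3 .. q3), the layout
-+        @ hevc_loop_filter_luma_body expects.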
-+ vld1.32 {d16[0]}, [r4:32], r1 -+ vld1.32 {d20[0]}, [r0:32], r1 -+ vld1.32 {d16[1]}, [r4:32], r1 -+ vld1.32 {d20[1]}, [r0:32], r1 -+ vld1.32 {d17[0]}, [r4:32], r1 -+ vld1.32 {d21[0]}, [r0:32], r1 -+ vld1.32 {d17[1]}, [r4:32], r1 -+ vld1.32 {d21[1]}, [r0:32], r1 -+ vld1.32 {d18[0]}, [r4:32], r1 -+ vld1.32 {d22[0]}, [r0:32], r1 -+ vld1.32 {d18[1]}, [r4:32], r1 -+ vld1.32 {d22[1]}, [r0:32], r1 -+ vld1.32 {d19[0]}, [r4:32], r1 -+ vld1.32 {d23[0]}, [r0:32], r1 -+ vld1.32 {d19[1]}, [r4:32] -+ vld1.32 {d23[1]}, [r0:32] -+ vuzp.16 q8, q9 -+ vuzp.16 q10, q11 -+ vuzp.8 q8, q9 -+ vuzp.8 q10, q11 -+ vswp d17, d18 -+ vswp d21, d22 -+ -+ bl hevc_loop_filter_luma_body -+ -+ add r6, r4, r1 -+ add r2, r0, r1 -+ lsl r1, #1 -+ -+ vpop {d8-d15} -+ -+ @ no_p[1] -+ bmi 1f -+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 -+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 -+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 -+ -+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 -+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] -+1: -+ @ no_q[1] -+ bcs 1f -+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 -+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 -+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 -+ -+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 -+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] -+1: -+ pop {r4-r10,pc} -+ -+.Lbypasswrite: -+ vpop {d8-d15} -+ pop {r4-r10,pc} -+endfunc -+ -+.macro m_filter_v_luma_16 bit_depth -+ vpush {d8-d15} -+ -+ @ Uses slightly fewer instructions to do laned loads than unlaned -+ @ and transpose. 
This also means that we can use the same code for -+ @ both split & unsplit deblock -+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 -+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 -+ -+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 -+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 -+ -+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 -+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 -+ -+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 -+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 -+ -+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 -+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 -+ -+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 -+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 -+ -+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 -+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 -+ -+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] -+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] -+ -+ bl hevc_loop_filter_luma_body_\bit_depth -+ -+ add r6, r4, r1 -+ add r2, r0, r1 -+ lsl r1, #1 -+ -+ vpop {d8-d15} -+ -+ @ p[1] -+ bmi 1f -+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 -+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1 -+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 -+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1 -+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 -+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1 -+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 -+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6] -+1: -+ @ q[1] -+ bcs 1f -+ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 -+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1 -+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 -+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1 -+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 -+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 -+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 -+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] -+1: -+ pop {r4-r10,pc} -+.endm -+ -+ -+ -+ -+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] -+@ ptrdiff_t stride, [r1] -+@ int beta, [r2] -+@ int32_t *tc, [r3] -+@ uint8_t *no_p, sp[0] -+@ uint8_t *no_q); sp[4] -+@ -+@ Src should always be on 8 byte boundry & all in the same slice -+ -+function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1 -+ hevc_loop_filter_luma_start -+ b .Lh_loop_filter_luma_common_8 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r10, [sp, #32] -+ -+.Lh_loop_filter_luma_common_8: -+ sub r4, r0, r1, lsl #2 -+ add r0, r4, r1 -+ lsl r1, #1 -+ vpush {d8-d15} -+ -+ vld1.8 {d16}, [r4], r1 -+ vld1.8 {d17}, [r0], r1 -+ vld1.8 {d18}, [r4], r1 -+ vld1.8 {d19}, [r0], r1 -+ vld1.8 {d20}, [r4], r1 -+ vld1.8 {d21}, [r0], r1 -+ vld1.8 {d22}, [r4] -+ vld1.8 {d23}, [r0] -+ -+ bl hevc_loop_filter_luma_body -+ -+ add r0, r0, r1, lsl #1 -+ add r2, r4, r1, lsl #1 -+ add r6, r4, r1, asr #1 -+ vpop {d8-d15} -+ -+ @ P2-P0 -+ bcs 1f -+ vst1.8 {d22}, [r4], r1 -+ vst1.8 {d21}, [r6] -+ vst1.8 {d20}, [r4] -+1: -+ @ Q0-Q2 -+ bmi 1f -+ vst1.8 {d19}, [r0], r1 -+ vst1.8 {d18}, [r2] -+ vst1.8 {d17}, [r0] -+1: -+ pop {r4-r10,pc} -+endfunc -+ -+ -+.macro m_filter_h_luma_16 bit_depth -+ sub r4, r0, r1, lsl #2 -+ add r0, r4, r1 -+ lsl r1, #1 -+ vpush {d8-d15} -+ -+ vld1.16 { q8}, [r4], r1 -+ vld1.16 { q9}, [r0], r1 -+ vld1.16 {q10}, [r4], r1 -+ vld1.16 {q11}, [r0], r1 -+ vld1.16 {q12}, [r4], r1 -+ vld1.16 {q13}, [r0], r1 -+ vld1.16 {q14}, [r4] -+ vld1.16 
{q15}, [r0] -+ -+ bl hevc_loop_filter_luma_body_\bit_depth -+ -+ add r0, r0, r1, lsl #1 -+ add r2, r4, r1, lsl #1 -+ add r6, r4, r1, asr #1 -+ vpop {d8-d15} -+ -+ @ P2-P0 -+ bcs 1f -+ vst1.16 {q14}, [r4], r1 -+ vst1.16 {q13}, [r6] -+ vst1.16 {q12}, [r4] -+1: -+ bmi 1f -+ vst1.16 {q11}, [r0], r1 -+ vst1.16 {q10}, [r2] -+ vst1.16 { q9}, [r0] -+1: -+ pop {r4-r10,pc} -+.endm -+ -+ -+@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ unsigned int no_f); // r3 -+@ -+@ no_f -+@ 0 tl P0 -+@ 1 tr P1 -+@ 2 bl Q0 -+@ 3 br Q1 -+@ -+@ Probably not worth having the P/Qa only special case in this direction -+@ Given layout we won't save any memory reads or avoid any cache dirtying -+@ We would save a bit of computation but I expect the partials to be less -+@ common in the H direction than V due to how we arrange deblock. -+ -+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 -+ sub r12, r0, r1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ vld1.8 {d26,d27}, [r0] -+ lsl r1, #1 -+ sub r0, r1 -+ vld1.8 {d18,d19}, [r12], r1 -+ vld1.8 {d16,d17}, [r0], r1 -+ vld1.8 {d28,d29}, [r12] -+ -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ -+ "sub r12, r0, r1, asr #1" -+ -+ lsls r3, #29 @ b2 -> N, b3 -> C -+ it pl -+ vstrpl d26, [r0, #0] -+ it cc -+ vstrcc d27, [r0, #8] -+ lsls r3, #2 @ b0 -> N, b1 -> C -+ it pl -+ vstrpl d18, [r12, #0] -+ it cc -+ vstrcc d19, [r12, #8] -+ bx lr -+ -+endfunc -+ -+ -+@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ unsigned int no_f); // r3 -+@ -+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] -+@ -+@ Macro here actual function near bottom -+ -+.macro m_filter_h_uv_16 bit_depth -+ sub r12, r0, r1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ vld1.16 {q12, q13}, [r0] -+ lsl r1, #1 -+ sub r0, r1 -+ vld1.16 {q10, q11}, [r12], r1 -+ vld1.16 {q8, q9 }, [r0], r1 -+ vld1.16 {q14, q15}, [r12] -+ -+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ -+ "sub r12, r0, r1, asr #1", \ -+ "cmp r3, #0" -+ -+ bne 1f -+ vst1.16 {q10, q11}, [r12] -+ vst1.16 {q12, q13}, [r0] -+ bx lr -+ -+ @ At least one no_f bit is set -+ @ Which means we need to break this apart in an ugly fashion -+1: -+ lsls r3, #29 @ b2 -> N, b3 -> C -+ itt pl -+ vstrpl d24, [r0, #0] -+ vstrpl d25, [r0, #8] -+ itt cc -+ vstrcc d26, [r0, #16] -+ vstrcc d27, [r0, #24] -+ lsls r3, #2 @ b0 -> N, b1 -> C -+ itt pl -+ vstrpl d20, [r12, #0] -+ vstrpl d21, [r12, #8] -+ itt cc -+ vstrcc d22, [r12, #16] -+ vstrcc d23, [r12, #24] -+ bx lr -+.endm -+ -+ -+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ uint8_t * src_l, // r3 -+@ unsigned int no_f); // sp[0] -+@ -+@ no_f: -+@ 0 tl P0 -+@ 1 tr Q0 -+@ 2 bl P1 -+@ 3 br Q1 -+ -+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ push {lr} -+ vld2.16 {d16[0], d18[0]}, [r3], r1 -+ vld2.16 {d20[0], d22[0]}, [r0], r1 -+ -+ cmp r2, #0x10000 -+ vld2.16 {d16[1], d18[1]}, [r3], r1 -+ vld2.16 {d20[1], d22[1]}, [r0], r1 -+ -+ vld2.16 {d16[2], d18[2]}, [r3], r1 -+ vld2.16 {d20[2], d22[2]}, [r0], r1 -+ -+ vld2.16 {d16[3], d18[3]}, [r3], r1 -+ vld2.16 {d20[3], d22[3]}, [r0], r1 -+ blo 10f -+ -+ vld2.16 {d17[0], d19[0]}, [r3], r1 -+ vld2.16 {d21[0], d23[0]}, [r0], r1 -+ -+ sub ip, r0, r3 -+ vld2.16 {d17[1], d19[1]}, [r3], r1 -+ vld2.16 {d21[1], d23[1]}, [r0], r1 -+ -+ cmp ip, #4 -+ vld2.16 {d17[2], 
d19[2]}, [r3], r1 -+ vld2.16 {d21[2], d23[2]}, [r0], r1 -+ -+ vld2.16 {d17[3], d19[3]}, [r3] -+ vld2.16 {d21[3], d23[3]}, [r0] -+ -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ -+ "ldr lr, [sp, #4]", \ -+ "neg r1, r1", \ -+ "it eq; cmpeq lr, #0", \ -+ "add r3, #2", \ -+ "add ip, r3, r1", \ -+ "add r2, r0, r1", \ -+ "lsl r1, #1" -+ -+ bne 1f -+ -+@ Much/most of the time r0 == r3 + 4 and no_f == 0 -+@ so it is worth having this special case -+ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b -+ vst2.16 {d19[2], d21[2]}, [ip], r1 -+ vst2.16 {d19[1], d21[1]}, [r3], r1 -+ vst2.16 {d19[0], d21[0]}, [ip], r1 -+ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a -+ vst2.16 {d18[2], d20[2]}, [ip], r1 -+ vst2.16 {d18[1], d20[1]}, [r3] -+ vst2.16 {d18[0], d20[0]}, [ip] -+ pop {pc} -+ -+@ Either split or partial -+1: -+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 -+ ittt cs -+ addcs r0, r0, r1, lsl #1 -+ addcs r2, r2, r1, lsl #1 -+ bcs 1f -+ @ Q0b -+ vst1.16 {d21[3]}, [r0], r1 -+ vst1.16 {d21[2]}, [r2], r1 -+ vst1.16 {d21[1]}, [r0], r1 -+ vst1.16 {d21[0]}, [r2], r1 -+1: -+ ittt mi -+ addmi r3, r3, r1, lsl #1 -+ addmi ip, ip, r1, lsl #1 -+ bmi 1f -+ @ P0b -+ vst1.16 {d19[3]}, [r3], r1 -+ vst1.16 {d19[2]}, [ip], r1 -+ vst1.16 {d19[1]}, [r3], r1 -+ vst1.16 {d19[0]}, [ip], r1 -+1: -+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 -+ bcs 1f -+ @ Q0a -+ vst1.16 {d20[3]}, [r0], r1 -+ vst1.16 {d20[2]}, [r2], r1 -+ vst1.16 {d20[1]}, [r0] -+ vst1.16 {d20[0]}, [r2] -+1: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.16 {d18[3]}, [r3], r1 -+ vst1.16 {d18[2]}, [ip], r1 -+ vst1.16 {d18[1]}, [r3] -+ vst1.16 {d18[0]}, [ip] -+ pop {pc} -+ -+@ Single lump (rather than double) -+10: -+ @ As we have post inced r0/r3 in the load the easiest thing to do is -+ @ to subtract and write forwards, rather than backwards (as above) -+ @ b0 (P0a) -> N, b1 (Q0a) -> C -+ -+ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ -+ "ldr lr, [sp, #4]", \ -+ "add r3, #2", \ -+ "sub r0, r0, r1, lsl #2", \ -+ "sub r3, r3, r1, lsl #2", \ -+ "lsls lr, #31", \ -+ "add r2, r0, r1", \ -+ "add ip, r3, r1", \ -+ "lsl r1, #1" -+ -+ bcs 3f -+ @ Q0a -+ vst1.16 {d20[0]}, [r0], r1 -+ vst1.16 {d20[1]}, [r2], r1 -+ vst1.16 {d20[2]}, [r0] -+ vst1.16 {d20[3]}, [r2] -+3: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.16 {d18[0]}, [r3], r1 -+ vst1.16 {d18[1]}, [ip], r1 -+ vst1.16 {d18[2]}, [r3] -+ vst1.16 {d18[3]}, [ip] -+ pop {pc} -+ -+endfunc -+ -+ -+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ uint8_t * src_l, // r3 -+@ unsigned int no_f); // sp[0] -+@ -+ -+@ no_f -+@ 0 tl P0a -+@ 1 tr Q0a -+@ 2 bl P0b -+@ 3 br Q0b -+ -+@ P1: q8, q12 -+@ P0: q9, q13 -+@ Q0: q10, q14 -+@ Q1: q11, q15 -+ -+.macro m_filter_v_uv2_16 bit_depth -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ push {lr} -+ vld2.32 {d16[0], d18[0]}, [r3], r1 -+ vld2.32 {d20[0], d22[0]}, [r0], r1 -+ -+ cmp r2, #0x10000 -+ vld2.32 {d16[1], d18[1]}, [r3], r1 -+ vld2.32 {d20[1], d22[1]}, [r0], r1 -+ -+ vld2.32 {d17[0], d19[0]}, [r3], r1 -+ vld2.32 {d21[0], d23[0]}, [r0], r1 -+ -+ vld2.32 {d17[1], d19[1]}, [r3], r1 -+ vld2.32 {d21[1], d23[1]}, [r0], r1 -+ blo 10f -+ -+ vld2.32 {d24[0], d26[0]}, [r3], r1 -+ vld2.32 {d28[0], d30[0]}, [r0], r1 -+ -+ sub ip, r0, r3 -+ vld2.32 {d24[1], d26[1]}, [r3], r1 -+ vld2.32 {d28[1], d30[1]}, [r0], r1 -+ -+ cmp ip, #8 -+ vld2.32 {d25[0], d27[0]}, [r3], r1 -+ vld2.32 {d29[0], d31[0]}, [r0], r1 -+ -+ vld2.32 {d25[1], d27[1]}, [r3] -+ vld2.32 {d29[1], d31[1]}, [r0] 
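-+ @ hevc_loop_filter_uv_body2_16 (defined further up this file) runs
-+ @ the normative HEVC chroma weak filter on both register banks, with
-+ @ the quoted instructions below interleaved into its body. Per pixel
-+ @ pair it is roughly the C below:
-+ @   delta = av_clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc);
-+ @   P0 = av_clip_pixel(p0 + delta);
-+ @   Q0 = av_clip_pixel(q0 - delta);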
-+ -+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ -+ "ldr lr, [sp, #4]", \ -+ "neg r1, r1", \ -+ "it eq; cmpeq lr, #0", \ -+ "add r3, #4", \ -+ "add ip, r3, r1", \ -+ "add r2, r0, r1", \ -+ "lsl r1, #1" -+ -+ bne 1f -+ -+@ Much/most of the time r0 == r3 + 8 and no_f == 0 -+@ so it is worth having this special case -+ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b -+ vst2.32 {d27[0], d29[0]}, [ip], r1 -+ vst2.32 {d26[1], d28[1]}, [r3], r1 -+ vst2.32 {d26[0], d28[0]}, [ip], r1 -+ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a -+ vst2.32 {d19[0], d21[0]}, [ip], r1 -+ vst2.32 {d18[1], d20[1]}, [r3] -+ vst2.32 {d18[0], d20[0]}, [ip] -+ pop {pc} -+ -+@ Either split or partial -+1: -+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 -+ ittt cs -+ addcs r0, r0, r1, lsl #1 -+ addcs r2, r2, r1, lsl #1 -+ bcs 1f -+ @ Q0b -+ vst1.32 {d29[1]}, [r0], r1 -+ vst1.32 {d29[0]}, [r2], r1 -+ vst1.32 {d28[1]}, [r0], r1 -+ vst1.32 {d28[0]}, [r2], r1 -+1: -+ ittt mi -+ addmi r3, r3, r1, lsl #1 -+ addmi ip, ip, r1, lsl #1 -+ bmi 1f -+ @ P0b -+ vst1.32 {d27[1]}, [r3], r1 -+ vst1.32 {d27[0]}, [ip], r1 -+ vst1.32 {d26[1]}, [r3], r1 -+ vst1.32 {d26[0]}, [ip], r1 -+1: -+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 -+ bcs 1f -+ @ Q0a -+ vst1.32 {d21[1]}, [r0], r1 -+ vst1.32 {d21[0]}, [r2], r1 -+ vst1.32 {d20[1]}, [r0] -+ vst1.32 {d20[0]}, [r2] -+1: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.32 {d19[1]}, [r3], r1 -+ vst1.32 {d19[0]}, [ip], r1 -+ vst1.32 {d18[1]}, [r3] -+ vst1.32 {d18[0]}, [ip] -+ pop {pc} -+ -+@ Single lump (rather than double) -+10: -+ @ As we have post inced r0/r3 in the load the easiest thing to do is -+ @ to subtract and write forwards, rather than backwards (as above) -+ @ b0 (P0a) -> N, b1 (Q0a) -> C -+ -+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ -+ "ldr lr, [sp, #4]", \ -+ "add r3, #4", \ -+ "sub r0, r0, r1, lsl #2", \ -+ "sub r3, r3, r1, lsl #2", \ -+ "lsls lr, #31", \ -+ "add r2, r0, r1", \ -+ "add ip, r3, r1", \ -+ "lsl r1, #1" -+ -+ bcs 3f -+ @ Q0a -+ vst1.32 {d20[0]}, [r0], r1 -+ vst1.32 {d20[1]}, [r2], r1 -+ vst1.32 {d21[0]}, [r0] -+ vst1.32 {d21[1]}, [r2] -+3: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.32 {d18[0]}, [r3], r1 -+ vst1.32 {d18[1]}, [ip], r1 -+ vst1.32 {d19[0]}, [r3] -+ vst1.32 {d19[1]}, [ip] -+ pop {pc} -+.endm -+ -+ -+@ The NEON version is faster under ideal circumstances (i.e. everything in L1) -+@ But in real world testing it is ~20% slower, presumably due to code size -+ -+#if 0 // NEON version -+ -+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, -+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ * int in_inc0, int in_inc1) -+ */ -+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 -+ mov ip, sp -+ push {a1-a3,v1-v8,lr} -+ ldm ip, {v1-v6} -+ cmp a1, #2 -+ bls 2f -+ vpush {d8-d13} -+ sub v5, v5, #10 -+ sub v6, v6, #10 -+1: -+ vld2.32 {d0[0], d2[0]}, [a3]! -+ vld2.32 {d4[0], d6[0]}, [a4]! -+ vmov.u8 q12, #0 -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb v8, [a3], #1 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[0]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[0]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d16[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d20[0]}, [ip] -+ vld1.32 {d18[0]}, [v8] -+ vld1.32 {d22[0]}, [lr] -+ -+ vld2.32 {d0[1], d2[1]}, [a3]! -+ vld2.32 {d4[1], d6[1]}, [a4]! 
-+ ldrb a2, [a3], #1 -+ vmov.u16 d12, #1 -+ ldrb ip, [a4], #1 -+ vmov.u16 d13, #2 -+ ldrb v8, [a3], #1 -+ vmov.u16 d27, #4 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[2]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[2]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d16[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d20[1]}, [ip] -+ vld1.32 {d18[1]}, [v8] -+ vld1.32 {d22[1]}, [lr] -+ -+ vld2.32 {d1[0], d3[0]}, [a3]! -+ vld2.32 {d5[0], d7[0]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb lr, [a4], #1 -+ ldrb v8, [a3], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[4]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[4]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d17[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d21[0]}, [ip] -+ vld1.32 {d19[0]}, [v8] -+ vld1.32 {d23[0]}, [lr] -+ -+ vld2.32 {d1[1], d3[1]}, [a3]! -+ vld2.32 {d5[1], d7[1]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb v8, [a3], #1 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[6]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[6]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d17[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d21[1]}, [ip] -+ vld1.32 {d19[1]}, [v8] -+ vld1.32 {d23[1]}, [lr] -+ -+ @ So now we have: -+ @ q0.32[i] = curr[i].mv[0] -+ @ q1.32[i] = curr[i].mv[1] -+ @ q2.32[i] = neigh[i].mv[0] -+ @ q3.32[i] = neigh[i].mv[1] -+ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] -+ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] -+ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] -+ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] -+ @ d24.16[i] = curr[i].pred_flag -+ @ d25.16[i] = neigh[i].pred_flag -+ -+ vtst.16 d28, d24, d12 -+ vtst.16 d29, d24, d13 -+ vadd.i16 d8, d24, d12 -+ vadd.i16 d9, d25, d12 -+ vtst.16 d30, d25, d12 -+ vtst.16 d31, d25, d13 -+ veor d26, d8, d9 -+ ldr lr, [sp, 6*8 + 1*4] -+ vmovl.s16 q4, d28 -+ vmovl.s16 q5, d29 -+ teq lr, #1 -+ vmovl.s16 q14, d30 -+ it ne -+ lslne v1, lr, #1 -+ vmovl.s16 q15, d31 -+ it ne -+ rsbne v2, v1, #32 -+ vbif q0, q1, q4 -+ vbif q2, q3, q14 -+ vbif q1, q0, q5 -+ vbif q3, q2, q15 -+ vabd.s16 q12, q0, q2 -+ vabd.s16 q2, q1 -+ vabd.s16 q0, q3 -+ vabd.s16 q1, q3 -+ vbif q8, q9, q4 -+ vbif q10, q11, q14 -+ vbif q9, q8, q5 -+ vbif q11, q10, q15 -+ vclt.u16 d6, d24, d27 -+ vclt.u16 d8, d2, d27 -+ vclt.u16 d7, d25, d27 -+ vclt.u16 d9, d3, d27 -+ vclt.u16 d2, d0, d27 -+ vclt.u16 d0, d4, d27 -+ vclt.u16 d3, d1, d27 -+ vclt.u16 d1, d5, d27 -+ vceq.i32 q12, q10, q8 -+ vceq.i32 q10, q9 -+ vceq.i32 q8, q11 -+ vceq.i32 q9, q11 -+ vshrn.i32 d6, q3, #8 -+ vshrn.i32 d7, q4, #8 -+ vshrn.i32 d8, q1, #8 -+ vshrn.i32 d9, q0, #8 -+ vmovn.i32 d4, q12 -+ vmovn.i32 d2, q10 -+ vmovn.i32 d3, q8 -+ vmovn.i32 d5, q9 -+ vand q2, q3 -+ vrev16.8 q3, q3 -+ vand q2, q3 -+ vand q1, q4 -+ vrev16.8 q4, q4 -+ vand q1, q4 -+ vand d4, d5 -+ vand d2, d3 -+ vbic d0, d12, d4 -+ vshr.u16 d26, #2 -+ vbic d0, d2 -+ vmov.i16 d1, #0x5555 -+ vorr d0, d26 -+ bne 10f -+ -+ @ Merge results into result word, no duplicates -+ vmov a2, s0 -+ vmov v8, s1 -+ vmov.u16 ip, d0[1] -+ vmov.u16 lr, d0[3] -+ lsl a2, #30 -+ lsl v8, #30 -+ lsl ip, #30 -+ lsl lr, #30 -+ orr a2, ip, a2, lsr #2 -+ orr v8, lr, v8, lsr #2 -+ orr a2, v8, a2, lsr #4 -+ subs a1, #4 -+ orr v7, a2, v7, lsr #8 -+ bhi 1b -+ -+ mov a1, #32 -+ ldr a3, [sp, #6*8] -+ vpop {d8-d13} -+ sub a1, a1, a3, lsl #1 -+ mov a1, v7, lsr a1 -+ pop {a2-a4,v1-v8,pc} -+10: -+ @ Merge results into result word, with duplicates -+ vmul.i16 d0, d1 -+ vmov a2, s0 -+ vmov v8, s1 -+ vmov.u16 ip, d0[1] -+ vmov.u16 lr, d0[3] -+ lsl a2, 
v2 -+ subs a1, #4 -+ lsl v8, v2 -+ lsl ip, v2 -+ lsl lr, v2 -+ ldr v2, [sp, #6*8 + 12*4 + 1*4] -+T lsr a2, v1 -+T orr a2, ip, a2 -+A orr a2, ip, a2, lsr v1 -+ lsl ip, v1, #1 -+T lsr v8, v1 -+T orr v8, lr, v8 -+A orr v8, lr, v8, lsr v1 -+ lsl lr, v1, #2 -+T lsr a2, ip -+T orr a2, v8, a2 -+A orr a2, v8, a2, lsr ip -+ ldr v1, [sp, #6*8 + 12*4] -+T lsr v7, lr -+T orr v7, a2, v7 -+A orr v7, a2, v7, lsr lr -+ bhi 1b -+ -+ mov a1, #32 -+ ldrd a3, a4, [sp, #6*8] -+ vpop {d8-d13} -+ mls a1, a3, a4, a1 -+ mls a1, a3, a4, a1 -+ mov a1, v7, lsr a1 -+ pop {a2-a4,v1-v8,pc} -+ -+ -+2: -+ sub v5, v5, #10 -+ sub v6, v6, #10 -+ vmov.u8 d16, #0 -+ blo 3f -+ vld2.32 {d0[0], d1[0]}, [a3]! -+ vld2.32 {d2[0], d3[0]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb lr, [a4], #1 -+ ldrb v8, [a3], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d16[0]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d16[4]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d4[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d5[0]}, [ip] -+ vld1.32 {d6[0]}, [v8] -+ vld1.32 {d7[0]}, [lr] -+ -+3: -+ vld2.32 {d0[1], d1[1]}, [a3]! -+ vld2.32 {d2[1], d3[1]}, [a4]! -+ ldrb a2, [a3], #1 -+ vmov.u16 d17, #1 -+ ldrb ip, [a4], #1 -+ vmov.u16 d18, #2 -+ ldrb v8, [a3], #1 -+ vmov.u16 d19, #4 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d16[2]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d16[6]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d4[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d5[1]}, [ip] -+ vld1.32 {d6[1]}, [v8] -+ vld1.32 {d7[1]}, [lr] -+ -+ @ So now we have: -+ @ d0.32[i] = curr[i].mv[0] -+ @ d1.32[i] = curr[i].mv[1] -+ @ d2.32[i] = neigh[i].mv[0] -+ @ d3.32[i] = neigh[i].mv[1] -+ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] -+ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] -+ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] -+ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] -+ @ d16.16[i] = curr[i].pred_flag -+ @ d16.16[2+i] = neigh[i].pred_flag -+ -+ vtst.16 d20, d16, d17 -+ vtst.16 d22, d16, d18 -+ vadd.i16 d30, d16, d17 -+ vswp d2, d3 -+ ldr lr, [sp, #1*4] -+ vmovl.s16 q10, d20 -+ teq lr, #1 -+ vmovl.s16 q11, d22 -+ it ne -+ lslne v1, lr, #1 -+ vbif d0, d1, d20 -+ vbif d4, d6, d20 -+ vbif d3, d2, d21 -+ vbif d5, d7, d21 -+ vbif d1, d0, d22 -+ vbif d6, d4, d22 -+ vbif d2, d3, d23 -+ vbif d7, d5, d23 -+ vshr.u16 d30, #2 -+ vabd.s16 d24, d0, d3 -+ vabd.s16 d25, d1, d2 -+ vabd.s16 q0, q0, q1 -+ vceq.i32 d2, d4, d5 -+ vceq.i32 d20, d5, d6 -+ vceq.i32 d21, d4, d7 -+ vceq.i32 d3, d6, d7 -+ vclt.u16 d6, d24, d19 -+ vclt.u16 d7, d25, d19 -+ vclt.u16 d22, d1, d19 -+ vclt.u16 d23, d0, d19 -+ vshrn.i32 d6, q3, #8 -+ vmovn.i32 d2, q1 -+ vshrn.i32 d7, q11, #8 -+ vmovn.i32 d3, q10 -+ vand q0, q3, q1 -+ it ne -+ rsbne v2, v1, #32 -+ vrev16.8 q3, q3 -+ vand q0, q3 -+ vsra.u64 d30, #32 -+ vshr.u64 q1, q0, #32 -+ vand q0, q1 -+ vbic d0, d17, d0 -+ vand d30, d30, d17 -+ vbic d0, d1 -+ vmov.i16 d1, #0x5555 -+ vorr d0, d30 -+ bne 10f -+ -+ @ Construct result word, no duplicates -+ cmp a1, #2 -+ vmov.u16 a1, d0[1] -+ vmov.u16 a2, d0[0] -+ it eq -+ orreq a1, a2, a1, lsl #2 -+ pop {a2-a4,v1-v8,pc} -+10: -+ @ Construct result word, with duplicates -+ cmp a1, #2 -+ vmul.i16 d0, d1 -+ vmov a2, s0 -+ vmov.u16 a1, d0[1] -+ lsl a2, #16 -+ pkhbt a1, a1, a1, lsl #16 -+ lsr a2, v2 -+ lsr a1, v2 -+T itt eq -+T lsleq a1, v1 -+T orreq a1, a2, a1 -+A orreq a1, a2, a1, lsl v1 -+ pop {a2-a4,v1-v8,pc} -+endfunc -+ -+ -+ -+#else // non-NEON version -+ -+ -+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, 
const HEVCRpiMvField *neigh, -+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ * int in_inc0, in_inc1) -+ */ -+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 -+ add ip, sp, #4*4 -+ push {a2-a4,v1-v8,lr} -+ mov v6, #32 -+1: ldmdb ip, {v1-v4} -+ ldrsb v5, [a3, #8] @ curr->ref_idx -+ ldrsb v8, [a3, #9] -+ ldrsb ip, [a4, #8] @ neigh->ref_idx -+ ldrsb lr, [a4, #9] -+ ldr v1, [v1, v5, lsl #2] -+ ldrb v5, [a3, #10] @ curr->pred_flag -+ ldr v2, [v2, v8, lsl #2] -+ ldrb v8, [a4, #10] @ neigh->pred_flag -+ ldr v3, [v3, ip, lsl #2] -+ ldr v4, [v4, lr, lsl #2] -+ teq v5, #3 -+ beq 20f -+ teq v8, #3 -+ beq 90f -+ -+ tst v5, #1 -+ itee ne -+ ldrne v5, [a3, #0] @ curr->mv[0] -+ moveq v1, v2 -+ ldreq v5, [a3, #4] @ curr->mv[1] -+ tst v8, #1 -+ itee ne -+ ldrne v8, [a4, #0] @ neigh->mv[0] -+ moveq v3, v4 -+ ldreq v8, [a4, #4] @ neigh->mv[1] -+ teq v1, v3 -+ bne 10f -+ ldr lr, =0xFFFCFFFC -+ ssub16 ip, v8, v5 -+ ssub16 v5, v5, v8 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ @ drop through -+10: it ne -+ movne v5, #1<<30 -+11: -+ sub v6, v6, #2 -+T mov v7, v7, lsr #2 -+ subs a2, a2, #1 -+A orr v7, v5, v7, lsr #2 -+T orr v7, v5, v7 -+ bhi 11b -+ -+ ldrd v3, v4, [sp, #16*4] -+ ldr a2, [sp] -+ add ip, sp, #16*4 -+ subs a1, a1, #1 -+ add a3, a3, v3 -+ add a4, a4, v4 -+ bhi 1b -+ mov a1, v7, lsr v6 -+ pop {a2-a4,v1-v8,pc} -+ -+20: teq v8, #3 -+ bne 10b -+ -+ teq v1, v3 -+ it eq -+ teqeq v2, v4 -+ bne 40f -+ teq v1, v2 -+ bne 30f -+ -+ ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, =0xFFFCFFFC -+ ssub16 ip, v3, v1 -+ ssub16 v5, v1, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 25f -+ ssub16 ip, v4, v2 -+ ssub16 v5, v2, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ beq 11b -+ @ drop through -+25: ssub16 ip, v4, v1 -+ ssub16 v5, v1, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 10b -+ ssub16 ip, v3, v2 -+ ssub16 v5, v2, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ b 10b -+ -+30: ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, =0xFFFCFFFC -+ ssub16 ip, v3, v1 -+ ssub16 v5, v1, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 10b -+ ssub16 ip, v4, v2 -+ ssub16 v5, v2, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ b 10b -+ -+40: teq v1, v4 -+ ite eq -+ teqeq v2, v3 -+ bne 10b -+ -+ ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, =0xFFFCFFFC -+ b 25b -+ -+90: -+ mov v5, #1<<30 -+ b 11b -+endfunc -+ -+ -+#endif -+ -+ -+@ ============================================================================= -+@ -+@ 10 bit -+ -+function hevc_loop_filter_luma_body_10 -+ m_filter_luma 10, q11, q15 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 -+ hevc_loop_filter_luma_start -+ b .Lh_loop_luma_common_10 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r10, [sp, #32] -+.Lh_loop_luma_common_10: -+ m_filter_h_luma_16 10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1 -+ hevc_loop_filter_luma_start -+ sub r4, r0, #8 -+ b .Lv_loop_luma_common_10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r4, [sp, #36] -+ ldr r10, [sp, #32] -+ -+.Lv_loop_luma_common_10: -+ m_filter_v_luma_16 10 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 -+ m_filter_h_uv_16 10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1 -+ m_filter_v_uv2_16 10 
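-+ @ All of the 10-bit entry points in this block are plain expansions
-+ @ of the shared 16-bit-sample macros with bit_depth set to 10.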
-+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S -new file mode 100644 -index 0000000000..db10da16d3 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S -@@ -0,0 +1,183 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+/* uses registers q8 - q13 for temp values */ -+.macro tr4_luma_shift shift -+ vaddl.s16 q8, d28, d30 // c0 = src0 + src2 -+ vaddl.s16 q9, d30, d31 // c1 = src2 + src3 -+ vsubl.s16 q10, d28, d31 // c2 = src0 - src3 -+ vaddl.s16 q11, d28, d31 // src0 + src3 -+ -+ vmul.i32 q12, q8, d1[0] // 29 * c0 -+ vmul.i32 q13, q10, d2[0] // 55 * c2 -+ vmul.i32 q8, q8, d2[0] // 55 * c0 -+ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1 -+ -+ vsubw.s16 q11, q11, d30 // src0 - src2 + src3 -+ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1 -+ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1 -+ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2 -+ -+ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3) -+ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3 -+ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3 -+ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3 -+ -+ vqrshrn.s32 d28, q12, \shift -+ vqrshrn.s32 d29, q13, \shift -+ vqrshrn.s32 d30, q11, \shift -+ vqrshrn.s32 d31, q8, \shift -+.endm -+ -+/* uses registers q8 - q11 for temp values */ -+.macro tr4_shift shift -+ vmull.s16 q9, d29, d0[0] // 83 * src1 -+ vmull.s16 q8, d29, d0[1] // 36 * src1 -+ vshll.s16 q14, d28, #6 // 64 * src0 -+ vshll.s16 q10, d30, #6 // 64 * src2 -+ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0 -+ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1 -+ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0 -+ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1 -+ vadd.s32 q14, q11, q9 // e0 + o0 -+ vadd.s32 q15, q10, q8 // e1 + o1 -+ vsub.s32 q8, q10, q8 // e1 - o1 -+ vsub.s32 q9, q11, q9 // e0 - o0 -+ -+ vqrshrn.s32 d28, q14, \shift -+ vqrshrn.s32 d29, q15, \shift -+ vqrshrn.s32 d30, q8, \shift -+ vqrshrn.s32 d31, q9, \shift -+.endm -+ -+.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \ -+ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \ -+ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \ -+ shift, I1, I2, I3 -+ -+ vmull.s16 q4, \d1, d1[1] // 89 * src1 -+ \I1 -+ vmull.s16 q5, \d1, d1[0] // 75 * src1 -+ \I2 -+ vmull.s16 q6, \d1, d1[3] // 50 * src1 -+ \I3 -+ vmull.s16 q7, \d1, d1[2] // 18 * src1 -+ vmlal.s16 q4, \d3, d1[0] // 75 * src3 -+ vmlsl.s16 q5, \d3, d1[2] //-18 * src3 -+ vmlsl.s16 q6, \d3, d1[1] //-89 * src3 -+ vmlsl.s16 q7, \d3, d1[3] //-50 * src3 -+ -+ // tr4 -+ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2) -+ vmull.s16 q2, \d2, d0[1] // 36 * src(1*2) -+ -+ vmlal.s16 q4, \d5, d1[3] // 50 
* src5 -+ vmlsl.s16 q5, \d5, d1[1] //-89 * src5 -+ vmlal.s16 q6, \d5, d1[2] // 18 * src5 -+ vmlal.s16 q7, \d5, d1[0] // 75 * src5 -+ -+ vshll.s16 q3, \d0, #6 // 64 * src(0*2) -+ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2) -+ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0 -+ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1 -+ vadd.i32 \tmp1, q3, \tmp0 // 64 * (src(0*2) + src(2*2)) e0 -+ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1 -+ -+ vmlal.s16 q4, \d7, d1[2] // 18 * src7 -+ vmlsl.s16 q5, \d7, d1[3] //-50 * src7 -+ vmlal.s16 q6, \d7, d1[0] // 75 * src7 -+ vmlsl.s16 q7, \d7, d1[1] //-89 * src7 -+ -+ vsub.i32 q3, \tmp1, q1 // e0 - o0 -+ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0 -+ vadd.i32 q1, \tmp0, q2 // e1 + o1 -+ vsub.i32 q2, \tmp0, q2 // e1 - o1 -+ -+ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0] -+ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7] -+ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4] -+ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3] -+ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1] -+ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6] -+ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5] -+ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2] -+ vqrshrn.s32 \d0, \tmp0, #\shift -+ vqrshrn.s32 \d4, \tmp1, #\shift -+ vqrshrn.s32 \d1, q3, #\shift -+ vqrshrn.s32 \d5, q1, #\shift -+ vqrshrn.s32 \d2, q6, #\shift -+ vqrshrn.s32 \d6, q5, #\shift -+ vqrshrn.s32 \d3, q7, #\shift -+ vqrshrn.s32 \d7, q4, #\shift -+.endm -+ -+.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3 -+ vld1.16 {\d0}, [r0 :64], r3 -+ vld1.16 {\d1}, [r2 :64], r3 -+ vld1.16 {\d2}, [r0 :64], r3 -+ vld1.16 {\d3}, [r2 :64], r3 -+ vld1.16 {\d4}, [r0 :64], r3 -+ vld1.16 {\d5}, [r2 :64], r3 -+ vld1.16 {\d6}, [r0 :64], r3 -+ vld1.16 {\d7}, [r2 :64], r3 -+ -+ tr8_process \ -+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ -+ \q01, \q23, 7, "\I1", "\I2", "\I3" -+.endm -+ -+.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift -+ tr8_process \ -+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ -+ \q01, \q23, \shift -+ -+ vzip.16 \d0, \d4 -+ vzip.16 \d1, \d5 -+ vzip.16 \d2, \d6 -+ vzip.16 \d3, \d7 -+ vst4.16 {\d0-\d3}, [r0 :128], r3 -+ vst4.16 {\d4-\d7}, [r2 :128], r3 -+.endm -+ -+#define BIT_DEPTH 8 -+#include "rpi_hevc_idct_fn_neon.S" -+ -+.text -+ -+.align 4 -+tr4f: -+.word 0x00240053 // 36 and d1[0] = 83 -+.word 0x00000000 -+tr8f: -+.word 0x0059004b // 89, d0[0] = 75 -+.word 0x00320012 // 50, d0[2] = 18 -+tr16: -+.word 0x005a0057 // 90, d2[0] = 87 -+.word 0x00500046 // 80, d2[2] = 70 -+.word 0x0039002b // 57, d2[0] = 43 -+.word 0x00190009 // 25, d2[2] = 9 -+ -+#undef BIT_DEPTH -+#define BIT_DEPTH 10 -+#include "rpi_hevc_idct_fn_neon.S" -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c -new file mode 100644 -index 0000000000..109fa98c29 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c -@@ -0,0 +1,32 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/arm/cpu.h" -+#include "libavcodec/rpi_hevcdsp.h" -+#include "rpi_hevcdsp_arm.h" -+ -+av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth) -+{ -+ int cpu_flags = av_get_cpu_flags(); -+ -+ if (have_neon(cpu_flags)) -+ ff_hevcdsp_rpi_init_neon(c, bit_depth); -+} -diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c -new file mode 100644 -index 0000000000..9294ab8010 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c -@@ -0,0 +1,467 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "config.h" -+#include "libavutil/attributes.h" -+#include "libavutil/arm/cpu.h" -+#include "libavcodec/rpi_hevcdsp.h" -+#include "rpi_hevcdsp_arm.h" -+#include "libavcodec/avcodec.h" -+#include "libavcodec/bit_depth_template.c" -+ -+// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but -+// have been removed from head as we never use them. 
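-+// Naming convention for the prototypes below: the _8/_10 suffix picks
-+// the sample bit depth, and the luma2/uv2 variants take an extra
-+// left-block pointer (_pix_l / src_l) plus a packed no_f mask so a
-+// single call can deblock across a split block boundary.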
-+ -+void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+ -+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+ -+void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, -+ uint8_t * _pix_l); -+void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, -+ unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f); -+ -+void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, -+ uint8_t * _pix_l); -+void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, -+ unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f); -+ -+void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs); -+ -+void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs); -+ -+void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+ -+void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void 
ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+ -+void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+ -+void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void 
ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+ -+void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t 
stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+ -+void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ -+void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ -+ -+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh, -+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc0, int in_inc1); -+void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); -+ -+ -+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); -+ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); -+} -+static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); -+ ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); -+} -+ -+static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); -+ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+} -+static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, 
height); -+ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+} -+ -+#if SAO_FILTER_N == 6 -+static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); -+ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); -+} -+static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); -+ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); -+} -+static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); -+ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); -+} -+static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); -+ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); -+ ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); -+} -+static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int 
height) -+{ -+ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); -+ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); -+} -+#endif -+ -+ -+ -+#if RPI_HEVC_SAO_BUF_STRIDE != 160 -+#error SAO edge src stride not 160 - value used in .S -+#endif -+ -+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) -+{ -+ if (bit_depth == 8) { -+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8; -+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; -+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; -+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8; -+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8; -+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8; -+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8; -+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8; -+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8; -+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8; -+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8; -+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_8; -+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8; -+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8; -+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8; -+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8; -+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8; -+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8; -+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8; -+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8; -+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8; -+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8; -+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8; -+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8; -+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8; -+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8; -+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8; -+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8; -+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8; -+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8; -+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8; -+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; -+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8; -+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8; -+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8; -+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8; -+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8; -+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8; -+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8; -+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8; -+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8; -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8; -+ c->sao_edge_filter[5] = 
ff_hevc_rpi_sao_edge_24_neon_8; -+#endif -+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8; -+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8; -+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8; -+ -+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8; -+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8; -+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8; -+ -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8; -+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8; -+#endif -+ } -+ else if (bit_depth == 10) { -+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10; -+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10; -+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10; -+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10; -+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10; -+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10; -+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10; -+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10; -+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10; -+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10; -+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10; -+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10; -+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10; -+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10; -+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10; -+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10; -+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10; -+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10; -+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10; -+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10; -+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10; -+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10; -+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10; -+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10; -+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10; -+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10; -+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10; -+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10; -+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10; -+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10; -+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10; -+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10; -+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10; -+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10; -+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10; -+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10; -+ -+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10; -+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10; -+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10; -+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10; -+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10; -+#if SAO_FILTER_N == 6 -+ 
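-+ /* the 24-wide variants in slot 5 are composed from the 16- and 8-wide kernels above */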
c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10; -+ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10; -+#endif -+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10; -+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10; -+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10; -+ -+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10; -+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_10; -+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10; -+ -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10; -+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10; -+#endif -+ } -+ -+ assert(offsetof(HEVCRpiMvField, mv) == 0); -+ assert(offsetof(HEVCRpiMvField, ref_idx) == 8); -+ assert(offsetof(HEVCRpiMvField, pred_flag) == 10); -+ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; -+ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; -+} -diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S -new file mode 100644 -index 0000000000..f831e55a6d ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S -@@ -0,0 +1,591 @@ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ .arch_extension mp @ enable PLDW -+ -+#define BIT_DEPTH 10 -+ -+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmax.s16 \Q2, \Q_MIN -+ vmax.s16 \Q3, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+ vmin.s16 \Q2, \Q_MAX -+ vmin.s16 \Q3, \Q_MAX -+.endm -+ -+@ add_residual4x4( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1] -+ lsl r2, #1 -+ vld1.16 {d0}, [r0 :64], r2 -+ vld1.16 {d1}, [ip :64], r2 -+ vld1.16 {d2}, [r0 :64] -+ vld1.16 {d3}, [ip :64] -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q11 -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {d0}, [r0 :64], r2 -+ vst1.16 {d1}, [ip :64], r2 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [ip :64] -+ bx lr -+ -+endfunc -+ -+@ add_residual4x4_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r1 -+ vdup.16 q15, r2 -+ lsl r1, #1 -+ vld1.16 {d0}, [r0 :64], r1 -+ vld1.16 {d1}, [ip :64], r1 -+ vld1.16 {d2}, [r0 :64] -+ vld1.16 {d3}, [ip :64] -+ sub r0, r1 -+ vqadd.s16 q0, q15 -+ sub ip, r1 -+ vqadd.s16 q1, q15 -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {d0}, [r0 :64], r1 -+ vst1.16 {d1}, [ip :64], r1 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [ip :64] -+ bx lr -+ -+endfunc -+ -+ -+@ add_residual8x8( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1 -+ mov r3, #8 -+ vmov.i64 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vldm r1!, {q10-q13} -+ vld1.16 {q0}, [r0 :128], r2 -+ vld1.16 {q1}, [ip :128], r2 -+ vld1.16 {q2}, [r0 :128] -+ vld1.16 {q3}, [ip :128] -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q11 -+ subs r3, #4 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ 
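-+ @ write back the four clipped rows, two via r0 and two via ip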
vst1.16 {q0}, [r0 :128], r2 -+ vst1.16 {q1}, [ip :128], r2 -+ vst1.16 {q2}, [r0 :128], r2 -+ vst1.16 {q3}, [ip :128], r2 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ add_residual4x4_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #4 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual8x8_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r2 -+ mov r3, #8 -+9: -+ vmov.i16 q8, #0 -+ add ip, r0, r1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r1, #1 -+1: -+ vld1.16 {q0}, [r0 :128], r1 -+ vld1.16 {q1}, [ip :128], r1 -+ vld1.16 {q2}, [r0 :128] -+ vld1.16 {q3}, [ip :128] -+ sub r0, r1 -+ vqadd.s16 q0, q15 -+ sub ip, r1 -+ vqadd.s16 q1, q15 -+ subs r3, #4 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0}, [r0 :128], r1 -+ vst1.16 {q1}, [ip :128], r1 -+ vst1.16 {q2}, [r0 :128], r1 -+ vst1.16 {q3}, [ip :128], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ add_residual16x16( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r2 -+ vmov.i16 q8, #0 -+ lsl r2, #1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ mov r3, #16 -+1: -+ vldm r1!, {q10-q13} -+ @ For RPI Sand we could guarantee :256 but not for general -+ @ non-RPI allocation. :128 is as good as we can claim -+ vld1.16 {q0, q1}, [r0 :128] -+ subs r3, #2 -+ vld1.16 {q2, q3}, [ip :128] -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q11 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0, q1}, [r0 :128], r2 -+ vst1.16 {q2, q3}, [ip :128], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual8x8_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #8 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual16x16_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 -+ vdup.i16 q15, r2 -+ mov r3, #16 -+9: -+ vmov.i16 q8, #0 -+ add ip, r0, r1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r1, #1 -+1: -+ @ For RPI Sand we could guarantee :256 but not for general -+ @ non-RPI allocation. 
:128 is as good as we can claim -+ vld1.16 {q0, q1}, [r0 :128] -+ subs r3, #2 -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q15 -+ vld1.16 {q2, q3}, [ip :128] -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0, q1}, [r0 :128], r1 -+ vst1.16 {q2, q3}, [ip :128], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+ -+@ add_residual32x32( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ mov r3, #32 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vldm r1!, {q10-q13} -+ vldm r0, {q0-q3} -+ vqadd.s16 q0, q10 -+ pldw [lr] -+ vqadd.s16 q1, q11 -+ add lr, r2 -+ vqadd.s16 q2, q12 -+ subs r3, #1 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0-q1}, [r0], r2 -+ vst1.16 {q2-q3}, [ip], r2 -+ bne 1b -+ pop {pc} -+ -+endfunc -+ -+@ add_residual16x16_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #16 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual32x32_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r2 -+ mov r3, #32 -+9: -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vldm r0, {q0-q3} -+ vqadd.s16 q0, q15 -+ subs r3, #1 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0-q1}, [r0], r1 -+ vst1.16 {q2-q3}, [ip], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ ============================================================================ -+@ U add -+ -+@ add_residual4x4_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1 :256] -+ lsl r2, #1 -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip :128] -+ sub r0, r2 -+ vmov.i16 q8, #0 -+ sub ip, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ mov r3, #8 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ subs r3, #2 -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! 
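-+ @ q0/q2 are the U lanes (residual added); q1/q3 are the V lanes (DC bias only)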
-+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual16x16_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ vdup.16 q15, r3 -+ mov r3, #16 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vqadd.s16 q0, q10 -+ pldw [lr] -+ vqadd.s16 q1, q15 -+ add lr, r2 -+ vqadd.s16 q2, q11 -+ subs r3, #1 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ ============================================================================ -+@ V add -+ -+@ add_residual4x4_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1 :256] -+ lsl r2, #1 -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip :128] -+ sub r0, r2 -+ vmov.i16 q8, #0 -+ sub ip, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ mov r3, #8 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ subs r3, #2 -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual16x16_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ vdup.16 q15, r3 -+ mov r3, #16 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! 
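-+ @ q0/q2 are the U lanes (DC bias only); q1/q3 are the V lanes (residual added)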
-+ vqadd.s16 q0, q15 -+ pldw [lr] -+ vqadd.s16 q1, q10 -+ add lr, r2 -+ vqadd.s16 q2, q15 -+ subs r3, #1 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ ============================================================================ -+@ U & V add -+ -+@ add_residual4x4_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+ vldm r1, {q10-q13} -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip :128] -+ -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ add ip, r0, r2 -+ lsl r2, #1 -+ vmov.i16 q8, #0 -+ add r3, r1, #(8*8*2) @ Offset to V -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ mov lr, #8 -+1: -+ vld1.16 {q10, q11}, [r1 :256]! -+ subs lr, #2 -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q12, q13}, [r3 :256]! -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ add_residual16x16_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 -+ push {r4, lr} -+ vmov.i16 q8, #0 -+ add r3, r1, #(16*16*2) @ Offset to V -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+ add r4, r0, r2 -+ mov lr, #16 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vld1.16 {q12, q13}, [r3 :256]! -+ vqadd.s16 q0, q10 -+ pldw [r4] -+ vqadd.s16 q1, q12 -+ add r4, r2 -+ vqadd.s16 q2, q11 -+ subs lr, #1 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {r4,pc} -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S -new file mode 100644 -index 0000000000..ea3b3faf6f ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S -@@ -0,0 +1,712 @@ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ .arch_extension mp @ enable PLDW -+ -+@ General notes: -+@ -+@ Residual is generally only guaranteed to be clipped to 16 bits. -+@ This means that we do need to do vmovl, vqadd, vqmovun -+@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away -+@ with this). -+@ -+@ There is an exception for the DC case because its transform is guaranteed -+@ to be small enough that overflow cannot occur during the first add. 
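-+@
-+@ As a scalar sketch of the required pattern (illustrative only, not part
-+@ of the original code):
-+@
-+@   int v = (int)pel + res;                @ vmovl.u8 then vqadd.s16
-+@   pel = v < 0 ? 0 : v > 255 ? 255 : v;   @ vqmovun.s16
-+@
-+@ whereas a plain vaddw.u8 could wrap when pel + res overflows 16 bits.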
-+ -+@ ============================================================================ -+@ Y add -+ -+function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1] -+ lsl r2, #1 -+ vld1.32 d4[0], [r0], r2 -+ rsb r3, r2, #0 -+ vld1.32 d4[1], [ip], r2 -+ vld1.32 d5[0], [r0], r3 -+ vld1.32 d5[1], [ip], r3 -+ vmovl.u8 q8, d4 -+ vmovl.u8 q9, d5 -+ vqadd.s16 q0, q8 -+ vqadd.s16 q1, q9 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q1 -+ vst1.32 d0[0], [r0], r2 -+ vst1.32 d0[1], [ip], r2 -+ vst1.32 d1[0], [r0] -+ vst1.32 d1[1], [ip] -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 -+ push {r4, lr} -+ vld1.16 {q0, q1}, [r1]! -+ add ip, r0, r2 -+ vld1.8 {d6}, [r0] -+ add r4, r0, r2, lsl #1 -+ vld1.8 {d7}, [ip] -+ add lr, ip, r2, lsl #1 -+ lsl r2, #1 -+ mov r3, #8-2 -+ vmovl.u8 q2, d6 -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+1: -+ vld1.16 {q0, q1}, [r1]! -+ subs r3, #2 -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vld1.8 {d6}, [r4], r2 -+ vld1.8 {d7}, [lr], r2 -+ vst1.8 {d4}, [r0], r2 -+ vst1.8 {d5}, [ip], r2 -+ vmovl.u8 q2, d6 -+ pldw [r4] -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+ bne 1b -+ -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vst1.8 {d4}, [r0] -+ vst1.8 {d5}, [ip] -+ pop {r4, pc} -+endfunc -+ -+function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 -+ vld1.16 {q0, q1}, [r1]! -+ add ip, r0, r2 -+ vld1.8 {q3}, [r0] -+ mov r3, #16-1 -+ vmovl.u8 q2, d6 -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+1: -+ vld1.16 {q0, q1}, [r1]! -+ subs r3, #1 -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vld1.8 {q3}, [ip], r2 -+ vst1.8 {q2}, [r0], r2 -+ vmovl.u8 q2, d6 -+ pldw [ip] -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+ bne 1b -+ -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vst1.8 {q2}, [r0] -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 -+ vldm r1!, {q0-q3} -+ vld1.8 {q8, q9}, [r0] -+ add ip, r0, r2 -+ vmovl.u8 q10, d16 -+ mov r3, #32-1 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q11, q1 -+ vqadd.s16 q12, q2 -+ vqadd.s16 q13, q3 -+1: -+ vldm r1!, {q0-q3} -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q11 -+ vqmovun.s16 d22, q12 -+ vqmovun.s16 d23, q13 -+ vld1.8 {q8, q9}, [ip], r2 -+ subs r3, #1 -+ vst1.8 {q10, q11}, [r0], r2 -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q11, q1 -+ vqadd.s16 q12, q2 -+ vqadd.s16 q13, q3 -+ bne 1b -+ -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q11 -+ vqmovun.s16 d22, q12 -+ vqmovun.s16 d23, q13 -+ vst1.8 {q10, q11}, [r0] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_4x4_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 -+ add ip, r0, r1 -+ vdup.16 q15, r2 -+ lsl r1, #1 -+ vld1.32 d4[0], [r0], r1 -+ rsb r3, r1, #0 -+ vld1.32 d4[1], [ip], r1 -+ vld1.32 d5[0], [r0], r3 -+ vld1.32 d5[1], [ip], r3 -+ vaddw.u8 q0, q15, d4 -+ vaddw.u8 q1, q15, d5 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q1 -+ vst1.32 d0[0], [r0], r1 -+ vst1.32 d0[1], [ip], r1 -+ vst1.32 d1[0], [r0] -+ vst1.32 d1[1], [ip] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ DC Y or C add -+ -+@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function 
ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 -+ mov r3, #4-2 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_8x8_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #8-2 -+1: vld1.8 d16, [r0] -+ add ip, r0, r1 -+ push {r4, lr} -+ vld1.8 d17, [ip] -+ add r4, r0, r1, lsl #1 -+ vaddw.u8 q0, q15, d16 -+ lsl r1, #1 -+ vaddw.u8 q1, q15, d17 -+ add lr, ip, r1 -+1: -+ vld1.8 {d16}, [r4], r1 -+ vld1.8 {d17}, [lr], r1 -+ subs r3, #2 -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vst1.8 {d4}, [r0], r1 -+ vst1.8 {d5}, [ip], r1 -+ bne 1b -+ -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vst1.8 {d4}, [r0] -+ vst1.8 {d5}, [ip] -+ pop {r4, pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 -+ mov r3, #8-1 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_16x16_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #16-1 -+1: vld1.8 {q8}, [r0] -+ add ip, r0, r1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+1: -+ vld1.8 {q8}, [ip], r1 -+ subs r3, #1 -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vst1.8 {q2}, [r0], r1 -+ bne 1b -+ -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vst1.8 {q2}, [r0] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 -+ mov r3, #16-1 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_32x32_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #32-1 -+1: vld1.8 {q8, q9}, [r0] -+ add ip, r0, r1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vaddw.u8 q2, q15, d18 -+ vaddw.u8 q3, q15, d19 -+1: -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d23, q3 -+ vld1.8 {q8, q9}, [ip], r1 -+ subs r3, #1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vaddw.u8 q2, q15, d18 -+ vaddw.u8 q3, q15, d19 -+ vst1.8 {q10, q11}, [r0], r1 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d23, q3 -+ vst1.8 {q10, q11}, [r0] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ U add -+ -+@ add_residual4x4_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1] -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.8 {d18}, [r0 :64] -+ sub r0, r2 -+ vld1.8 {d19}, [ip :64] -+ sub ip, r2 -+ vdup.16 q2, r3 -+ vdup.16 q3, r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ 
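-+ @ d0-d3 now hold four re-interleaved UV rows; store them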
vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ push {r4, lr} -+ vld2.8 {d16, d17}, [r0 :128] -+ lsl r2, #1 -+ vld2.8 {d18, d19}, [ip :128] -+ mov r3, #8-2 -+ vld1.16 {q0, q1}, [r1 :256]! -+ add r4, r0, r2 -+ vmovl.u8 q10, d16 -+ add lr, ip, r2 -+ vmovl.u8 q11, d18 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d17 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d19 -+1: -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q2 -+ vld2.8 {d16, d17}, [r4 :128], r2 -+ subs r3, #2 -+ vqmovun.s16 d22, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vld2.8 {d18, d19}, [lr :128], r2 -+ vst2.8 {d22, d23}, [ip :128], r2 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d18 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d17 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d19 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q2 -+ vqmovun.s16 d22, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20, d21}, [r0 :128] -+ vst2.8 {d22, d23}, [ip :128] -+ pop {r4, pc} -+endfunc -+ -+@ add_residual16x16_u( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld2.8 {q8, q9}, [r0 :256] -+ mov r3, #16-1 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q11, d16 -+ vmovl.u8 q12, d17 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d18 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d19 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs r3, #1 -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q11 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q12 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vst2.8 {q10, q11}, [r0 :256], r2 -+ vmovl.u8 q11, d16 -+ pldw [ip] -+ vmovl.u8 q12, d17 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d18 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d19 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q11 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q12 -+ vst2.8 {q10, q11}, [r0 :256] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ V add -+ -+@ add_residual4x4_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q2, q3}, [r1] -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.8 {d18}, [r0 :64] -+ sub r0, r2 -+ vld1.8 {d19}, [ip :64] -+ sub ip, r2 -+ vdup.16 q0, r3 -+ vdup.16 q1, r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ push {r4, lr} -+ vld2.8 {d16, d17}, [r0 :128] -+ lsl r2, #1 -+ vld2.8 {d18, d19}, [ip :128] -+ mov r3, #8-2 -+ vld1.16 {q0, q1}, [r1 :256]! 
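-+ @ software-pipelined: sums for the first two rows are formed here; the loop stores the previous pair while loading the next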
-+ add r4, r0, r2 -+ vmovl.u8 q10, d17 -+ add lr, ip, r2 -+ vmovl.u8 q11, d19 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d16 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d18 -+1: -+ vqmovun.s16 d20, q2 -+ vqmovun.s16 d21, q0 -+ vld2.8 {d16, d17}, [r4 :128], r2 -+ subs r3, #2 -+ vqmovun.s16 d22, q3 -+ vqmovun.s16 d23, q1 -+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vld2.8 {d18, d19}, [lr :128], r2 -+ vst2.8 {d22, d23}, [ip :128], r2 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q10, d17 -+ vmovl.u8 q11, d19 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d16 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d18 -+ bne 1b -+ -+ vqmovun.s16 d20, q2 -+ vqmovun.s16 d21, q0 -+ vqmovun.s16 d22, q3 -+ vqmovun.s16 d23, q1 -+ vst2.8 {d20, d21}, [r0 :128] -+ vst2.8 {d22, d23}, [ip :128] -+ pop {r4, pc} -+endfunc -+ -+@ add_residual16x16_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld2.8 {q8, q9}, [r0 :256] -+ mov r3, #16-1 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q11, d18 -+ vmovl.u8 q12, d19 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d16 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d17 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs r3, #1 -+ vqmovun.s16 d20, q11 -+ vqmovun.s16 d22, q0 -+ vqmovun.s16 d21, q12 -+ vqmovun.s16 d23, q1 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vst2.8 {q10, q11}, [r0 :256], r2 -+ vmovl.u8 q11, d18 -+ pldw [ip] -+ vmovl.u8 q12, d19 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d16 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d17 -+ bne 1b -+ -+ vqmovun.s16 d20, q11 -+ vqmovun.s16 d22, q0 -+ vqmovun.s16 d21, q12 -+ vqmovun.s16 d23, q1 -+ vst2.8 {q10, q11}, [r0 :256] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ U & V add -+ -+@ add_residual4x4_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1]! @ all of U -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ rsb r3, r2, #0 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.16 {q2, q3}, [r1] @ all of V -+ vld1.8 {d18}, [r0 :64], r3 -+ vld1.8 {d19}, [ip :64], r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 -+ vld2.8 {d16, d17}, [r0 :128] -+ add r3, r1, #(8*8*2) @ Offset to V -+ vld1.16 {q0}, [r1 :128]! -+ add ip, r0, r2 -+ vld1.16 {q1}, [r3 :128]! -+ vmovl.u8 q10, d16 -+ push {lr} -+ vmovl.u8 q8, d17 -+ mov lr, #8-1 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q1, q8 -+1: -+ vld2.8 {d16, d17}, [ip :128], r2 -+ subs lr, #1 -+ vld1.16 {q0}, [r1 :128]! -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q1 -+ vld1.16 {q1}, [r3 :128]! 
-+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q8, d17 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q1, q8 -+ bne 1b -+ -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q1 -+ vst2.8 {d20, d21}, [r0 :128] -+ pop {pc} -+endfunc -+ -+@ add_residual16x16_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 -+ vld2.8 {q8, q9}, [r0 :256] -+ add r3, r1, #(16*16*2) @ Offset to V -+ vld1.16 {q0, q1}, [r1 :256]! -+ add ip, r0, r2 -+ vld1.16 {q2, q3}, [r3 :256]! -+ vmovl.u8 q10, d16 -+ push {lr} -+ vmovl.u8 q8, d17 -+ mov lr, #16-1 -+ vmovl.u8 q11, d18 -+ vmovl.u8 q9, d19 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q8 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q9 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs lr, #1 -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q3 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vst2.8 {d20-d23}, [r0 :256], r2 -+ vld1.16 {q2, q3}, [r3 :256]! -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q8, d17 -+ vmovl.u8 q11, d18 -+ vmovl.u8 q9, d19 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q8 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q9 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20-d23}, [r0 :256] -+ pop {pc} -+endfunc -+ -+@ 32x32 chroma never occurs so NIF -+ -+@ ============================================================================ -diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S -new file mode 100644 -index 0000000000..b56e0f9644 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S -@@ -0,0 +1,2245 @@ -+/* -+ * Copyright (c) 2014 - 2015 Seppo Tomperi -+ * 2017 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+.set EDGE_SRC_STRIDE, 160 -+ -+@ PIC jump tables are fractionally more expensive than absolute in our code -+.set jent_pic, CONFIG_PIC -+ -+ -+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 -+ vshr.u8 q12, q8, #3 -+ \I1 -+ vadd.i8 q8, \Q_K128 -+ \I2 -+ vshr.u8 q13, q9, #3 -+ \I3 -+ vadd.i8 q9, \Q_K128 -+ \I4 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ -+ vqadd.s8 q8, q12 -+ vshr.u8 q12, q10, #3 -+ vadd.i8 q10, \Q_K128 -+ vqadd.s8 q9, q13 -+ vshr.u8 q13, q11, #3 -+ vadd.i8 q11, \Q_K128 -+ -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vqadd.s8 q10, q12 -+ vsub.i8 q8, \Q_K128 -+ vqadd.s8 q11, q13 -+ vsub.i8 q9, \Q_K128 -+ vsub.i8 q10, \Q_K128 -+ vsub.i8 q11, \Q_K128 -+.endm -+ -+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 -+ \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vadd.i8 q12, q8, \Q_K128 -+ vshr.u8 q8, #3 -+ vtbl.8 d16, \XLAT0, d16 -+ vtbl.8 d17, \XLAT1, d17 -+ vqadd.s8 q12, q8 -+ bmi 2f -+1: \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vsub.i8 q13, q12, \Q_K128 -+ vadd.i8 q12, q8, \Q_K128 -+ vshr.u8 q8, #3 -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+ vtbl.8 d16, \XLAT0, d16 -+ vtbl.8 d17, \XLAT1, d17 -+ vqadd.s8 q12, q8 -+ bpl 1b -+2: vsub.i8 q13, q12, \Q_K128 -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+.endm -+ -+ -+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmax.s16 \Q2, \Q_MIN -+ vmax.s16 \Q3, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+ vmin.s16 \Q2, \Q_MAX -+ vmin.s16 \Q3, \Q_MAX -+.endm -+ -+@ Clobbers q12, q13 -+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 -+ vshrn.i16 d24, \Q0, #(\bit_depth - 5) -+ vshrn.i16 d25, \Q1, #(\bit_depth - 5) -+ vshrn.i16 d26, \Q2, #(\bit_depth - 5) -+ \I1 -+ vtbl.8 d24, \XLAT0, d24 -+ vshrn.i16 d27, \Q3, #(\bit_depth - 5) -+ vtbl.8 d25, \XLAT1, d25 -+ \I2 -+ vtbl.8 d26, \XLAT0, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vaddw.s8 \Q0, d24 -+ vaddw.s8 \Q1, d25 -+ vaddw.s8 \Q2, d26 -+ vaddw.s8 \Q3, d27 -+ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX -+.endm -+ -+@ Clobbers q10, q11, q12 -+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 -+ \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vshrn.i16 d24, \Q0, #\bit_depth - 5 -+ vshrn.i16 d25, \Q1, #\bit_depth - 5 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ vaddw.s8 q10, \Q0, d24 -+ vaddw.s8 q11, \Q1, d25 -+ bmi 2f -+1: \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vmax.s16 q10, \Q_MIN -+ vmax.s16 q11, \Q_MIN -+ vshrn.i16 d24, \Q0, #\bit_depth - 5 -+ vshrn.i16 d25, \Q1, #\bit_depth - 5 -+ vmin.s16 q10, \Q_MAX -+ vmin.s16 q11, \Q_MAX -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ vaddw.s8 q10, \Q0, d24 -+ vaddw.s8 q11, \Q1, d25 -+ bpl 1b -+2: vmax.s16 q10, \Q_MIN -+ vmax.s16 q11, \Q_MIN -+ vmin.s16 q10, \Q_MAX -+ vmin.s16 q11, \Q_MAX -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+.endm -+ -+ -+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) -+@ so we are quite safe stuffing it into a byte array -+@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma -+@ (7.4.3.3.2 && 7-70) but we should still be 
safe to at least 12 bits of -+@ precision -+ -+@ This somewhat nasty bit of code builds the {d0-d3} translation -+@ array via the stack -+@ Given that sao_left_class > 28 can cause wrap we can't just poke -+@ all 4 bytes in at once -+@ -+@ It also loads other common regs -+ -+@ Beware that the offset read here overreads by 6 bytes so source must be sized appropriately -+function band_load_y -+ ldr ip, [sp, #16] @ &sao_offset_val[0] -+ ldr r4, [sp, #20] @ sao_left_class -+ vmov.i64 d4, #0 -+ vmov.i64 q0, #0 -+ pld [r1] -+ vld2.8 {q8}, [ip] -+ sub ip, sp, #8*5 -+ vmov.i64 q1, #0 -+ add r4, ip, r4 -+ vpush {d0-d4} @ Put zero array on stack -+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] -+ ldr ip, [ip, #8*5 + 28] @ height -+ vst1.32 {d16[0]}, [r4] -+ add r4, r1, r3 -+ vpop {d0-d4} @ Pop modified array -+ sub ip, ip, #1 -+ vorr d0, d0, d4 -+ bx lr -+endfunc -+ -+@ Beware that offset reads here overread by 6 bytes so source must be sized appropriately -+function band_load_c -+ ldr ip, [sp, #16] @ &sao_offset_val1[0] -+ ldr r4, [sp, #20] @ sao_left_class1 -+ vmov.i64 d24, #0 -+ vmov.i64 q10, #0 -+ pld [r1] -+ vld2.8 {q8}, [ip] -+ sub ip, sp, #8*5 -+ vmov.i64 q11, #0 -+ add r4, ip, r4 -+ ldr ip, [sp, #24] @ &sao_offset_val2[0] -+ vpush {d20-d24} @ Put zero array on stack -+ vld2.8 {q9}, [ip] -+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] -+ ldr ip, [sp, #8*5 + 28] @ sao_left_class2 -+ vst1.32 {d16[0]}, [r4] -+ add ip, sp, ip -+ vshr.u64 d18, d18, #8 @ 1st interesting val is [1] -+ vldmia sp, {d0-d3} @ Load modified array -+ vldr d16, [sp, #8*4] -+ add r4, r1, r3 -+ vstmia sp, {d20-d24} @ Put zero array on stack (again) -+ vst1.32 {d18[0]}, [ip] -+ vorr d0, d0, d16 -+ vldmia sp, {d4-d7} @ Load modified array -+ vldr d18, [sp, #8*4] -+ ldr ip, [sp, #8*5 + 36] @ height -+ add sp, sp, #8*5 -+ vorr d4, d4, d18 -+ sub ip, ip, #1 -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_64_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_64_neon_8, export=1 -+ push {r4-r6, lr} -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vldmia r1, {q8-q11} -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \ -+ "pld [r4]", \ -+ "subs ip, #1", \ -+ "it ne; addne r4, r3", \ -+ "add r1, r3" -+ vstmia r0, {q8-q11} -+ add r0, r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_32_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_32_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vld1.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #2 -+ vld1.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 -+ -+ vst1.8 { q8, q9 }, [r0, :128], r2 -+ vst1.8 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_16_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_16_neon_8, export=1 -+ push {r4-r6, lr}
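-+ @ four rows per pass: r5/r6 walk the odd rows and both strides are doubled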
-+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vld1.8 { q8}, [r1, :128], r3 -+ subs ip, #4 -+ vld1.8 { q9}, [r6, :128], r3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vld1.8 {q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 -+ -+ vst1.8 { q8}, [r0, :128], r2 -+ vst1.8 { q9}, [r5, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_8_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_8_neon_8, export=1 -+ ldr ip, [sp, #8] @ width -+ push {r4-r6, lr} -+ vmov.u8 q15, #128 -+ cmp ip, #8 -+ bl band_load_y -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ blt 4f -+ -+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ -+ "vld1.8 {d16}, [r1, :64], r3", \ -+ "subs ip, #2", \ -+ "vld1.8 {d17}, [r6, :64], r3", \ -+ "", \ -+ "", \ -+ "vst1.8 {d26}, [r0, :64], r2", \ -+ "vst1.8 {d27}, [r5, :64], r2" -+ pop {r4-r6, pc} -+4: -+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ -+ "vld1.32 {d16[0]}, [r1, :32], r3", \ -+ "subs ip, #4", \ -+ "vld1.32 {d16[1]}, [r6, :32], r3", \ -+ "vld1.32 {d17[0]}, [r1, :32], r3", \ -+ "vld1.32 {d17[1]}, [r6, :32], r3", \ -+ "vst1.32 {d26[0]}, [r0, :32], r2", \ -+ "vst1.32 {d26[1]}, [r5, :32], r2", \ -+ "vst1.32 {d27[0]}, [r0, :32], r2", \ -+ "vst1.32 {d27[1]}, [r5, :32], r2" -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_32_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ vmov.u8 q15, #128 -+ bl band_load_c -+ -+1: vld2.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #1 -+ vld2.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \ -+ "pld [r4]", \ -+ "it ne; addne r4, r3" -+ -+ vst2.8 { q8, q9 }, [r0, :128], r2 -+ vst2.8 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_16_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_c -+ -+1: vld2.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #2 -+ vld2.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15 -+ -+ vst2.8 { q8, q9 }, [r0, :128], r2 -+ vst2.8 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_8_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 -+ ldr ip, [sp, #16] @ width -+ push {r4-r6, lr} -+ 
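-+ @ width is 8 or 4 here; the cmp/blt below selects the 4-wide path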
vmov.u8 q15, #128 -+ cmp ip, #8 -+ bl band_load_c -+ blt 4f -+ -+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ -+ "vld2.8 {d16-d17}, [r1, :128], r3", \ -+ "subs ip, #1", \ -+ "", \ -+ "", \ -+ "", \ -+ "vst2.8 {d26-d27}, [r0, :128], r2" -+ pop {r4-r6, pc} -+4: -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ -+ "vld1.8 {d16}, [r1, :64], r3", \ -+ "subs ip, #2", \ -+ "vld1.8 {d17}, [r6, :64], r3", \ -+ "vuzp.8 d16, d17", \ -+ "", \ -+ "vzip.8 d26, d27", \ -+ "vst1.8 {d26}, [r0, :64], r2", \ -+ "vst1.8 {d27}, [r5, :64], r2" -+ pop {r4-r6, pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_64_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_64_16 bit_depth -+ push {r4-r6, lr} -+ vmov.i64 q2, #0 -+ vmov.i16 q3, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ vpush {q4-q7} -+ -+1: vldm r1, {q4-q11} -+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ -+ "subs ip, #1", \ -+ "add r1, r3" -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth -+ vstm r0, {q4-q11} -+ add r0, r2 -+ bpl 1b -+ -+ vpop {q4-q7} -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_64_neon_10, export=1 -+ band_64_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_32_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_32_16 bit_depth -+ push {r4-r6, lr} -+ vmov.i64 q2, #0 -+ vmov.i16 q3, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ -+1: vldm r1, {q8-q11} -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ -+ "subs ip, #1", \ -+ "add r1, r3" -+ vstm r0, {q8-q11} -+ add r0, r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_32_neon_10, export=1 -+ band_32_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_16_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_16_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ -+1: vld1.16 { q8, q9 }, [r1, :128], r3 -+ subs r12, #2 -+ vld1.16 {q10, q11}, [r6, :128], r3 -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth -+ vst1.16 { q8, q9 }, [r0, :128], r2 -+ vst1.16 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_16_neon_10, export=1 -+ band_16_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_8_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_8_16 bit_depth -+ ldr ip, [sp, #8] @ width -+ push {r4-r6, lr} -+ vmov.i64 q14, #0 -+ cmp ip, #8 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ blt 4f -+ -+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ -+ "vld1.16 
{q8}, [r1, :128], r3", \ -+ "subs ip, #2", \ -+ "vld1.16 {q9}, [r6, :128], r3", \ -+ "", \ -+ "", \ -+ "vst1.16 {q10}, [r0, :128], r2", \ -+ "vst1.16 {q11}, [r5, :128], r2" -+ pop {r4-r6, pc} -+4: -+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ -+ "vld1.16 {d16}, [r1, :64], r3", \ -+ "subs ip, #4", \ -+ "vld1.16 {d17}, [r6, :64], r3", \ -+ "vld1.16 {d18}, [r1, :64], r3", \ -+ "vld1.16 {d19}, [r6, :64], r3", \ -+ "vst1.16 {d20}, [r0, :64], r2", \ -+ "vst1.16 {d21}, [r5, :64], r2", \ -+ "vst1.16 {d22}, [r0, :64], r2", \ -+ "vst1.16 {d23}, [r5, :64], r2" -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_8_neon_10, export=1 -+ band_8_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_32_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_32_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ sub r2, #64 -+ sub r3, #64 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ mov lr, #64 -+ vpush {q4-q7} -+ -+1: vld2.16 { q4, q5 }, [r1, :128], lr -+ subs ip, #1 -+ vld2.16 { q6, q7 }, [r6, :128], lr -+ vld2.16 { q8, q9 }, [r1, :128], r3 -+ vld2.16 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "pld [r4]", \ -+ "it ne; addne r4, r3" -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth -+ -+ vst2.16 { q4, q5 }, [r0, :128], lr -+ vst2.16 { q6, q7 }, [r5, :128], lr -+ vst2.16 { q8, q9 }, [r0, :128], r2 -+ vst2.16 {q10, q11}, [r5, :128], r2 -+ -+ bpl 1b -+ -+ vpop {q4-q7} -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_32_neon_10, export=1 -+ band_c_32_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_16_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_16_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ -+1: vld2.16 { q8, q9 }, [r1, :128], r3 -+ subs ip, #1 -+ vld2.16 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth -+ -+ vst2.16 { q8, q9 }, [r0, :128], r2 -+ vst2.16 {q10, q11}, [r5, :128], r2 -+ -+ bpl 1b -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_16_neon_10, export=1 -+ band_c_16_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_8_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_8_16 bit_depth -+ ldr ip, [sp, #16] @ width -+ push {r4-r6, lr} -+ vmov.i64 q14, #0 -+ cmp ip, #8 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ blt 4f -+ -+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "vld2.16 {q8,q9}, [r1, :128], r3", \ -+ "subs ip, #1", \ -+ "", \ -+ "", \ -+ "", \ -+ "vst2.16 {q10,q11}, [r0, :128], r2" -+ pop {r4-r6,
pc} -+4: -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "vld2.16 {d16,d18}, [r1, :128], r3", \ -+ "subs ip, #2", \ -+ "vld2.16 {d17,d19}, [r6, :128], r3", \ -+ "", \ -+ "", \ -+ "vst2.16 {d20,d22}, [r0, :128], r2", \ -+ "vst2.16 {d21,d23}, [r5, :128], r2" -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 -+ band_c_8_16 10 -+endfunc -+ -+ -+@ ============================================================================= -+@ SAO EDGE -+ -+@ r0 destination address -+@ r2 stride to post-increment r0 with -+@ [r5] translate values -+@ -+@ a <- c <- b -+@ a in q0 - q3 -+@ c in q4 - q7 -+@ b in q8 - q11 -+@ -+@ q12-15 used as temp -+@ -+@ Can be used for both Y & C as we unzip/zip the deltas and -+@ transform "u/v" separately via d26/d27. For Y d26=d27 -+ -+function edge_64b_body_8 -+ -+ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q1 -+ vcgt.u8 q14, q6, q2 -+ vcgt.u8 q15, q7, q3 -+ -+ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 -+ vcgt.u8 q1, q5 -+ vcgt.u8 q2, q6 -+ vcgt.u8 q3, q7 -+ -+ vsub.s8 q0, q12 @ a = sign(c-a) -+ vsub.s8 q1, q13 -+ vsub.s8 q2, q14 -+ vsub.s8 q3, q15 -+ -+ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q9 -+ vcgt.u8 q14, q6, q10 -+ vcgt.u8 q15, q7, q11 -+ -+ vsub.s8 q0, q12 -+ vsub.s8 q1, q13 -+ vsub.s8 q2, q14 -+ vsub.s8 q3, q15 -+ -+ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 -+ vcgt.u8 q13, q9, q5 -+ vcgt.u8 q14, q10, q6 -+ vcgt.u8 q15, q11, q7 -+ -+ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) -+ vadd.s8 q1, q13 -+ vmov.u8 q12, #2 -+ vadd.s8 q2, q14 -+ vadd.s8 q3, q15 -+ -+ vadd.s8 q0, q12 -+ vadd.s8 q1, q12 -+ -+ vld1.8 {d26, d27}, [r5] -+ -+ vadd.s8 q2, q12 -+ vuzp.8 q0, q1 -+ vmov.u8 q15, #128 -+ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) -+ -+ vtbl.8 d0, {d26}, d0 -+ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add -+ -+ vtbl.8 d1, {d26}, d1 -+ vadd.s8 q14, q5, q15 -+ -+ vtbl.8 d2, {d27}, d2 -+ vuzp.8 q2, q3 -+ -+ vtbl.8 d3, {d27}, d3 -+ -+ vtbl.8 d4, {d26}, d4 -+ vzip.8 q0, q1 -+ -+ vtbl.8 d5, {d26}, d5 -+ vqadd.s8 q0, q12 -+ vqadd.s8 q1, q14 -+ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add -+ -+ vtbl.8 d6, {d27}, d6 -+ vtbl.8 d7, {d27}, d7 -+ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add -+ vzip.8 q2, q3 -+ -+ vsub.s8 q0, q15 -+ vqadd.s8 q2, q12 -+ vqadd.s8 q3, q14 -+ vsub.s8 q1, q15 -+ vsub.s8 q2, q15 -+ vsub.s8 q3, q15 -+ -+ bx lr -+endfunc -+ -+@ r0 destination address -+@ r2 stride to post-increment r0 with -+@ r4 upper clip value -+@ [r5] translate values -+@ -+@ a <- c <- b -+@ a in q0 - q3 -+@ c in q4 - q7 -+@ b in q8 - q11 -+@ -+@ q12-15 used as temp -+@ -+@ Can be used for both Y & C as we unzip/zip the deltas and -+@ transform "u/v" separately via d26/d27. 
For Y d26=d27 -+ -+function edge_64b_body_16 -+ -+ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 -+ vcgt.u16 q13, q5, q1 -+ vcgt.u16 q14, q6, q2 -+ vcgt.u16 q15, q7, q3 -+ -+ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 -+ vcgt.u16 q1, q1, q5 -+ vcgt.u16 q2, q2, q6 -+ vcgt.u16 q3, q3, q7 -+ -+ vsub.s16 q0, q0, q12 // a = sign(c-a) -+ vsub.s16 q1, q1, q13 -+ vsub.s16 q2, q2, q14 -+ vsub.s16 q3, q3, q15 -+ -+ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 -+ vcgt.u16 q13, q5, q9 -+ vcgt.u16 q14, q6, q10 -+ vcgt.u16 q15, q7, q11 -+ -+ vsub.s16 q0, q0, q12 -+ vsub.s16 q1, q1, q13 -+ vsub.s16 q2, q2, q14 -+ vsub.s16 q3, q3, q15 -+ -+ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 -+ vcgt.u16 q13, q9, q5 -+ vcgt.u16 q14, q10, q6 -+ vcgt.u16 q15, q11, q7 -+ -+ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) -+ vadd.s16 q1, q1, q13 -+ vadd.s16 q2, q2, q14 -+ vadd.s16 q3, q3, q15 -+ -+ vmov.u8 q12, #2 -+ -+ vmovn.s16 d0, q0 -+ vmovn.s16 d1, q1 -+ vmovn.s16 d2, q2 -+ vmovn.s16 d3, q3 -+ -+ vldr d26, [r5] -+ -+ vuzp.8 q0, q1 -+ -+ vldr d27, [r5, #8] -+ -+ vadd.s8 q0, q0, q12 -+ vadd.s8 q1, q1, q12 -+ -+ vmov.i64 q12, #0 -+ -+ vtbl.8 d0, {d26}, d0 -+ vtbl.8 d1, {d26}, d1 -+ vtbl.8 d2, {d27}, d2 -+ vtbl.8 d3, {d27}, d3 -+ -+ vdup.i16 q13, r4 -+ -+ vzip.8 q0, q1 -+ -+ @ Avoid overwrite whilst widening -+ vaddw.s8 q2, q6, d2 -+ vaddw.s8 q3, q7, d3 -+ vaddw.s8 q1, q5, d1 -+ vaddw.s8 q0, q4, d0 -+ -+ @ now clip -+ clip16_4 q2, q3, q1, q0, q12, q13 -+ -+ bx lr -+endfunc -+ -+ -+@ a <- c <- b -+@ a in q0 -+@ c in q1 -+@ b in q2 -+@ Temp q3, q9, q10 -+@ -+@ d16, d17 (q8) xlat U, V -+@ q14.u8 #2 -+@ q15.u8 #128 -+ -+function edge_16b_body_8 -+ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0 -+ vadd.u8 q9, q14, q9 -+ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0 -+ vsub.u8 q9, q9, q0 -+ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0 -+ vadd.u8 q9, q9, q0 -+ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0 -+ vsub.u8 q0, q9, q0 -+ -+ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add -+ -+ vuzp.8 d0, d1 -+ -+ vtbl.8 d0, {d16}, d0 -+ vtbl.8 d1, {d17}, d1 -+ -+ vzip.8 d0, d1 -+ vqadd.s8 q0, q3 -+ vsub.s8 q0, q15 -+ -+ bx lr -+endfunc -+ -+@ a <- c <- b -+@ a in q0 -+@ c in q1 -+@ b in q2 -+@ Temp q3 -+@ -+@ q12, #0 -+@ d16, d17 xlat U, V -+@ q14.u8 #2 -+@ q15.u16 max -+function edge_16b_body_16 -+ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0 -+ vadd.u16 q9, q14, q9 -+ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0 -+ vsub.u16 q9, q9, q0 -+ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0 -+ vadd.u16 q9, q9, q0 -+ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0 -+ vsub.u16 q0, q9, q0 -+ -+ vmovn.s16 d0, q0 -+ @ d1 will have random contents that we transform but -+ @ that doesn't matter as we then discard them -+ vuzp.8 d0, d1 -+ -+ vtbl.8 d0, {d16}, d0 -+ vtbl.8 d1, {d17}, d1 -+ -+ vzip.8 d0, d1 -+ -+ vaddw.s8 q0, q1, d0 -+ -+ @ now clip -+ vmax.s16 q0, q12 -+ vmin.s16 q0, q15 -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_edge_[c_]xx_neon( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only -+@ int eo, [sp, #sp_base + 0] -+@ int width, [sp, #sp_base + 4] -+@ int height) [sp, #sp_base + 8] -+ -+@ Jumps via jump_tab with -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ EDGE_SRC_STRIDE [r3] -+@ (1 << \bit_depth) - 1 [r4] -+@ * xlat_table [r5] // setup_64b only -+@ int height [r12] -+@ -+@ 0 [q12] // > 8 bit -+@ 
2 [q14] -+@ 128 [q15] // = 8 bit -+@ r4 [q15] // > 8 bit -+ -+.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0 -+ -+@ Build translate registers -+@ As translate values can only be 0-4 we don't care about junk in the rest -+@ of the register -+.if \is_chroma -+ ldr ip, [sp, #0] -+ push {r4-r6, lr} @ 16 bytes -+ vld1.8 {d16[2]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[2]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[0]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[0]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[1]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[1]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[3]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[3]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[4]}, [r3] -+ vld1.8 {d17[4]}, [ip] -+ movw r3, EDGE_SRC_STRIDE -+.set sp_base, 20 -+.else -+ add ip, r3, #4 -+ vld1.8 {d16[1]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[0]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[0]}, [r3] -+ add r3, r3, #6 -+ vld1.8 {d17[1]}, [ip] -+ vld1.8 {d16[2]}, [r3] -+ movw r3, EDGE_SRC_STRIDE -+ push {r4-r6, lr} @ 16 bytes -+ vzip.8 d16, d17 -+ vmov d17, d16 -+.set sp_base, 16 -+.endif -+ -+@ If setup_64b we need the xlat table on the stack -+.if \setup_64b -+ sub r5, sp, #16 -+.endif -+ -+@ Get jump address -+@ We have a special case for width 4 as the calling code doesn't detect it -+@ If we may have w4 then we add a 2nd jump table after the 1st -+.if \check_w4 -+ ldr r12, [sp, #sp_base + 4] @ width -+ adr r6, \jump_tab -+ ldr lr, [sp, #sp_base + 0] @ e0 -+ cmp r12, #8 -+ it lt -+ addlt r6, #16 -+.else -+ ldr lr, [sp, #sp_base + 0] @ e0 -+ adr r6, \jump_tab -+.endif -+ -+ ldr r12, [sp, #sp_base + 8] @ height -+ -+.if \bit_depth > 8 -+ movw r4, (1 << \bit_depth) - 1 -+.endif -+.if \setup_16b -+.if \bit_depth > 8 -+ vmov.i64 q12, #0 -+ vdup.16 q15, r4 -+ vmov.u16 q14, #2 -+.else -+ vmov.u8 q15, #128 -+ vmov.u8 q14, #2 -+.endif -+.endif -+ -+@ If setup_64b we need q4-q7 saved. 
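-+@ (q4-q7 alias d8-d15, which the AAPCS requires the callee to preserve.
-+@ q8 is stacked as well: after the 16-byte gp push above, the vpush puts
-+@ q8's 16-byte slot exactly at the address computed into r5, so the
-+@ stacked copy of q8 doubles as the xlat table that the 64b bodies
-+@ reload with "vldr d26, [r5]" / "vldr d27, [r5, #8]".)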
-+.if \setup_64b -+ vpush {q4-q8} @ 80 bytes, q8 pushed first -+.set sp_base, sp_base + 80 -+.endif -+ -+ ldr r6, [r6, lr, lsl #2] -+ -+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes -+.if \do2 -+ push {r0, r1, r6, r12} -+.if jent_pic -+ bl 98f -+.else -+ blx r6 -+.endif -+ pop {r0, r1, r6, r12} -+ -+ add r0, #64 -+ add r1, #64 -+.endif -+ -+.if jent_pic -+ bl 98f -+.else -+ blx r6 -+.endif -+ -+@ Tidy up & return -+.if \setup_64b -+ vpop {q4-q8} @ spurious but harmless load of q8 -+.endif -+ pop {r4-r6, pc} -+ -+.if jent_pic && !\xjump -+@ Magic label - used as 98b in jent macro -+98: -+ add pc, r6 -+.endif -+.endm -+ -+ -+.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab -+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 -+.endm -+ -+.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0 -+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump -+.endm -+ -+ -+.macro edge_64b_e0, body_fn, pb -+ sub r1, #8 -+ mov r6, lr -+1: vldm r1, {d7-d16} -+ // load a -+ vext.8 q0, q3, q4, #(16 - \pb) -+ add r1, r3 -+ vext.8 q1, q4, q5, #(16 - \pb) -+ subs r12, #1 -+ vext.8 q2, q5, q6, #(16 - \pb) -+ vext.8 q3, q6, q7, #(16 - \pb) -+ pld [r1] -+ // load b -+ vext.8 q11, q7, q8, #\pb @ Avoid overwrite -+ pld [r1, #64] -+ vext.8 q8, q4, q5, #\pb -+ vext.8 q9, q5, q6, #\pb -+ vext.8 q10, q6, q7, #\pb -+ bl \body_fn -+ vstm r0, {q0-q3} -+ add r0, r0, r2 -+ bgt 1b -+ bx r6 -+.endm -+ -+.macro edge_32bx2_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ sub r1, #8 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: vldmia r1, {d7-d12} -+ // load a -+ vext.8 q0, q3, q4, #16 - \pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q1, q4, q5, #16 - \pb -+ subs r12, #2 -+ // load b -+ vext.8 q8, q4, q5, #\pb -+ vext.8 q9, q5, q6, #\pb -+ vldr d25, [r6, #-8] -+ vldmia r6, {d12-d15} -+ vldr d26, [r6, #32] -+ // load a -+ vext.8 q2, q12, q6, #16 - \pb -+ add r6, r6, r3, lsl #1 -+ vext.8 q3, q6, q7, #16 - \pb -+ // load b -+ vext.8 q10, q6, q7, #\pb -+ vext.8 q11, q7, q13, #\pb -+ bl \body_fn -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ bgt 1b -+ pop {r7,pc} -+.endm -+ -+.macro edge_16b_e0, body_fn, pb -+ sub r1, #8 -+ mov r6, lr -+1: vldmia r1, {d1-d4} -+ add r1, r3 -+ subs r12, #1 -+ vext.8 q0, q0, q1, #16 - \pb -+ vext.8 q2, q1, q2, #\pb -+ -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ bgt 1b -+ bx r6 -+.endm -+ -+.macro edge_8bx2_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ sub r1, #8 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: vldmia r1, {d1-d2} -+ vldmia r6, {d3-d4} -+ vldr d6, [r1, #16] -+ subs r12, #2 -+ vldr d7, [r6, #-8] -+ add r1, r1, r3, lsl #1 -+ vext.8 d0, d1, d2, #8 - \pb -+ add r6, r6, r3, lsl #1 -+ vext.8 d5, d3, d4, #\pb -+ vext.8 d4, d2, d6, #\pb -+ vext.8 d1, d7, d3, #8 - \pb -+ -+ bl \body_fn -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ bgt 1b -+ pop {r7,pc} -+.endm -+ -+.macro edge_4bx4_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ -+ tst r1, #4 -+ bne 2f -+1: // r1 (and assumed r6) are 64-bit aligned -+ vldr d2, [r1] -+ vldr d0, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ vldr d20, [r6] -+ subs r12, #4 -+ vldr d18, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d3, [r1] -+ vshr.u64 d4, d2, #\pb * 8 -+ vldr d1, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ vldr d21, [r6] -+ vext.8 d0, d0, d2, #8 - \pb -+ vldr d19, [r6,#-8] -+ add r6, r6, r3, lsl #1 -+ vshr.u64 d22, d20, #\pb * 8 -+ vext.8 d18, d18, d20, #8 - \pb -+ vshr.u64 d5, d3, #\pb * 8 
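-+ @ (two 4-wide rows travel per d-register in this path: vshr/vext build
-+ @ the right/left neighbours of the r1 row pair, then the vsli below
-+ @ merges the r6 rows into the top halves so one body call covers 4 rows)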
-+ vext.8 d1, d1, d3, #8 - \pb -+ vshr.u64 d23, d21, #\pb * 8 -+ vext.8 d19, d19, d21, #8 - \pb -+ vsli.64 q1, q10, #32 -+ vsli.64 q2, q11, #32 -+ vsli.64 q0, q9, #32 -+ -+ bl \body_fn -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 1b -+ pop {r7,pc} -+ -+2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned -+ vldr d20, [r1, #-4] -+ vldr d22, [r1, #4] -+ add r1, r1, r3, lsl #1 -+ vldr d2, [r6, #-4] -+ subs r12, #4 -+ vldr d4, [r6, #4] -+ add r6, r6, r3, lsl #1 -+ vldr d21, [r1, #-4] -+ vshl.i64 d18, d20, #\pb * 8 -+ vldr d23, [r1, #4] -+ add r1, r1, r3, lsl #1 -+ vldr d3, [r6, #-4] -+ vext.8 d22, d20, d22, #\pb -+ vldr d5, [r6, #4] -+ add r6, r6, r3, lsl #1 -+ vshl.i64 d0, d2, #\pb * 8 -+ vext.8 d4, d2, d4, #\pb -+ vshl.i64 d19, d21, #\pb * 8 -+ vext.8 d23, d21, d23, #\pb -+ vshl.i64 d1, d3, #\pb * 8 -+ vext.8 d5, d3, d5, #\pb -+ vsri.64 q1, q10, #32 -+ vsri.64 q0, q9, #32 -+ vsri.64 q2, q11, #32 -+ -+ bl \body_fn -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 2b -+ pop {r7,pc} -+.endm -+ -+ -+.macro edge_64b_e1, body_fn -+ sub r1, r3 -+ push {lr} -+ add r6, r1, #32 -+ // load a -+ vld1.8 {q0-q1}, [r1, :256], r3 -+ vld1.8 {q2-q3}, [r6, :256], r3 -+ // load c -+ vld1.8 {q4-q5}, [r1, :256], r3 -+ vld1.8 {q6-q7}, [r6, :256], r3 -+1: // load b -+ vld1.8 {q8-q9}, [r1, :256], r3 -+ subs r12, #1 -+ vld1.8 {q10-q11}, [r6, :256], r3 -+ bl \body_fn -+ vstm r0, {q0-q3} -+ // copy c to a -+ vmov.64 q0, q4 -+ pld [r1, r3] -+ vmov.64 q1, q5 -+ it le -+ pople {lr} -+ vmov.64 q2, q6 -+ it le -+ bxle lr -+ vmov.64 q3, q7 -+ add r0, r0, r2 -+ // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ b 1b -+.endm -+ -+.macro edge_32bx2_e1, body_fn -+ sub r6, r1, r3 -+ vld1.8 {q2-q3}, [r1, :256], r3 -+ vld1.8 {q0-q1}, [r6, :256] -+ mov r6, lr -+ -+1: @ Given the data duplication here we could obviously do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ vld1.8 {q8-q9}, [r1, :256], r3 -+ subs r12, #2 -+ vmov q4, q2 -+ vmov q5, q3 -+ vld1.8 {q10-q11}, [r1, :256], r3 -+ vmov q6, q8 -+ vmov q7, q9 -+ -+ bl \body_fn -+ -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ // copy b to a -+ vmov q0, q8 -+ vmov q1, q9 -+ vst1.8 {q2-q3}, [r0, :256], r2 -+ vmov q2, q10 -+ it le -+ bxle r6 -+ vmov q3, q11 -+ b 1b -+.endm -+ -+.macro edge_16b_e1, body_fn -+ sub r6, r1, r3 -+ // load c -+ vld1.8 {q1}, [r1, :128], r3 -+ // load a -+ vld1.8 {q0}, [r6, :128] -+ mov r6, lr -+1: // load b -+ vld1.8 {q2}, [r1, :128], r3 -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ subs r12, #1 -+ // copy c to a -+ vmov.64 q0, q1 -+ it le -+ bxle r6 -+ // copy b to c -+ vmov.64 q1, q2 -+ b 1b -+.endm -+ -+.macro edge_8bx2_e1, body_fn -+ sub r6, r1, r3 -+ lsl r3, #1 -+ push {r7, lr} -+ vld1.8 {d1}, [r1, :64], r3 -+ vld1.8 {d0}, [r6, :64], r3 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: @ Given the data duplication here we could obviously do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ vld1.8 {d4}, [r6, :64], r3 -+ vmov d2, d1 -+ vld1.8 {d5}, [r1, :64], r3 -+ subs r12, #2 -+ vmov d3, d4 -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ -+ // copy b to a -+ vmov q0, q2 -+ bgt 1b -+ pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e1, body_fn -+ sub r6, r1, r3 -+ lsl r3, #1 -+ push {r7, lr} -+ vld1.32 {d0[1]}, [r1, :32], r3 -+ add r7, r0, 
r2 -+ vld1.32 {d0[0]}, [r6, :32], r3 -+ lsl r2, #1 -+ vld1.32 {d4[1]}, [r1, :32], r3 -+ vld1.32 {d4[0]}, [r6, :32], r3 -+ vld1.32 {d5[1]}, [r1, :32], r3 -+ vld1.32 {d5[0]}, [r6, :32], r3 -+ vmov d1, d4 -+ vext.32 d2, d0, d4, #1 -+ subs r12, #4 -+ vmov d22, d5 -+ vext.32 d3, d4, d5, #1 -+ b 2f -+ -+1: vst1.32 {d0[0]}, [r0, :32], r2 -+ vext.32 d2, d22, d4, #1 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vmov d0, d22 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vext.32 d3, d4, d5, #1 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ vmov d1, d4 -+ vmov d22, d5 -+2: @ Given the data duplication here we could probably do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ bl \body_fn -+ ble 3f -+ vld1.32 {d4[0]}, [r6, :32], r3 -+ subs r12, #4 -+ vld1.32 {d4[1]}, [r1, :32], r3 -+ vld1.32 {d5[0]}, [r6, :32], r3 -+ vld1.32 {d5[1]}, [r1, :32], r3 -+ b 1b -+ -+3: vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32] -+ vst1.32 {d1[1]}, [r7, :32] -+ pop {r7, pc} -+.endm -+ -+.macro edge_64b_e2, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ // load c and a -+ vld1.8 {q4-q5}, [r1, :128] -+ vldr d25, [r6, #-8] -+ vldmia r6, {d16-d23} -+ vext.8 q0, q12, q8, #16 - \pb -+ add r6, r1, #32 -+ vext.8 q1, q8, q9, #16 - \pb -+ add r1, r1, r3 -+ vext.8 q2, q9, q10, #16 - \pb -+ vld1.8 {q6-q7}, [r6, :128] -+ sub r6, r1, r3 -+ vext.8 q3, q10, q11, #16 - \pb -+ -+1: // load b -+ vldmia r1, {d16-d24} -+ vext.8 q8, q8, q9, #\pb -+ pld [r1, r3] -+ vext.8 q9, q9, q10, #\pb -+ subs r12, #1 -+ vext.8 q10, q10, q11, #\pb -+ vext.8 q11, q11, q12, #\pb -+ bl \body_fn -+ // next a is mostly available in c -+ vldr d25, [r6, #-8] -+ vstmia r0, {q0-q3} -+ vext.8 q3, q6, q7, #16 - \pb -+ it le -+ pople {lr} -+ vext.8 q2, q5, q6, #16 - \pb -+ it le -+ bxle lr -+ vext.8 q1, q4, q5, #16 - \pb -+ add r6, r6, r3 -+ vext.8 q0, q12, q4, #16 - \pb -+ add r0, r0, r2 -+ // next c is mostly available in b -+ vldr d8, [r1] -+ vext.8 d9, d16, d17, #8 - \pb -+ vext.8 q5, q8, q9, #16 - \pb -+ add r1, r1, r3 -+ vext.8 q6, q9, q10, #16 - \pb -+ pld [r6, #-8] -+ vext.8 q7, q10, q11, #16 - \pb -+ b 1b -+.endm -+ -+.macro edge_32bx2_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ // load a and first 32b of c -+ vld1.8 {q4-q5}, [r1, :256] -+ vldr d25, [r6, #-8] -+ vld1.8 {q13-q14}, [r6, :256] -+ vldr d31, [r1, #-8] -+ add r6, r6, r3, lsl #1 -+ vext.8 q0, q12, q13, #16 - \pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q1, q13, q14, #16 - \pb -+ vext.8 q2, q15, q4, #16 - \pb -+ vext.8 q3, q4, q5, #16 - \pb -+1: -+ // load second 32b of c and second 32b of b -+ vldmia r6, {d12-d16} -+ vldmia r1, {d20-d24} -+ // first 32b of b is mostly available in second 32b of c -+ vext.8 q9, q7, q8, #\pb -+ subs r12, #2 -+ vext.8 q8, q6, q7, #\pb -+ vext.8 q10, q10, q11, #\pb -+ vext.8 q11, q11, q12, #\pb -+ -+ bl \body_fn -+ -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ ble 2f -+ -+ vldr d25, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d8, [r1] -+ vext.8 d9, d20, d21, #8 - \pb -+ vldr d31, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ // first 32b of a is mostly available in second 32b of c -+ vext.8 q1, q6, q7, #16 - \pb -+ vext.8 q0, q12, q6, #16 - \pb -+ // first 32b of c is mostly available in second 32b of b -+ vext.8 q5, q10, q11, #16 - \pb -+ // second 32b of a is mostly available in first 32b of c -+ vext.8 q2, q15, q4, #16 - \pb -+ vext.8 q3, q4, q5, #16 - \pb -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_16b_e2, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 
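-+ @ (e2 is the 135-degree diagonal class: neighbour a sits up-left and b
-+ @ down-right, so r6 trails one row above r1 and the shifted a/b rows
-+ @ are built with vext much as in the e0 case)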
-+ vld1.8 {q1}, [r1, :128], r3 -+ vldr d19, [r6, #-8] -+ vld1.8 {q10}, [r6, :128], r3 -+ -+1: vldmia r1, {d4-d6} -+ vext.8 q0, q9, q10, #16 - \pb -+ subs r12, #1 -+ vext.8 q2, q2, q3, #\pb -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ ble 2f -+ vmov q10, q1 -+ vldr d2, [r1] -+ add r1, r1, r3 -+ vldr d19, [r6, #-8] -+ add r6, r6, r3 -+ vext.8 d3, d4, d5, #8 - \pb -+ b 1b -+ -+2: pop {pc} -+.endm -+ -+.macro edge_8bx2_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ vldr d18, [r6, #-8] -+ vldr d19, [r6] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #-8] -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldmia r6, {d3-d4} -+ vld1.8 {d21-d22}, [r1, :128] -+ -+1: vext.8 d0, d18, d19, #8 - \pb -+ vext.8 d4, d3, d4, #\pb -+ vext.8 d1, d20, d2, #8 - \pb -+ subs r12, #2 -+ vext.8 d5, d21, d22, #\pb -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ ble 2f -+ -+ vldr d18, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #-8] -+ vmov d19, d3 -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldmia r6, {d3-d4} -+ vld1.8 {d21-d22}, [r1, :128] -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7-r9, lr} -+ add r8, r1, r3 -+ sub r6, r6, #\pb -+ add r8, r8, #\pb -+ add r7, r0, r2 -+ lsl r2, #1 -+ -+1: vld1.32 {d0[0]}, [r6], r3 -+ subs r12, #4 -+ vld1.32 {d2[0]}, [r1], r3 -+ vld1.32 {d4[0]}, [r8], r3 -+ vld1.32 {d0[1]}, [r6], r3 -+ vld1.32 {d2[1]}, [r1], r3 -+ vld1.32 {d4[1]}, [r8], r3 -+ vld1.32 {d1[0]}, [r6], r3 -+ vld1.32 {d3[0]}, [r1], r3 -+ vld1.32 {d5[0]}, [r8], r3 -+ vld1.32 {d1[1]}, [r6], r3 -+ vld1.32 {d3[1]}, [r1], r3 -+ vld1.32 {d5[1]}, [r8], r3 -+ -+ bl \body_fn -+ -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 1b -+ -+ pop {r7-r9,pc} -+.endm -+ -+.macro edge_64b_e3, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ // load c and a -+ vld1.8 {q4-q5}, [r1, :128] -+ vldmia r6, {d16-d24} -+ vext.8 q0, q8, q9, #\pb -+ add r6, r1, #32 -+ vext.8 q1, q9, q10, #\pb -+ add r1, r1, r3 -+ vext.8 q2, q10, q11, #\pb -+ vld1.8 {q6-q7}, [r6, :128] -+ sub r6, r1, r3 -+ vext.8 q3, q11, q12, #\pb -+ -+1: // load b -+ vldr d17, [r1, #-8] -+ vldmia r1, {d18-d25} -+ vext.8 q8, q8, q9, #16 - \pb -+ pld [r1, r3] -+ vext.8 q9, q9, q10, #16 - \pb -+ subs r12, #1 -+ vext.8 q10, q10, q11, #16 - \pb -+ vext.8 q11, q11, q12, #16 - \pb -+ bl \body_fn -+ // next a is mostly available in c -+ vldr d24, [r6, #64] -+ vstmia r0, {q0-q3} -+ vext.8 q0, q4, q5, #\pb -+ it le -+ pople {lr} -+ vext.8 q1, q5, q6, #\pb -+ it le -+ bxle lr -+ vext.8 q2, q6, q7, #\pb -+ add r6, r6, r3 -+ vext.8 q3, q7, q12, #\pb -+ add r0, r0, r2 -+ // next c is mostly available in b -+ vext.8 d14, d22, d23, #\pb -+ vldr d15, [r1, #56] -+ vext.8 q4, q8, q9, #\pb -+ add r1, r1, r3 -+ vext.8 q5, q9, q10, #\pb -+ vext.8 q6, q10, q11, #\pb -+ b 1b -+.endm -+ -+.macro edge_32bx2_e3, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ // load a and first 32b of c -+ vldmia r1, {d8-d12} -+ vldmia r6, {d24-d28} -+ vext.8 q2, q4, q5, #\pb -+ add r6, r6, r3, lsl #1 -+ vext.8 q3, q5, q6, #\pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q0, q12, q13, #\pb -+ vext.8 q1, q13, q14, #\pb -+1: -+ // load second 32b of c and second 32b of b -+ vldr d25, [r6, #-8] -+ subs r12, #2 -+ vldmia r6, {d12-d15} -+ vldr d27, [r1, #-8] -+ vldmia r1, {d20-d23} -+ // first 32b of b is mostly available in second 32b of c -+ vext.8 q8, q12, q6, #16 - \pb -+ 
vext.8 q9, q6, q7, #16 - \pb -+ vext.8 q11, q10, q11, #16 - \pb -+ vext.8 q10, q13, q10, #16 - \pb -+ -+ bl \body_fn -+ -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ ble 2f -+ -+ vldr d24, [r6, #32] -+ add r6, r6, r3, lsl #1 -+ vldr d11, [r1, #24] -+ vext.8 d10, d22, d23, #\pb -+ vldr d30, [r1, #32] -+ add r1, r1, r3, lsl #1 -+ // first 32b of a is mostly available in second 32b of c -+ vext.8 q0, q6, q7, #\pb -+ vext.8 q1, q7, q12, #\pb -+ // first 32b of c is mostly available in second 32b of b -+ vext.8 q4, q10, q11, #\pb -+ // second 32b of a is mostly available in first 32b of c -+ vext.8 q3, q5, q15, #\pb -+ vext.8 q2, q4, q5, #\pb -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_16b_e3, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ vld1.8 {q1}, [r1, :128], r3 -+ vldmia r6, {d18-d20} -+ add r6, r6, r3 -+ -+1: vldr d5, [r1, #-8] -+ vld1.8 {q3}, [r1, :128] -+ subs r12, #1 -+ vext.8 q0, q9, q10, #\pb -+ vext.8 q2, q2, q3, #16 - \pb -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ ble 2f -+ vmov q9, q1 -+ vldr d3, [r1, #8] -+ add r1, r1, r3 -+ vldr d20, [r6, #16] -+ add r6, r6, r3 -+ vext.8 d2, d4, d5, #\pb -+ b 1b -+ -+2: pop {pc} -+.endm -+ -+.macro edge_8bx2_e3, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ vld1.8 {d18-d19}, [r6] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #8] -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldr d4, [r6, #-8] -+ vldr d3, [r6] -+ vldr d21, [r1, #-8] -+ vldr d22, [r1] -+ -+1: vext.8 d0, d18, d19, #\pb -+ vext.8 d4, d4, d3, #8 - \pb -+ vext.8 d1, d2, d20, #\pb -+ subs r12, #2 -+ vext.8 d5, d21, d22, #8 - \pb -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ ble 2f -+ -+ vldr d19, [r6, #8] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #8] -+ vmov d18, d3 -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldr d4, [r6, #-8] -+ vldr d3, [r6] -+ vldr d21, [r1, #-8] -+ vldr d22, [r1] -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e3, body_fn, pb -+ @ e3 is the same as e2 but with the X offset reversed -+ edge_4bx4_e2 \body_fn, (-\pb) -+.endm -+ -+@ Jump table entry - if in neon mode the bottom bit must be set -+@ ? 
There is probably a real asm instruction to do this but I haven't found it -+.macro jent lab -+.if jent_pic -+@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is -+@ simpler and clearer in the code to stick with .word -+T .word (0 + \lab) - (4 + 98b) -+A .word (0 + \lab) - (8 + 98b) -+.else -+T .word 1 + \lab -+A .word \lab -+.endif -+.endm -+ -+.macro edge_64b_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_64b_e0 \body_fn, \pb -+10: edge_64b_e1 \body_fn -+20: edge_64b_e2 \body_fn, \pb -+30: edge_64b_e3 \body_fn, \pb -+.endm -+ -+.macro edge_32bx2_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_32bx2_e0 \body_fn, \pb -+10: edge_32bx2_e1 \body_fn -+20: edge_32bx2_e2 \body_fn, \pb -+30: edge_32bx2_e3 \body_fn, \pb -+.endm -+ -+.macro edge_16b_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_16b_e0 \body_fn, \pb -+10: edge_16b_e1 \body_fn -+20: edge_16b_e2 \body_fn, \pb -+30: edge_16b_e3 \body_fn, \pb -+.endm -+ -+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_32bx2_e0 \body_fn_64b, \pb -+10: edge_32bx2_e1 \body_fn_64b -+20: edge_32bx2_e2 \body_fn_64b, \pb -+30: edge_32bx2_e3 \body_fn_64b, \pb -+5: edge_16b_e0 \body_fn_16b, \pb -+15: edge_16b_e1 \body_fn_16b -+25: edge_16b_e2 \body_fn_16b, \pb -+35: edge_16b_e3 \body_fn_16b, \pb -+.endm -+ -+.macro edge_16b_8bx2_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_16b_e0 \body_fn, \pb -+10: edge_16b_e1 \body_fn -+20: edge_16b_e2 \body_fn, \pb -+30: edge_16b_e3 \body_fn, \pb -+5: edge_8bx2_e0 \body_fn, \pb -+15: edge_8bx2_e1 \body_fn -+25: edge_8bx2_e2 \body_fn, \pb -+35: edge_8bx2_e3 \body_fn, \pb -+.endm -+ -+.macro edge_8bx2_4bx4_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_8bx2_e0 \body_fn, \pb -+10: edge_8bx2_e1 \body_fn -+20: edge_8bx2_e2 \body_fn, \pb -+30: edge_8bx2_e3 \body_fn, \pb -+5: edge_4bx4_e0 \body_fn, \pb -+15: edge_4bx4_e1 \body_fn -+25: edge_4bx4_e2 \body_fn, \pb -+35: edge_4bx4_e3 \body_fn, \pb -+.endm -+ -+@ void ff_hevc_rpi_sao_edge_8_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_8_neon_8, export=1 -+ edge_16b_init 8, 0, 1, 99f -+99: -+ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_16_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_16_neon_8, export=1 -+ edge_16b_init 8, 0, 0, 99f -+99: -+ edge_16b_bodies edge_16b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_32_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_32_neon_8, export=1 -+ edge_64b_init 8, 0, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_64_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] 
-+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_64_neon_8, export=1 -+ edge_64b_init 8, 0, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_8, 1 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_8_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1 -+ edge_16b_init 8, 1, 1, 99f -+99: -+ edge_16b_8bx2_bodies edge_16b_body_8, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_16_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1 -+ edge_64b_init 8, 1, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_8, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_32_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1 -+ edge_64b_init 8, 1, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_8, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_8_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_8_neon_10, export=1 -+ edge_16b_init 10, 0, 1, 99f -+99: -+ edge_16b_8bx2_bodies edge_16b_body_16, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_16_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_16_neon_10, export=1 -+ edge_64b_init 10, 0, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_16, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_64_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+@ We simply split the 32 case into 2 vertical stripes -+@ and call the fns for w32 -+@ -+@ Calling code will always have src != dst so we don't have to worry -+@ about edge effects -+ -+function ff_hevc_rpi_sao_edge_64_neon_10, export=1 -+ edge_64b_init 10, 0, 1, 99f, xjump=1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_32_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_32_neon_10, export=1 -+ edge_64b_init 10, 0, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_16, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_8_neon_10( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1 -+ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 -+99: -+ edge_32bx2_16b_bodies edge_64b_body_16, 
edge_16b_body_16, 4 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_32_neon_10( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1 -+ edge_64b_init 10, 1, 1, 99f, xjump=1 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_edge_c_16_neon_10( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1 -+ edge_64b_init 10, 1, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_16, 4 -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h -new file mode 100644 -index 0000000000..36a23a5bf9 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_arm.h -@@ -0,0 +1,28 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_HEVCPRED_ARM_H -+#define AVCODEC_ARM_HEVCPRED_ARM_H -+ -+#include "libavcodec/rpi_hevcpred.h" -+ -+void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); -+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); -+ -+#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c -new file mode 100644 -index 0000000000..80724d4cf3 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_init_arm.c -@@ -0,0 +1,35 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/cpu.h" -+#include "libavutil/arm/cpu.h" -+ -+#include "libavcodec/rpi_hevcpred.h" -+#include "rpi_hevcpred_arm.h" -+ -+av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) -+{ -+ int cpu_flags = av_get_cpu_flags(); -+ -+ if (have_neon(cpu_flags)) -+ ff_hevc_rpi_pred_init_neon(c, bit_depth); -+} -+ -diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c -new file mode 100644 -index 0000000000..21e7700174 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_init_neon.c -@@ -0,0 +1,210 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcpred_arm.h" -+ -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; -+ -+void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t 
*left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void 
ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+ -+void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, 
const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+ -+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) -+{ -+ switch (bit_depth) -+ { -+ case 8: -+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; -+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; -+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 -+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; -+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; -+ -+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; -+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; -+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; -+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; -+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; -+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; -+ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; -+ -+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; -+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; -+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; -+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; -+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; -+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; -+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; -+ -+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; -+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; -+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; -+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; -+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; -+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; -+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; -+ -+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; -+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; -+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; -+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; -+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; -+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8; -+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; -+ -+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; -+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; -+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; -+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; -+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; -+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; -+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; -+ break; -+ case 10: -+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16; -+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; -+ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16; -+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; -+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; -+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; -+ -+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; -+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; -+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; -+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; -+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; -+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; -+ c->pred_angular_c[2] = 
ff_hevc_rpi_pred_angular_c_16_neon_10; -+ -+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; -+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; -+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; -+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; -+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; -+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; -+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; -+ -+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; -+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; -+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; -+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; -+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; -+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; -+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; -+ -+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; -+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; -+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; -+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; -+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10; -+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; -+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; -+ -+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; -+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; -+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; -+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; -+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; -+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; -+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; -+ break; -+ default: -+ break; -+ } -+} -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -new file mode 100644 -index 0000000000..3dd9246a16 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -@@ -0,0 +1,2975 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * General angular pred -+ * -+ * Horizontal (10) & Vertical (26) cases have their own file -+ * and are not dealt with properly here (luma filtering is missing) -+ * -+ * The inv_angle calculations are annoying - if it wasn't for the +128 -+ * rounding step then the result would simply be the loop counter :-( -+ */ -+ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+.text -+ -+@ Horizontal Patch functions -+@ These need a transpose before store so exist as smaller patches -+@ Patches can be called repeatedly without any intermediate setup -+@ to generate a horizontal block -+@ -+@ It is almost certainly the case that larger patch fns can be built -+@ and they would be a little faster, but we would still need the small -+@ fns and code size (or at least instruction cache size) is an issue -+@ given how much code we already have here -+ -+@ Generate 8x8 luma 8 patch -+@ -+@ r3 Out stride -+@ r4 Angle add -+@ r7 Inv angle (_up only) -+@ -+@ In/Out (updated) -+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) -+@ r2 Left ptr - updated -+@ r10 Inv angle accumulator (_up only) -+@ r12 32 - angle frac (_down) or angle frac (_up) -+@ d0 Older reference samples -+@ d1=r8+r9 Newer reference samples -+@ d2 32 - angle frac -+@ d3 Angle frac -+@ q2 Partially computed next result (_up only) -+@ -+@ Temps -+@ r5 Loop counter -+@ r6 -+@ r7 (_down only) -+@ r11 (_up only) -+@ q2, q8-q11 -+ -+patch_h_down_8x8_8: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ lsr r8, #8 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #24 -+ ldr r9, [r2, #5]! -+ vmov d1, r8, r9 -+ // drop through... -+patch_h_down_8x8_8_continue: -+ mov r5, #8 -+1: -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.8 q8, q8, q9, #8 -+ itt mi -+ lsrmi r7, r8, #8 -+ vmovmi d0, r8, r9 -+ vdup.8 d2, r12 -+ vext.8 q9, q9, q10, #8 -+ it mi -+ orrmi r8, r7, r9, lsl #24 -+ vext.8 q10, q10, q11, #8 -+ it mi -+ ldrmi r9, [r2, #1]! -+ vmov d22, d23 -+ vrshrn.u16 d23, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_8x8_8: -+ vzip.8 d16, d17 -+ add r6, r0, r3 -+ vzip.8 d18, d19 -+ lsl r3, #1 -+ vzip.8 d20, d21 -+ add r5, r0, r3 -+ vzip.8 d22, d23 -+ vzip.16 q8, q9 -+ vzip.16 q10, q11 -+ vzip.32 q8, q10 -+ vzip.32 q9, q11 -+ vst1.8 {d16}, [r0]! -+ vst1.8 {d17}, [r6], r3 -+ vst1.8 {d20}, [r5], r3 -+ vst1.8 {d21}, [r6], r3 -+ vst1.8 {d18}, [r5], r3 -+ vst1.8 {d19}, [r6], r3 -+ vst1.8 {d22}, [r5] -+ asr r3, #1 -+ vst1.8 {d23}, [r6] -+ -+ bx lr -+ -+patch_h_up_8x8_8: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r4 -+ lsr r11, r8, #24 -+ vdup.8 d2, r6 -+ ldr r8, [r2, #-1]! 
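-+ @ (working up the left column: the sample window in r8:r9 steps back a
-+ @ byte at a time; in the continue loop the incoming byte is fetched
-+ @ from the top row via [r1, r11] once the inv-angle accumulator r10
-+ @ turns non-negative - the +128 rounding annoyance noted in the header
-+ @ comment above)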
-+ orr r9, r11, r9, lsl #8 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+patch_h_up_8x8_8_continue: -+ mov r5, #8 -+1: -+ add r12, r4 -+ mov r11, #0 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ vext.8 q8, q8, q9, #8 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #8 -+ it cs -+ vmovcs d0, r8, r9 -+ vdup.8 d2, r6 -+ it cs -+ lsrcs r6, r8, #24 -+ vext.8 q9, q9, q10, #8 -+ itt cs -+ orrcs r9, r6, r9, lsl #8 -+ ldrbcs r11, [r1, r11] -+ vdup.8 d3, r12 -+ vext.8 q10, q10, q11, #8 -+ it hi -+ ldrbhi r11, [r2, #-1]! -+ vmov d22, d23 -+ vrshrn.u16 d23, q2, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #8 -+ vmovcs d1, r8, r9 -+ vmull.u8 q2, d0, d2 -+ subs r5, #1 -+ vmlal.u8 q2, d1, d3 -+ bne 1b -+ -+ b store_tran_8x8_8 -+ -+ -+.macro ADRT reg, val -+@ adr in T32 has enough range but not in A32 -+A adrl \reg, \val -+T adr \reg, \val -+.endm -+ -+@ ff_hevc_rpi_pred_angular_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_4_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ ldr lr, [r2], #1 @ Top -+ rsb r12, r6, #32 -+ vmov s0, lr -+ vdup.8 d3, r6 -+ ldr lr, [r2], #1 -+ vdup.8 d2, r12 -+ vmov s2, lr -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r2], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ vdup.8 d3, r6 -+ mov r5, #2 -+1: -+ vrshrn.u16 d20, q2, #5 -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.64 q8, q8, q9, #1 -+ it mi -+ vmovmi s0, lr -+ vext.64 q9, q9, q10, #1 -+ it mi -+ ldrmi lr, [r2], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ -+ vrshrn.u16 d20, q2, #5 -+ vmull.u8 q2, d0, d2 -+ add r12, r0, r3 -+ vmlal.u8 q2, d1, d3 -+ lsl r3, #1 -+ vext.64 q8, q8, q9, #1 -+ vext.64 q9, q9, q10, #1 -+ vrshrn.u16 d20, q2, #5 -+ -+98: -+ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3 -+ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3 -+ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0] -+ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12] -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ rsb r12, r6, #32 -+ ldr lr, [r2] @ Left -+ ldrb r2, [r2, #-1] @ Top-left -+ vmov s0, lr -+ vdup.8 d2, r12 -+ vdup.8 d3, r6 -+ orr lr, r2, lr, lsl #8 -+ vmov s2, lr -+ sub r8, r7, #128 -+ mov r5, #3 -+2: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+T it mi -+ addmi r12, #32 -+T asr r6, r8, #8 -+T it mi -+T ldrbmi r2, [r1, r6] -+A ldrbmi r2, [r1, r8, asr #8] -+ rsb r6, r12, #32 -+ vdup.8 d2, r12 -+ ittt mi -+ vmovmi s0, lr -+ orrmi lr, r2, lr, lsl #8 -+ vmovmi s2, lr -+ vrshrn.u16 d20, q2, #5 -+ vdup.8 d3, r6 -+ it mi -+ addmi r8, r7 -+ subs r5, #1 -+ vext.64 q8, q8, q9, #1 -+ vext.64 q9, q9, q10, #1 -+ bne 2b -+ -+ vmull.u8 q2, d0, d2 -+ add r12, r0, r3 -+ vmlal.u8 q2, d1, d3 -+ lsl r3, #1 -+ vrshrn.u16 d20, q2, #5 -+ b 98b -+ -+@ Left of vertical - works down left -+18: -+ ldrh r7, [r7] -+ rsb r12, r6, #32 -+ ldr lr, [r1] @ Top -+ ldrb r1, [r2, #-1] @ Top-left -+ vmov s0, lr -+ 
vdup.8 d2, r12 -+ vdup.8 d3, r6 -+ orr lr, r1, lr, lsl #8 -+ vmov s2, lr -+ sub r8, r7, #128 -+ mov r5, #3 -+2: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+T it mi -+ addmi r12, #32 -+T asr r6, r8, #8 -+T it mi -+T ldrbmi r1, [r2, r6] -+A ldrbmi r1, [r2, r8, asr #8] -+ rsb r6, r12, #32 -+ vdup.8 d2, r12 -+ ittt mi -+ vmovmi s0, lr -+ orrmi lr, r1, lr, lsl #8 -+ vmovmi s2, lr -+ vrshrn.u16 d4, q2, #5 -+ vdup.8 d3, r6 -+ it mi -+ addmi r8, r7 -+ subs r5, #1 -+ vst1.32 {d4[0]}, [r0], r3 -+ bne 2b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.32 {d4[0]}, [r0] -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldr lr, [r1], #1 @ Top -+ rsb r12, r6, #32 -+ vmov s0, lr -+ vdup.8 d3, r6 -+ ldr lr, [r1], #1 -+ vdup.8 d2, r12 -+ vmov s2, lr -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r1], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ vdup.8 d3, r6 -+ mov r5, #2 -+1: -+ vrshrn.u16 d6, q2, #5 -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vst1.32 {d6[0]}, [r0], r3 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r1], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ -+ vrshrn.u16 d6, q2, #5 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vst1.32 {d6[0]}, [r0], r3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.32 {d6[0]}, [r0] -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+ -+@ ff_hevc_rpi_pred_angular_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_8_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_8x8_8 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_8x8_8 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrb lr, [r2, #-1] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl r9, r9, #8 -+ vdup.8 d2, r12 -+ orr r9, r9, r8, lsr #24 -+ orr r8, lr, r8, lsl #8 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #7 -+1: -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ subs r12, r12, r4 -+ vmlal.u8 q2, d1, d3 -+ ittt mi -+ addmi lr, r2, r1, asr #8 -+ addmi r12, r12, #32 -+ vmovmi d0, r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #8 -+ ldrbmi lr, [lr] -+ vdup.8 d2, r12 -+ vrshrn.u16 d4, q2, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #24 -+ orrmi r8, lr, r8, lsl #8 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.8 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.8 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ mov r5, #7 -+ lsr r8, #8 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #24 -+ ldr r9, [r1, #5]! 
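-+ @ (the reference window lives in the GP pair r8:r9: each step shifts it
-+ @ down a byte and tops it up with a single overlapping word load,
-+ @ rather than reloading a whole vector)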
-+ vmov d1, r8, r9 -+1: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #8 -+ vdup.8 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #24 -+ ldrmi r9, [r1, #1]! -+ vrshrn.u16 d6, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.8 d3, r6 -+ subs r5, #1 -+ vst1.8 {d6}, [r0], r3 -+ bne 1b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.8 {d6}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_16_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ -+ add r2, r1, #8 @ restore r2, but 8 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #3 -+ -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #3 -+ sub r10, r10, r7, lsl #3 -+ -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q9}, [r1] -+ sub r1, r2, #1 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.8 d6, r6 -+ vext.8 q8, q9, q9, #15 -+ sub r8, r7, #128 -+ vld1.8 {d16[0]}, [r1] -+ vdup.8 d7, r12 -+ mov r5, #15 -+1: -+ vmull.u8 q0, d18, d7 -+ subs r12, r4 -+ vmlal.u8 q0, d16, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d19, d7 -+ it cc -+ addcc r1, r2, r8, asr #8 -+ vmlal.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vext.8 q10, q8, q8, #15 -+ sub r5, #1 -+ vld1.8 {d20[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d22, d7 -+ subs r12, r4 -+ vmlal.u8 q0, d20, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d23, d7 -+ it cc -+ addcc r1, r2, r8, asr #8 -+ vmlal.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vext.8 q8, q10, q10, #15 -+ sub r5, #1 -+ vld1.8 {d16[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d22, d7 -+ vmlal.u8 q0, d20, d6 -+ vmull.u8 q1, d23, d7 -+ vmlal.u8 q1, d21, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d18, d7 -+ vmlal.u8 q0, d16, d6 -+ vmull.u8 q1, d19, d7 -+ vmlal.u8 q1, d17, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.8 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vext.8 q8, q9, q9, #1 -+ vld1.8 {d17[7]}, [r1]! 
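-+ @ (q9 = top[0..15], q8 = top[1..16]; each row computed below is the
-+ @ standard HEVC two-tap angular filter, roughly, in scalar C:
-+ @     pred[x] = ((32 - frac) * ref[x] + frac * ref[x + 1] + 16) >> 5;
-+ @ with the +16 rounding folded into vrshrn.u16 #5)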
-+ mov r5, #15 -+1: -+ vmull.u8 q0, d16, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d18, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d19, d7 -+ sub r5, #1 -+ vext.8 q10, q8, q8, #1 -+ teq r5, #0 -+ vld1.8 {d21[7]}, [r1] -+ it cc -+ addcc r1, #1 -+ vmov q11, q8 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d20, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d22, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d23, d7 -+ sub r5, #1 -+ vext.8 q8, q10, q10, #1 -+ teq r5, #0 -+ vld1.8 {d17[7]}, [r1] -+ it cc -+ addcc r1, #1 -+ vmov q9, q10 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d20, d6 -+ vmlal.u8 q0, d22, d7 -+ vmull.u8 q1, d21, d6 -+ vmlal.u8 q1, d23, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d16, d6 -+ vmlal.u8 q0, d18, d7 -+ vmull.u8 q1, d17, d6 -+ vmlal.u8 q1, d19, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_32_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ bl patch_h_down_8x8_8_continue -+ bl patch_h_down_8x8_8_continue -+ -+ add r2, r1, #8 @ restore r2, but 8 rows further down left -+ add r1, r1, #8 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #3 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 -+1: -+ push {r2,r10} -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ bl patch_h_up_8x8_8_continue -+ bl patch_h_up_8x8_8_continue -+ pop {r2,r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #3 -+ sub r10, r10, r7, lsl #3 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q0-q1}, [r1] -+ sub r9, r2, #1 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ mov r5, #32 -+1: -+ vld1.8 {d17[7]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ add r9, r2, r8, asr #8 -+ vext.8 q1, q0, q1, #15 -+ vext.8 q0, q8, q0, #15 -+2: -+ vmull.u8 q10, d4, d19 -+ subs r12, r4 -+ vmlal.u8 q10, d0, d18 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d5, d19 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d1, d18 -+ sub r5, #1 -+ vmull.u8 q12, d6, d19 -+ teq r5, #0 -+ vmlal.u8 q12, d2, d18 -+ vmull.u8 q13, d7, d19 -+ vmlal.u8 q13, d3, d18 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.8 {q0-q1}, [r1]! 
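All of the inner loops above compute the same two-tap blend: vmull.u8 by (32 - frac), vmlal.u8 by frac, then a rounding narrow shift by 5 (vrshrn.u16 #5). As a scalar C sketch of one output pel (a hypothetical helper, not part of the patch):

    #include <stdint.h>

    /* One pel of HEVC angular prediction: mix two neighbouring reference
     * pels with weights (32 - frac) and frac; the "+ 16" is the rounding
     * term that vrshrn.u16 #5 supplies implicitly. */
    static inline uint8_t angular_mix(uint8_t a, uint8_t b, unsigned frac)
    {
        return (uint8_t)(((32 - frac) * a + frac * b + 16) >> 5);
    }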
-+ rsb r12, r6, #32 -+ vld1.8 {d16[0]}, [r5] -+ mov r5, #32 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+1: -+ vmov q2, q0 -+ add r1, #1 -+ vmov q3, q1 -+ vext.8 q0, q0, q1, #1 -+ vext.8 q1, q1, q8, #1 -+2: -+ vmull.u8 q10, d0, d18 -+ subs r12, r4 -+ vmlal.u8 q10, d4, d19 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d1, d18 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d5, d19 -+ sub r5, #1 -+ vmull.u8 q12, d2, d18 -+ teq r5, #0 -+ vmlal.u8 q12, d6, d19 -+ vmull.u8 q13, d3, d18 -+ vmlal.u8 q13, d7, d19 -+ vld1.8 {d16[0]}, [r1] -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ Chroma 8 bit 4x4 patch fns -+ .text -+ -+patch_h_down_c_4x4_8: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ lsr r8, #16 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r2, #6]! -+ vmov d1, r8, r9 -+ // drop through... -+patch_h_down_c_4x4_8_continue: -+ mov r5, #4 -+1: -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.8 q8, q8, q9, #8 -+ it mi -+ lsrmi r7, r8, #16 -+ vmov d18, d19 -+ it mi -+ vmovmi d0, r8, r9 -+ vdup.8 d2, r12 -+ it mi -+ orrmi r8, r7, r9, lsl #16 -+ vrshrn.u16 d19, q2, #5 -+ itt mi -+ ldrmi r9, [r2, #2]! -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_c_4x4_8: -+ vzip.16 d16, d17 -+ add r6, r0, r3 -+ vzip.16 d18, d19 -+ lsl r3, #1 -+ vzip.32 q8, q9 -+ add r5, r0, r3 -+ vst1.16 {d16}, [r0]! -+ vst1.16 {d17}, [r6], r3 -+ vst1.16 {d18}, [r5] -+ asr r3, #1 -+ vst1.16 {d19}, [r6] -+ -+ bx lr -+ -+patch_h_up_c_4x4_8: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r4 -+ lsr r11, r8, #16 -+ vdup.8 d2, r6 -+ ldr r8, [r2, #-2]! -+ orr r9, r11, r9, lsl #16 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+patch_h_up_c_4x4_8_continue: -+ mov r5, #4 -+1: -+ add r12, r4 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ mov r11, #0 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #7 -+ it cs -+ vmovcs d0, r8, r9 -+ it eq -+ biceq r11, #1 -+ vdup.8 d2, r6 -+ it cs -+ lsrcs r6, r8, #16 -+ vdup.8 d3, r12 -+ vext.8 q8, q8, q9, #8 -+ itt cs -+ orrcs r9, r6, r9, lsl #16 -+ ldrhcs r11, [r1, r11] -+ vmov d18, d19 -+ it hi -+ ldrhhi r11, [r2, #-2]! 
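The patch_h_down_* / patch_h_up_* helpers above accumulate four predicted lines and then fall into store_tran_*, whose vzip sequence writes the block transposed: the horizontal modes appear to be computed with x and y swapped and only straightened out at store time. A scalar sketch of that store, under that assumption (uint16_t elements cover both the 16-bit UV pairs of the _c_ version and 10-bit pels):

    #include <stdint.h>
    #include <stddef.h>

    /* Equivalent of store_tran_c_4x4_8 / store_tran_4x4_10: tmp holds the
     * 4x4 block with rows and columns swapped; the store transposes it. */
    static void store_transposed_4x4(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t tmp[4][4])
    {
        for (unsigned y = 0; y != 4; y++)
            for (unsigned x = 0; x != 4; x++)
                dst[y * stride + x] = tmp[x][y];
    }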
-+ vrshrn.u16 d19, q2, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #16 -+ vmovcs d1, r8, r9 -+ vmull.u8 q2, d0, d2 -+ subs r5, #1 -+ vmlal.u8 q2, d1, d3 -+ bne 1b -+ -+ b store_tran_c_4x4_8 -+ -+ -+@ ff_hevc_rpi_pred_angular_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_c_4x4_8 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_c_4x4_8 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrh lr, [r2, #-2] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl r9, r9, #16 -+ vdup.8 d2, r12 -+ orr r9, r9, r8, lsr #16 -+ orr r8, lr, r8, lsl #16 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #3 -+1: -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ subs r12, r12, r4 -+ vmlal.u8 q2, d1, d3 -+ itttt mi -+ addmi lr, r2, r1, asr #7 -+ bicmi lr, #1 -+ addmi r12, r12, #32 -+ vmovmi d0, r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #16 -+ ldrhmi lr, [lr] -+ vdup.8 d2, r12 -+ vrshrn.u16 d4, q2, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #16 -+ orrmi r8, lr, r8, lsl #16 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ mov r5, #3 -+ lsr r8, #16 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r1, #6]! -+ vmov d1, r8, r9 -+1: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #16 -+ vdup.8 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #16 -+ ldrmi r9, [r1, #2]! 
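The "up of horizontal" paths walk off the top of the left edge, so missing reference pels are projected from the top row via the inv_angle table (inverse angles scaled by 256 - hence the asr #8 in the luma versions, or the asr #7 plus bic #1 visible just above, which addresses whole 2-byte UV pairs). A hedged, spec-style C sketch of that projection; the assembly keeps a running accumulator rather than multiplying per row, and uses the sign-inverted table noted in the data section:

    /* Index of the projected reference pel for row y (0-based), with the
     * 256-scaled inverse angle; sketch only, rounding as in the +/-128
     * bias seen in the r8/r10 setup. */
    static inline int projected_ref_idx(int y, int inv_angle)
    {
        return ((y + 1) * inv_angle + 128) >> 8;
    }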
-+ vrshrn.u16 d6, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.8 d3, r6 -+ subs r5, #1 -+ vst1.16 {d6}, [r0], r3 -+ bne 1b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.16 {d6}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q9}, [r1] -+ sub r1, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.8 d6, r6 -+ vext.8 q8, q9, q9, #14 -+ sub r8, r7, #128 -+ vld1.16 {d16[0]}, [r1] -+ vdup.8 d7, r12 -+ mov r5, #7 -+1: -+ subs r12, r4 -+ vmull.u8 q0, d18, d7 -+ it cc -+ asrcc r1, r8, #8 -+ vmlal.u8 q0, d16, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d19, d7 -+ it cc -+ addcc r1, r2, r1, lsl #1 -+ vmlal.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vext.8 q10, q8, q8, #14 -+ sub r5, #1 -+ vld1.16 {d20[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ subs r12, r4 -+ vmull.u8 q0, d22, d7 -+ it cc -+ asrcc r1, r8, #8 -+ vmlal.u8 q0, d20, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d23, d7 -+ it cc -+ addcc r1, r2, r1, lsl #1 -+ vmlal.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vext.8 q8, q10, q10, #14 -+ sub r5, #1 -+ vld1.16 {d16[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d22, d7 -+ vmlal.u8 q0, d20, d6 -+ vmull.u8 q1, d23, d7 -+ vmlal.u8 q1, d21, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d18, d7 -+ vmlal.u8 q0, d16, d6 -+ vmull.u8 q1, d19, d7 -+ vmlal.u8 q1, d17, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.8 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vext.8 q8, q9, q9, #2 -+ vld1.16 {d17[3]}, [r1]! 
-+ mov r5, #7 -+1: -+ vmull.u8 q0, d16, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d18, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d19, d7 -+ sub r5, #1 -+ vext.8 q10, q8, q8, #2 -+ teq r5, #0 -+ vld1.16 {d21[3]}, [r1] -+ it cc -+ addcc r1, #2 -+ vmov q11, q8 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d20, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d22, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d23, d7 -+ sub r5, #1 -+ vext.8 q8, q10, q10, #2 -+ teq r5, #0 -+ vld1.16 {d17[3]}, [r1] -+ it cc -+ addcc r1, #2 -+ vmov q9, q10 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d20, d6 -+ vmlal.u8 q0, d22, d7 -+ vmull.u8 q1, d21, d6 -+ vmlal.u8 q1, d23, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d16, d6 -+ vmlal.u8 q0, d18, d7 -+ vmull.u8 q1, d17, d6 -+ vmlal.u8 q1, d19, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ bl patch_h_down_c_4x4_8_continue -+ bl patch_h_down_c_4x4_8_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 -+1: -+ push {r2, r10} -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ bl patch_h_up_c_4x4_8_continue -+ bl patch_h_up_c_4x4_8_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q0-q1}, [r1] -+ sub r9, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ mov r5, #16 -+1: -+ vld1.16 {d17[3]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.8 q1, q0, q1, #14 -+ add r9, r2, r9, lsl #1 -+ vext.8 q0, q8, q0, #14 -+2: -+ vmull.u8 q10, d4, d19 -+ subs r12, r4 -+ vmlal.u8 q10, d0, d18 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d5, d19 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d1, d18 -+ sub r5, #1 -+ vmull.u8 q12, d6, d19 -+ teq r5, #0 -+ vmlal.u8 q12, d2, d18 -+ vmull.u8 q13, d7, d19 -+ vmlal.u8 q13, d3, d18 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - 
left unused -+26: -+ add r5, r1, #32 -+ vld1.8 {q0-q1}, [r1]! -+ rsb r12, r6, #32 -+ vld1.16 {d16[0]}, [r5] -+ mov r5, #16 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+1: -+ vmov q2, q0 -+ add r1, #2 -+ vmov q3, q1 -+ vext.8 q0, q0, q1, #2 -+ vext.8 q1, q1, q8, #2 -+2: -+ vmull.u8 q10, d0, d18 -+ subs r12, r4 -+ vmlal.u8 q10, d4, d19 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d1, d18 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d5, d19 -+ sub r5, #1 -+ vmull.u8 q12, d2, d18 -+ teq r5, #0 -+ vmlal.u8 q12, d6, d19 -+ vmull.u8 q13, d3, d18 -+ vmlal.u8 q13, d7, d19 -+ vld1.16 {d16[0]}, [r1] -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+@------------------------------------------------------------------------------ -+@ Data -+ -+ .text -+ .balign 64 -+angle_2: -+ .byte 32 -+ .byte 26, 21, 17, 13, 9, 5, 2, 0 -+ @ Sign inverted from standards table -+ .byte 2, 5, 9, 13, 17, 21, 26, 32 -+ .byte 26, 21, 17, 13, 9, 5, 2, 0 -+ @ Standard sign -+ .byte 2, 5, 9, 13, 17, 21, 26, 32 -+ -+ .balign 2 -+ -+ @ Sign inverted from standards table -+inv_angle: -+ .short 4096, 1638, 910, 630, 482, 390, 315 -+ .short 256 -+ .short 315, 390, 482, 630, 910, 1638, 4096 -+ -+@------------------------------------------------------------------------------ -+@ -+@ 10 bit fns -+@ Should work for 9 & 11 bit as there is no actual bit-depth specific code -+@ but runs out of register width for 12+ bit -+ -+ .text -+ .balign 64 -+ -+patch_h_down_4x4_10: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r6 -+ lsr r8, #16 -+ vdup.16 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r2, #6]! -+ vmov d1, r8, r9 -+ // drop through... -+patch_h_down_4x4_10_continue: -+ mov r5, #4 -+1: -+ subs r12, r4 -+ vmul.u16 d4, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmla.u16 d4, d1, d3 -+ rsb r6, r12, #32 -+ vext.16 q8, q8, q9, #4 -+ it mi -+ lsrmi r7, r8, #16 -+ vmov d18, d19 -+ it mi -+ vmovmi d0, r8, r9 -+ vdup.16 d2, r12 -+ it mi -+ orrmi r8, r7, r9, lsl #16 -+ vrshr.u16 d19, d4, #5 -+ itt mi -+ ldrmi r9, [r2, #2]! -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.16 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_4x4_10: -+ vzip.16 d16, d17 -+ add r6, r0, r3 -+ vzip.16 d18, d19 -+ lsl r3, #1 -+ vzip.32 q8, q9 -+ add r5, r0, r3 -+ vst1.16 {d16}, [r0]! -+ vst1.16 {d17}, [r6], r3 -+ vst1.16 {d18}, [r5] -+ asr r3, #1 -+ vst1.16 {d19}, [r6] -+ -+ bx lr -+ -+patch_h_up_4x4_10: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r4 -+ lsr r11, r8, #16 -+ vdup.16 d2, r6 -+ ldr r8, [r2, #-2]! -+ orr r9, r11, r9, lsl #16 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmul.u16 d4, d0, d2 -+ vmla.u16 d4, d1, d3 -+patch_h_up_4x4_10_continue: -+ mov r5, #4 -+1: -+ add r12, r4 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ mov r11, #0 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #7 -+ it cs -+ vmovcs d0, r8, r9 -+ it eq -+ biceq r11, #1 -+ vdup.16 d2, r6 -+ it cs -+ lsrcs r6, r8, #16 -+ vdup.16 d3, r12 -+ vext.16 q8, q8, q9, #4 -+ itt cs -+ orrcs r9, r6, r9, lsl #16 -+ ldrhcs r11, [r1, r11] -+ vmov d18, d19 -+ it hi -+ ldrhhi r11, [r2, #-2]! 
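The "10 bit fns" comment above claims these functions are good to 11-bit but not 12-bit pels, and the arithmetic bears that out: where the 8-bit code needed widening vmull.u8/vmlal.u8, the 10-bit code uses plain vmul.u16/vmla.u16, and since the two weights always sum to 32 the accumulator tops out at 32 * max_pel. A compile-time check of that headroom (C11, illustration only):

    #include <stdint.h>

    /* Worst case of (32 - frac) * a + frac * b is 32 * max_pel, because
     * the weights sum to 32.  u16 accumulator headroom by bit depth: */
    _Static_assert(32 * 1023 <= UINT16_MAX,    "10-bit: 32736, fits");
    _Static_assert(32 * 2047 <= UINT16_MAX,    "11-bit: 65504, still fits");
    _Static_assert(!(32 * 4095 <= UINT16_MAX), "12-bit: 131040, overflows");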
-+ vrshr.u16 d19, d4, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #16 -+ vmovcs d1, r8, r9 -+ vmul.u16 d4, d0, d2 -+ subs r5, #1 -+ vmla.u16 d4, d1, d3 -+ bne 1b -+ -+ b store_tran_4x4_10 -+ -+ -+@ ff_hevc_rpi_pred_angular_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_4_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_4x4_10 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_4x4_10 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrh lr, [r2, #-2] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl r9, r9, #16 -+ vdup.16 d2, r12 -+ orr r9, r9, r8, lsr #16 -+ orr r8, lr, r8, lsl #16 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #3 -+1: -+ sel lr, lr, lr @ force pipeline 0 on Cortex-A53 -+ vdup.16 d3, r6 -+ vmul.u16 d4, d0, d2 -+ subs r12, r12, r4 -+ vmla.u16 d4, d1, d3 -+ itttt mi -+ addmi lr, r2, r1, asr #7 -+ bicmi lr, #1 -+ addmi r12, r12, #32 -+ vmovmi d0, r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #16 -+ ldrhmi lr, [lr] -+ vdup.16 d2, r12 -+ vrshr.u16 d4, d4, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #16 -+ orrmi r8, lr, r8, lsl #16 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.16 d3, r6 -+ nop @ force next insn into pipeline 0 to enable -+ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53 -+ vmla.u16 d4, d1, d3 -+ vrshr.u16 d4, d4, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r6 -+ lsr r8, #16 -+ vdup.16 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r1, #6]! -+ vmov d1, r8, r9 -+ mov r5, #3 -+1: -+ vmul.u16 d4, d0, d2 -+ subs r12, r4 -+ vmla.u16 d4, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #16 -+ vdup.16 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #16 -+ ldrmi r9, [r1, #2]! 
-+ vrshr.u16 d4, d4, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.16 d3, r6 -+ subs r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vmul.u16 d4, d0, d2 -+ vmla.u16 d4, d1, d3 -+ vrshr.u16 d4, d4, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_8_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q9}, [r1] -+ sub r1, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.16 q2, r6 -+ vext.16 q8, q9, q9, #7 -+ sub r8, r7, #128 -+ vld1.16 {d16[0]}, [r1] -+ vdup.16 q3, r12 -+ mov r5, #7 -+1: -+ vmul.u16 q0, q9, q3 -+ subs r12, r4 -+ vmla.u16 q0, q8, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #1 -+ vext.16 q10, q8, q8, #7 -+ rsb r6, r12, #32 -+ vmov q11, q8 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.16 {d20[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q11, q3 -+ subs r12, r4 -+ vmla.u16 q0, q10, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #1 -+ vext.16 q8, q10, q10, #7 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.16 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q11, q3 -+ vmla.u16 q0, q10, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q9, q3 -+ vmla.u16 q0, q8, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.16 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vext.16 q8, q9, q9, #1 -+ vld1.16 {d17[3]}, [r1]! 
-+ mov r5, #7 -+1: -+ vmul.u16 q0, q8, q2 -+ subs r12, r4 -+ vmla.u16 q0, q9, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q10, q8, q8, #1 -+ rsb r6, r12, #32 -+ vld1.16 {d21[3]}, [r1] -+ sub r5, #1 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #2 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q10, q2 -+ subs r12, r4 -+ vmla.u16 q0, q11, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q8, q10, q10, #1 -+ rsb r6, r12, #32 -+ vld1.16 {d17[3]}, [r1] -+ sub r5, #1 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #2 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q10, q2 -+ vmla.u16 q0, q11, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q8, q2 -+ vmla.u16 q0, q9, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_16_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 -+1: -+ push {r2, r10} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q0-q1}, [r1] -+ sub r9, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ mov r5, #16 -+1: -+ vld1.16 {d17[3]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.16 q1, q0, q1, #7 -+ add r9, r2, r9, lsl #1 -+ vext.16 q0, q8, q0, #7 -+2: -+ vmul.u16 q11, q2, q10 -+ subs r12, r4 -+ vmla.u16 q11, q0, q9 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q3, q10 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q1, q9 -+ sub r5, #1 -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q0-q1}, [r1]! 
-+ rsb r12, r6, #32 -+ vld1.16 {d16[0]}, [r5] -+ mov r5, #16 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+1: -+ vmov q2, q0 -+ add r1, #2 -+ vmov q3, q1 -+ vext.16 q0, q0, q1, #1 -+ vext.16 q1, q1, q8, #1 -+2: -+ vmul.u16 q11, q0, q9 -+ subs r12, r4 -+ vmla.u16 q11, q2, q10 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q1, q9 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q3, q10 -+ sub r5, #1 -+ vld1.16 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_32_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ vpush {d8} -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ add sp, #8 -+ mov r10, #8 -+ mov r1, r2 -+1: -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #64 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ add sp, #8 -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<6 -+1: -+ push {r2, r10} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #64 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ sub r9, r2, #2 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vmov d0, d9 -+ vmov s2, r12 -+ add r10, r0, #32 -+ mov r5, #32 -+1: -+ vld1.16 {d1[3]}, [r9] -+ add r8, r7 -+ vmov q11, q4 -+ vmov q10, q3 -+ asr r9, r8, #8 -+ vmov q9, q2 -+ add r9, r2, r9, lsl #1 -+ vmov q8, q1 -+ vext.16 q4, q3, q4, #7 -+ vext.16 q3, q2, q3, #7 -+ vext.16 q2, q1, q2, #7 -+ vext.16 q1, q0, q1, #7 -+2: -+ vmul.u16 q12, q8, d1[1] -+ adds r12, r4 -+ vmla.u16 q12, q1, d1[0] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q9, d1[1] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q2, d1[0] -+ sub r5, #1 -+ vmul.u16 q14, q10, d1[1] -+ teq r5, #0 -+ vmla.u16 q14, q3, d1[0] -+ vmul.u16 q15, q11, d1[1] -+ vmla.u16 q15, q4, d1[0] -+ vmov s2, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d0 -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, 
[r5] -+ add r1, r1, #64 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ vmov d1, d9 -+ vmov s1, r12 -+ add r10, r0, #32 -+ mov r5, #32 -+1: -+ vld1.16 {d0[0]}, [r1]! -+ vmov q8, q1 -+ vmov q9, q2 -+ vmov q10, q3 -+ vmov q11, q4 -+ vext.16 q1, q1, q2, #1 -+ vext.16 q2, q2, q3, #1 -+ vext.16 q3, q3, q4, #1 -+ vext.16 q4, q4, q0, #1 -+2: -+ vmul.u16 q12, q1, d0[2] -+ adds r12, r4 -+ vmla.u16 q12, q8, d0[3] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q2, d0[2] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q9, d0[3] -+ sub r5, #1 -+ vmul.u16 q14, q3, d0[2] -+ teq r5, #0 -+ vmla.u16 q14, q10, d0[3] -+ vmul.u16 q15, q4, d0[2] -+ vmla.u16 q15, q11, d0[3] -+ vmov s1, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d1 -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+ -+@ Generate 4x4 chroma patch -+@ -+@ In (const) -+@ r1 Up ptr (_up only) -+@ r3 Out stride -+@ r4 Angle add -+@ r7 Inv angle (_up only) -+@ -+@ In/Out (updated) -+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) -+@ r2 Left ptr - updated -+@ r6 Angle frac (init to r4 + 32) -+@ r8 Inv angle accumulator -+@ q2 Cur Line - load before 1st call for down - set by _up -+@ q8 Cur Line - load before 1st call for up - set by _down -+@ -+@ Temps -+@ r5 Loop counter -+@ r12 -+@ d0, q1, q12-q15 -+ -+patch_h_down_c_4x4_10: -+ vld1.16 {q12}, [r2]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ mov r5, #4 -+1: -+ vmov q13, q12 -+ vext.16 q12, q12, q12, #2 -+ vld1.32 {d25[1]}, [r2]! -+patch_h_down_c_4x4_10_continue: -+2: -+ vmov q8, q9 -+ subs r12, r4 -+ vmul.u16 q0, q13, q3 -+ it cc -+ addcc r12, #32 -+ vmla.u16 q0, q12, q2 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vmov q10, q11 -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vrshr.u16 q11, q0, #5 -+ bhi 2b -+ bne 1b -+ -+ bcs 3f -+ vmov q13, q12 -+ vext.16 q12, q12, q12, #2 -+ vld1.32 {d25[1]}, [r2]! -+3: -+ -+store_tran_c_4x4_10: -+T add r6, r0, r3 -+ vzip.32 q8, q10 -+A add r6, r0, r3 -+T lsl r3, #1 -+ vzip.32 q9, q11 -+A add r5, r0, r3, lsl #1 -+T add r5, r0, r3 -+ vst2.32 {d16,d18}, [r0]! -+A lsl r3, #1 -+ vst2.32 {d17,d19}, [r6], r3 -+ asr r3, #1 -+ vst2.32 {d20,d22}, [r5] -+ mov r5, #4 -+ vst2.32 {d21,d23}, [r6] -+ bx lr -+ -+patch_h_up_c_4x4_10: -+ vld1.16 {q1}, [r2] -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ mov r5, #4 -+1: -+ adds r8, r7 -+ vmov q12, q1 -+ it mi -+ ldrmi r6, [r2, #-4]! -+ vext.16 q1, q1, q1, #6 -+ itt pl -+ asrpl r6, r8, #8 -+ ldrpl r6, [r1, r6, lsl #2] -+ vmov s4, r6 -+patch_h_up_c_4x4_10_continue: -+2: -+ vmov q8, q9 -+ subs r12, r4 -+ vmul.u16 q0, q12, q3 -+ it cc -+ addcc r12, #32 -+ vmla.u16 q0, q1, q2 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vmov q10, q11 -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vrshr.u16 q11, q0, #5 -+ bhi 2b -+ bne 1b -+ -+ bcs store_tran_c_4x4_10 -+ adds r8, r7 -+ vmov q12, q1 -+ it mi -+ ldrmi r6, [r2, #-4]! 
-+ vext.16 q1, q1, q1, #6 -+ itt pl -+ asrpl r6, r8, #8 -+ ldrpl r6, [r1, r6, lsl #2] -+ vmov s4, r6 -+ b store_tran_c_4x4_10 -+ -+ -+@ ff_hevc_rpi_pred_angular_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_c_4x4_10 -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+ bl patch_h_up_c_4x4_10 -+ pop {r4-r8, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q9}, [r1] -+ sub r1, r2, #4 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.16 q2, r6 -+ vext.16 q8, q9, q9, #6 -+ sub r8, r7, #128 -+ vld1.32 {d16[0]}, [r1] -+ vdup.16 q3, r12 -+ mov r5, #3 -+1: -+ vmul.u16 q0, q9, q3 -+ subs r12, r4 -+ vmla.u16 q0, q8, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #2 -+ vext.16 q10, q8, q8, #6 -+ rsb r6, r12, #32 -+ vmov q11, q8 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.32 {d20[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q11, q3 -+ subs r12, r4 -+ vmla.u16 q0, q10, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #2 -+ vext.16 q8, q10, q10, #6 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.32 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q11, q3 -+ vmla.u16 q0, q10, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q9, q3 -+ vmla.u16 q0, q8, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.16 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vext.16 q8, q9, q9, #2 -+ vld1.32 {d17[1]}, [r1]! 
-+ mov r5, #3 -+1: -+ vmul.u16 q0, q8, q2 -+ subs r12, r4 -+ vmla.u16 q0, q9, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q10, q8, q8, #2 -+ rsb r6, r12, #32 -+ vld1.32 {d21[1]}, [r1] -+ sub r5, #1 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #4 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q10, q2 -+ subs r12, r4 -+ vmla.u16 q0, q11, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q8, q10, q10, #2 -+ rsb r6, r12, #32 -+ vld1.32 {d17[1]}, [r1] -+ sub r5, #1 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #4 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q10, q2 -+ vmla.u16 q0, q11, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q8, q2 -+ vmla.u16 q0, q9, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_c_4x4_10 -+ bl patch_h_down_c_4x4_10_continue -+ -+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left -+ sub r0, #32 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_down_c_4x4_10 -+ bl patch_h_down_c_4x4_10_continue -+ -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+ -+ push {r2, r8} -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ pop {r2, r8} -+ -+ sub r0, #32 -+ mov r6, r4 -+ add r2, #16 -+ sub r8, r8, r7, lsl #2 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ -+ pop {r4-r8, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q0-q1}, [r1] -+ sub r9, r2, #4 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ mov r5, #8 -+1: -+ vld1.32 {d17[1]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.16 q1, q0, q1, #6 -+ add r9, r2, r9, lsl #2 -+ vext.16 q0, q8, q0, #6 -+2: -+ vmul.u16 q11, q2, q10 -+ subs r12, r4 -+ vmla.u16 q11, q0, q9 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q3, q10 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q1, q9 -+ sub r5, #1 -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q0-q1}, [r1]! 
-+ rsb r12, r6, #32 -+ vld1.32 {d16[0]}, [r5] -+ mov r5, #8 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+1: -+ vmov q2, q0 -+ add r1, #4 -+ vmov q3, q1 -+ vext.16 q0, q0, q1, #2 -+ vext.16 q1, q1, q8, #2 -+2: -+ vmul.u16 q11, q0, q9 -+ subs r12, r4 -+ vmla.u16 q11, q2, q10 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q1, q9 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q3, q10 -+ sub r5, #1 -+ vld1.32 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r10, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ vpush {d8} -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ add sp, #8 -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_c_4x4_10 -+ bl patch_h_down_c_4x4_10_continue -+ bl patch_h_down_c_4x4_10_continue -+ bl patch_h_down_c_4x4_10_continue -+ -+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*4 -+ mov r6, r4 -+ sub r0, #64 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r10, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ add sp, #8 -+ mov r10, #4 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+2: -+ push {r2, r8} -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ bl patch_h_up_c_4x4_10_continue -+ bl patch_h_up_c_4x4_10_continue -+ pop {r2, r8} -+ -+ sub r0, #64 -+ mov r6, r4 -+ add r2, #16 -+ sub r8, r8, r7, lsl #2 -+ add r0, r0, r3, lsl #2 -+ subs r10, #1 -+ bne 2b -+ -+ pop {r4-r10, pc} -+ -+@ Left of vertical - works down left -+18: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ sub r9, r2, #4 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vmov d0, d9 -+ vmov s2, r12 -+ add r10, r0, #32 -+ mov r5, #16 -+1: -+ vld1.32 {d1[1]}, [r9] -+ add r8, r7 -+ vmov q11, q4 -+ vmov q10, q3 -+ asr r9, r8, #8 -+ vmov q9, q2 -+ add r9, r2, r9, lsl #2 -+ vmov q8, q1 -+ vext.16 q4, q3, q4, #6 -+ vext.16 q3, q2, q3, #6 -+ vext.16 q2, q1, q2, #6 -+ vext.16 q1, q0, q1, #6 -+2: -+ vmul.u16 q12, q8, d1[1] -+ adds r12, r4 -+ vmla.u16 q12, q1, d1[0] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q9, d1[1] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q2, d1[0] -+ sub r5, #1 -+ vmul.u16 q14, q10, d1[1] -+ teq r5, #0 -+ vmla.u16 q14, q3, d1[0] -+ vmul.u16 q15, q11, d1[1] -+ vmla.u16 q15, q4, d1[0] -+ vmov s2, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d0 -+ pop {r4-r10, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ add r1, r1, #64 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ vmov d1, d9 -+ vmov s1, r12 -+ add r10, r0, #32 -+ mov r5, #16 -+1: -+ vld1.32 {d0[0]}, [r1]! 
-+ vmov q8, q1 -+ vmov q9, q2 -+ vmov q10, q3 -+ vmov q11, q4 -+ vext.16 q1, q1, q2, #2 -+ vext.16 q2, q2, q3, #2 -+ vext.16 q3, q3, q4, #2 -+ vext.16 q4, q4, q0, #2 -+2: -+ vmul.u16 q12, q1, d0[2] -+ adds r12, r4 -+ vmla.u16 q12, q8, d0[3] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q2, d0[2] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q9, d0[3] -+ sub r5, #1 -+ vmul.u16 q14, q3, d0[2] -+ teq r5, #0 -+ vmla.u16 q14, q10, d0[3] -+ vmul.u16 q15, q4, d0[2] -+ vmla.u16 q15, q11, d0[3] -+ vmov s1, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d1 -+ pop {r4-r10, pc} -+ -+endfunc -diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S -new file mode 100644 -index 0000000000..75a1789c25 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S -@@ -0,0 +1,695 @@ -+/* -+ * Copyright (c) 2017 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ -+@ ff_hevc_rpi_pred_dc_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_4_neon_8, export=1 -+ -+ @ Average the els of top & left -+ ldr r2, [r2] -+ vld1.32 {d0[0]}, [r1] -+ mov r1, #2 -+ vmov s1, r2 -+ vmov s2, r2 -+ vmov.i16 q2, #3 -+ add r2, r0, r3 -+ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] -+ lsl r3, #1 -+ vmovl.u8 q0, d0 -+ vmov.i64 d7, #0xffff -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) -+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vmov.i64 d7, #0xff -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #3 -+ vmla.i16 q0, q2, d6[0] -+ vdup.8 d6, d6[0] -+ vrshrn.i16 d0, q0, #2 -+ -+ @ Store top line -+ vst1.32 {d0[0]}, [r0], r3 -+ -+ @ Store the rest -+ vshr.u64 d1, d0, #5*8 -+ vshr.u64 d2, d0, #6*8 -+ vshr.u64 d3, d0, #7*8 -+ vbif d1, d6, d7 -+ vbif d2, d6, d7 -+ vst1.32 {d1[0]}, [r2], r3 -+ vbif d3, d6, d7 -+ vst1.32 {d2[0]}, [r0] -+ vst1.32 {d3[0]}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {d0}, [r1] -+ vld1.8 {d1}, [r2] -+A add r2, r0, r3, lsl #1 -+A lsl r3, #2 -+T lsl r3, #1 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vaddl.u8 q0, d0, d1 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d2, d0, d0 @ This adds U & V separately -+ vpadd.i32 d3, d0, d0 -+ vrshrn.u16 d0, q1, #3 -+ -+ @ Store -+ vst1.8 {d0}, [r0], r3 -+ vst1.8 {d0}, [r2], r3 -+ vst1.8 {d0}, [r0] -+ vst1.8 {d0}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_8_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {d0}, [r1] -+ mov r1, #2 -+ vld1.8 {d16}, [r2] -+ vmov.i16 q2, #3 -+ vmov.i64 d7, #0xffff -+ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] -+ vmovl.u8 q0, d0 -+ vadd.i16 d6, d2, d3 @ d6 has 4 vals -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
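The DC functions implement exactly the rule spelled out in the comments above: dc is the rounded mean of the top and left edges (the vrshr by log2(2N)), the top row is smoothed as (top[i] + 3*dc + 2) >> 2, the left column likewise, and the corner as (top[0] + left[0] + 2*dc + 2) >> 2. A scalar C sketch of the 4x4 luma case (hypothetical helper name):

    #include <stdint.h>
    #include <stddef.h>

    static void dc_pred_4x4(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *top, const uint8_t *left)
    {
        unsigned sum = 4;                      /* rounding term for >> 3 */
        for (int i = 0; i < 4; i++)
            sum += top[i] + left[i];
        unsigned dc = sum >> 3;                /* mean of the 8 edge pels */

        dst[0] = (top[0] + left[0] + 2 * dc + 2) >> 2;      /* corner */
        for (int x = 1; x < 4; x++)
            dst[x] = (top[x] + 3 * dc + 2) >> 2;            /* top row */
        for (int y = 1; y < 4; y++) {
            dst[y * stride] = (left[y] + 3 * dc + 2) >> 2;  /* left column */
            for (int x = 1; x < 4; x++)
                dst[y * stride + x] = dc;                   /* flat interior */
        }
    }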
-+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vmov.i64 d7, #0xff -+ vmovl.u8 q1, d16 -+ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #4 -+ vmla.i16 q1, q2, d6[0] -+ vmla.i16 q0, q2, d6[0] -+ vdup.8 d6, d6[0] -+ vrshrn.i16 d2, q1, #2 -+ vrshrn.i16 d0, q0, #2 -+ -+ @ Store top line -+ vst1.8 {d0}, [r0], r3 -+ -+ @ Store the rest -+ vshr.u64 d2, #8 -+ vbit d6, d2, d7 -+ vshr.u64 d2, #8 -+ vst1.8 {d6}, [r0], r3 -+ mov r1, #6 -+1: -+ vbit d6, d2, d7 -+ vshr.u64 d2, #8 -+ vst1.8 {d6}, [r0], r3 -+ subs r1, #2 -+ vbit d6, d2, d7 -+ vshr.u64 d2, #8 -+ vst1.8 {d6}, [r0], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0}, [r1] -+ mov r1, #8 -+ vld1.8 {q1}, [r2] -+T lsl r3, #1 -+ vaddl.u8 q0, d0, d1 -+A add r2, r0, r3, lsl #1 -+A lsl r3, #2 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vaddl.u8 q1, d2, d3 -+ vadd.i16 q1, q0 -+ vadd.i16 d3, d2 @ d3 has 2 val pairs -+ vpadd.i32 d2, d3, d3 @ This add U & V separately -+ vpadd.i32 d3, d3, d3 -+ vrshrn.u16 d0, q1, #4 -+ vrshrn.u16 d1, q1, #4 -+ -+ @ Store -+1: -+ vst1.8 {q0}, [r0], r3 -+ subs r1, #4 -+ vst1.8 {q0}, [r2], r3 -+ vst1.8 {q0}, [r0], r3 -+ vst1.8 {q0}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_16_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q8}, [r1] -+ mov r1, #2 -+ vld1.8 {q9}, [r2] -+ vaddl.u8 q10, d16, d17 -+ vaddl.u8 q11, d16, d18 -+ vaddl.u8 q0, d18, d19 -+ vmov.i16 q1, #3 -+ vadd.i16 q10, q0 -+ vmovl.u8 q0, d18 -+ vadd.i16 d20, d21 -+ vmov.i16 d2[0], r1 @ 2, 3, 3, 3... 
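The _c_ (chroma) DC variants differ in two ways, both visible above: the vpadd.i32 pairwise adds keep the interleaved U and V sums separate, and no edge smoothing is applied - the per-channel mean is stored as-is. A sketch under those assumptions (stride in bytes, helper name hypothetical):

    #include <stdint.h>
    #include <stddef.h>

    /* 4x4 chroma DC over interleaved CbCr, as in dc_c_4_neon_8 above:
     * separate means for U and V, no first-row/column filtering. */
    static void dc_pred_c_4x4(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *top, const uint8_t *left)
    {
        unsigned su = 4, sv = 4;               /* rounding terms for >> 3 */
        for (int i = 0; i < 8; i += 2) {
            su += top[i]     + left[i];
            sv += top[i + 1] + left[i + 1];
        }
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++) {
                dst[y * stride + 2 * x]     = (uint8_t)(su >> 3);
                dst[y * stride + 2 * x + 1] = (uint8_t)(sv >> 3);
            }
    }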
-+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vmovl.u8 q2, d16 -+ vmovl.u8 q9, d19 -+ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same) -+ vmov.i64 d7, #0xffff -+ vmovl.u8 q8, d17 -+ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7] -+ vmov.i64 d7, #0xff -+ vpadd.i16 d20, d20 @ 1 (all the same) -+ vrshr.u16 d21, d20, #5 -+ vrshr.u16 d20, d20, #5 -+ vmla.i16 q0, q10, d2[1] -+ vmla.i16 q9, q10, d2[1] -+ vmla.i16 q2, q10, q1 -+ vmla.i16 q8, q10, d2[1] -+ vdup.8 q1, d20[0] -+ vrshrn.i16 d0, q0, #2 -+ vrshrn.i16 d1, q9, #2 -+ vrshrn.i16 d4, q2, #2 -+ vrshrn.i16 d5, q8, #2 -+ vext.8 q0, q0, q0, #1 -+ -+ @ Store top line -+ vst1.8 {q2}, [r0], r3 -+ -+ @ Store the rest -+ mov r1, #15 -+1: -+ vbit d2, d0, d7 -+ vext.8 q0, q0, q0, #1 -+ subs r1, #1 -+ vst1.8 {q1}, [r0], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0-q1}, [r1] -+ mov r1, #16 -+ vld1.8 {q2-q3}, [r2] -+T lsl r3, #1 -+ vaddl.u8 q0, d0, d1 -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vaddl.u8 q1, d2, d3 -+A lsl r3, #2 -+T lsl r3, #1 -+ vaddl.u8 q2, d4, d5 -+ vaddl.u8 q3, d6, d7 -+ vadd.i16 q0, q1 -+ vadd.i16 q2, q3 -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d4, d0, d0 @ This adds U & V separately -+ vpadd.i32 d5, d0, d0 -+ vrshrn.u16 d0, q2, #5 -+ vrshrn.u16 d1, q2, #5 -+ vrshrn.u16 d2, q2, #5 -+ vrshrn.u16 d3, q2, #5 -+ -+ @ Store -+1: -+ vst1.8 {q0-q1}, [r0], r3 -+ subs r1, #2 -+ vst1.8 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_32_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0-q1}, [r1] -+ mov r1, #32 -+ vld1.8 {q2-q3}, [r2] -+ add r2, r0, r3 -+ vaddl.u8 q0, d0, d1 -+ lsl r3, #1 -+ vaddl.u8 q1, d2, d3 -+ vaddl.u8 q2, d4, d5 -+ vaddl.u8 q3, d6, d7 -+ vadd.i16 q0, q1 -+ vadd.i16 q2, q3 -+ vadd.i16 q0, q2 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d4, d0, d0 @ 1 (all the same) -+ vpadd.i16 d5, d0, d0 -+ vrshrn.u16 d0, q2, #6 -+ vrshrn.u16 d1, q2, #6 -+ vrshrn.u16 d2, q2, #6 -+ vrshrn.u16 d3, q2, #6 -+ -+ @ Store -+1: -+ vst1.8 {q0-q1}, [r0], r3 -+ subs r1, #2 -+ vst1.8 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ----------------------------------------------------------------------------- -+@ -+@ 10 Bit versions -+@ -+@ There is no actual bit depth dependency in this code except that our -+@ intermediate results will overflow the 16 bits they are stored in -+@ All there functions are good to 10 bits - with the worst case being -+@ in dc_32 where we use all 16 bits. -+ -+ -+@ ff_hevc_rpi_pred_dc_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_4_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {d0}, [r1] -+ mov r1, #2 -+ vld1.16 {d1}, [r2] -+T lsl r3, #1 -+ vmov.i16 q2, #3 -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0] -+A lsl r3, #2 -+T lsl r3, #1 -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+ vmov.i64 d7, #0xffff -+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #3 -+ vmla.i16 q0, q2, d6[0] -+ vrshr.u16 q0, #2 -+ -+ @ Store top line -+ vst1.16 {d0}, [r0], r3 -+ -+ @ Store the rest -+ vshr.u64 d3, d1, #1*16 -+ vshr.u64 d4, d1, #2*16 -+ vshr.u64 d5, d1, #3*16 -+ vbif d3, d6, d7 -+ vbif d4, d6, d7 -+ vst1.16 {d3}, [r2], r3 -+ vbif d5, d6, d7 -+ vst1.16 {d4}, [r0] -+ vst1.16 {d5}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels - needs * 4) -+ -+function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0}, [r1] -+ vld1.8 {q1}, [r2] -+A add r2, r0, r3, lsl #2 -+A lsl r3, #3 -+T lsl r3, #2 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vadd.i16 q0, q1 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d2, d0, d0 @ This adds U & V separately -+ vpadd.i32 d3, d0, d0 -+ vrshr.u16 q0, q1, #3 -+ -+ vst1.16 {q0}, [r0], r3 -+ vst1.16 {q0}, [r2], r3 -+ vst1.16 {q0}, [r0] -+ vst1.16 {q0}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_8_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {q0}, [r1] -+ mov r1, #2 -+ vld1.16 {q8}, [r2] -+T lsl r3, #1 -+ vmov.i16 q2, #3 -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] -+A lsl r3, #2 -+T lsl r3, #1 -+ vmov.i64 d7, #0xffff -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+ vadd.i16 d6, d2, d3 @ d6 has 4 vals -+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #4 -+ vmla.i16 q8, q2, d6[0] -+ vmla.i16 q0, q2, d6[0] -+ vdup.16 q2, d6[0] -+ vdup.16 q9, d6[0] -+ vrshr.u16 q8, q8, #2 -+ vrshr.u16 q0, q0, #2 -+ vext.16 q1, q8, q8, #1 -+ -+ @ Store top line -+ vst1.16 {q0}, [r0], r3 -+ -+ @ Store the rest -+ vbit d18, d2, d7 -+ vst1.16 {q9}, [r2], r3 -+ mov r1, #6 -+1: -+ vext.16 q8, q8, q8, #2 -+ subs r1, #2 -+ vext.16 q1, q1, q1, #2 -+ vbit d4, d16, d7 -+ vst1.16 {q2}, [r0], r3 -+ vbit d18, d2, d7 -+ vst1.16 {q9}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels - needs * 4) -+ -+function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {q0-q1}, [r1] -+ mov r1, #8 -+ vld1.16 {q2-q3}, [r2] -+T lsl r3, #2 -+ vadd.i16 q1, q0 -+A add r2, r0, r3, lsl #2 -+A lsl r3, #3 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vadd.i16 q2, q3 -+ vadd.i16 q1, q2 -+ vadd.i16 d3, d2 @ d3 has 2 val pairs -+ vpadd.i32 d2, d3, d3 @ This add U & V separately -+ vpadd.i32 d3, d3, d3 -+ vrshr.u16 q0, q1, #4 -+ vrshr.u16 q1, q1, #4 -+ -+ @ Store -+1: -+ vst1.8 {q0-q1}, [r0], r3 -+ subs r1, #2 -+ vst1.8 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_16_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {q8-q9}, [r1] -+ mov r1, #2 -+ vld1.16 {q10-q11}, [r2] -+ lsl r3, #1 @ stride given in pels -+ vadd.i16 q0, q8, q9 -+ vadd.i16 q1, q10, q11 -+ vmov.i16 q3, #3 -+ vadd.i16 q1, q0 -+ vadd.i16 d0, d16, d20 -+ vmov.i64 d31, #0xffff -+ vadd.i16 d3, d2 -+ vmov.16 d6[0], r1 @ 2, 3, 3, 3... 
-+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ topline[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] -+ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d3, d3 @ 1 (all the same) -+ vrshr.u16 d2, d3, #5 -+ vrshr.u16 d3, d3, #5 -+ vmov q0, q1 -+ vmla.i16 q10, q1, d6[1] -+ vmla.i16 q11, q1, d6[1] -+ vmla.i16 q8, q1, q3 -+ vmla.i16 q9, q1, d6[1] -+ vrshr.u16 q2, q10, #2 -+ vrshr.u16 q3, q11, #2 -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 -+ vext.16 q2, q2, q2, #1 -+ mov r1, #7<<29 -+ -+ @ Store top line -+ vst1.16 {q8-q9}, [r0], r3 -+ -+ @ Store the rest -+1: -+ vbit d0, d4, d31 -+ vext.16 q2, q2, q2, #1 -+ subs r1, #1<<29 -+ vst1.16 {q0-q1}, [r0], r3 -+ bne 1b -+1: -+ vbit d0, d6, d31 -+ vext.16 q3, q3, q3, #1 -+ subs r1, #1<<29 -+ vst1.16 {q0-q1}, [r0], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels - needs * 4) -+ -+function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vldm r1, {q0-q3} -+ vldm r2, {q8-q11} -+ vadd.i16 q0, q1 -+ mov r1, #16 -+ vadd.i16 q2, q3 -+ add r2, r0, #32 -+ vadd.i16 q8, q9 -+ lsl r3, #2 -+ vadd.i16 q10, q11 -+ vadd.u16 q0, q2 -+ vadd.u16 q8, q10 -+ vadd.i16 q0, q8 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d4, d0, d0 @ This adds U & V separately -+ vpadd.i32 d5, d0, d0 -+ vrshr.u16 q0, q2, #5 -+ vrshr.u16 q1, q2, #5 -+ -+ @ Store -+1: -+ vst1.16 {q0-q1}, [r0], r3 -+ subs r1, #1 -+ vst1.16 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels) -+ -+function ff_hevc_rpi_pred_dc_32_neon_10, export=1 -+ -+ @ Average the els of top & left -+ @ With 10 bits we are (just) safe from overflow in i16 -+ vldm r1, {q0-q3} -+ vldm r2, {q8-q11} -+ vadd.i16 q0, q1 -+ mov r1, #32 -+ vadd.i16 q2, q3 -+ add r2, r0, #32 -+ vadd.i16 q8, q9 -+ lsl r3, #1 -+ vadd.i16 q10, q11 -+ vadd.u16 q0, q2 -+ vadd.u16 q8, q10 -+ vadd.i16 q0, q8 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d4, d0, d0 @ 1 (all the same) -+ vpadd.i16 d5, d0, d0 -+ vrshr.u16 q0, q2, #6 -+ vrshr.u16 q1, q2, #6 -+ -+ @ Store -+1: -+ vst1.16 {q0-q1}, [r0], r3 -+ subs r1, #1 -+ vst1.16 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S -new file mode 100644 -index 0000000000..21cd28c709 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S -@@ -0,0 +1,872 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ All functions have the call -+@ -+@ int ff_hevc_rpi_intra_filter_N_neon_PW( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+@ -+@ Assumptions: -+@ (that wouldn't apply to all frame layoouts but do apply to sand, so beware -+@ if reuseing this code) -+@ -+@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for -+@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore -+@ N==8,PW=8 (chroma always PW>8) but have to cope for larger -+@ -+@ We always have at least 64 pixel H frame width rounding - this lets us -+@ load UR widthout having to worry about exactly how many pixels are actually -+@ within the frame. As partial loads will only occur very occasionally this -+@ should be a win in nearly all cases. -+@ -+@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters -+@ so we do no maths on the contents -+@ -+@ No filtering in 32bit fns as they are chroma only -+ -+ -+.equ AVAIL_UR, 1 -+.equ AVAIL_U, 2 -+.equ AVAIL_UL, 4 -+.equ AVAIL_L, 8 -+.equ AVAIL_DL, 16 -+ -+.equ FILTER_LIGHT, 0x40 -+.equ FILTER_STRONG, 0x80 -+ -+.equ AVAIL_S_UR_N_U_C, 32 - 1 -+.equ AVAIL_S_U_N_UL_C, 32 - 2 -+.equ AVAIL_S_UL_N_L_C, 32 - 3 -+.equ AVAIL_S_L_N_DL_C, 32 - 4 -+ -+.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr -+ -+@ On entry -+@ r2 req -+@ r3 avail -+@ [sp, #sp_offset...] args -+@ -+@ On Exit: -+@ -+@ Extend values: -+@ d_l scalar contains value for L & DL -+@ if DL avail then this is is DL[0] so we don't need to load that -+@ d_ul scalar containing value for UL -+@ d_u scalar containing value for U -+@ d_ur scalar containing value for UR -+@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else... -+@ This means that L-light-filter works even if nreq DL (we never filter -+@ req-DL without req-L, but we do filter req-L without req-DL) -+@ If UR avail then d_ur == a_ur so U-filter good too -+@ -+@ Data load pointers (only load if req & avail): -+@ r4 DL + stride -+@ r10 L -+@ r6 U -+@ r5 UR -+@ -+@ Others: -+@ r2 req -+@ r7 req & avail -+@ r3 L + stride -+@ r8 DL + stride * 2 -+@ r9 stride * 2 -+@ cs Load U -+@ mi Load UR -+@ -+@ Clobbered: -+@ r12 -+ -+.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur -+ -+.equ src_l\@, \sp_offset + 0 -+.equ src_u\@, \sp_offset + 4 -+.equ src_ur\@, \sp_offset + 8 -+.equ stride\@, \sp_offset + 12 -+.equ pw\@, (1 << \pw_s) @ pel width in bytes -+.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes -+ -+@ r9 stride -+@ r7 = ab_ul, r6 = a_u, r5 = a_ur -+@ r4 = b_dl, r10 = b_l, r8 = b_u -+ -+ ldr r5, [sp, #src_ur\@] -+ lsl r12, r3, #AVAIL_S_U_DL_CPSR -+ ldr r10, [sp, #src_l\@] -+ ldr r9, [sp, #stride\@] -+ ldr r6, [sp, #src_u\@] -+ -+ @ This is quite a slow instruction but it replaces -+ @ a decent number of tests that yield a max of 2 flags/op -+ @ It is annoying we can't branch on Q! 
-+ @ If L navail (ne) then DL must be navail (pl) -+ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur -+ -+ mov r4, r5 -+ sub r7, r10, r9 -+ it vs -+ movvs r4, r6 -+ add r8, r6, #b_size\@ - pw\@ -+ it cs -+ movcs r4, r7 -+ ite ne -+ movne r10, r4 -+ addeq r4, r7, r9, lsl #\log2_s -+ it cc -+ movcc r7, r10 -+ it mi -+ addmi r4, r10, r9, lsl #\log2_s -+ vld1.\d_type {\d_ul}, [r7] -+ itt vc -+ movvc r8, r7 -+ movvc r6, r7 -+ vld1.\d_type {\d_l }, [r4], r9 -+ tst r3, #AVAIL_UR -+ vld1.\d_type {\d_u }, [r6] -+ it eq -+ moveq r5, r8 -+ and r7, r2, r3 -+ add r8, r4, r9 -+ vld1.\d_type {\d_ur}, [r5] -+ lsls r12, r7, #AVAIL_S_UR_N_U_C -+ add r3, r10, r9 -+ lsl r9, #1 -+.endm -+ -+ -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_8( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 0 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] -+ -+ it cs -+ vldrcs s2, [r6] -+ ite pl -+ vmovpl s3, s4 -+ vldrmi s3, [r5] -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ add r12, r0, #-pw -+ bpl 1f -+ -+ vld1.8 {d0[0]}, [r10], r9 -+ vld1.8 {d0[1]}, [r3], r9 -+ vld1.8 {d0[2]}, [r10] -+ vld1.8 {d0[3]}, [r3] -+1: -+ bcc 1f -+ vld1.8 {d0[5]}, [r4], r9 -+ vld1.8 {d0[6]}, [r8] -+ vld1.8 {d0[7]}, [r4] -+1: -+ vstr d1, [r1] @ Up -+ vst1.8 {d31[7]}, [r12] -+ vstr d0, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_16( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] -+ -+ it cs -+ vldrcs d2, [r6] -+ it mi -+ vldrmi d3, [r5] -+ lsls r7, #AVAIL_S_L_N_DL_C -+ add r12, r0, #-pw -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10] -+ vld1.16 {d0[3]}, [r3] -+1: -+ bcc 1f -+ vld1.16 {d1[1]}, [r4], r9 -+ vld1.16 {d1[2]}, [r8] -+ vld1.16 {d1[3]}, [r4] -+1: -+ vst1.16 {q1}, [r1] @ Up -+ vst1.16 {d31[3]}, [r12] -+ vst1.16 {q0}, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_8( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 0 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+ -+function ff_hevc_rpi_intra_filter_8_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, 
"d0[],d1[]", d31[7], d4[], d5[] -+ -+ it cs -+ vldrcs d4, [r6] -+ it mi -+ vldrmi d5, [r5] -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.8 {d0[0]}, [r10], r9 -+ vld1.8 {d0[1]}, [r3], r9 -+ vld1.8 {d0[2]}, [r10], r9 -+ vld1.8 {d0[3]}, [r3], r9 -+ vld1.8 {d0[4]}, [r10], r9 -+ vld1.8 {d0[5]}, [r3], r9 -+ vld1.8 {d0[6]}, [r10] -+ vld1.8 {d0[7]}, [r3] -+1: -+ bcc 1f -+ vld1.8 {d1[1]}, [r4], r9 -+ vld1.8 {d1[2]}, [r8], r9 -+ vld1.8 {d1[3]}, [r4], r9 -+ vld1.8 {d1[4]}, [r8], r9 -+ vld1.8 {d1[5]}, [r4], r9 -+ vld1.8 {d1[6]}, [r8] -+ vld1.8 {d1[7]}, [r4] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ @ Luma light filter -+ vext.8 q8, q15, q2, #15 -+ vext.8 q12, q15, q0, #15 -+ vaddl.u8 q9, d17, d5 -+ vaddl.u8 q8, d16, d4 -+ vaddl.u8 q13, d25, d1 -+ vaddl.u8 q12, d24, d0 -+ vmov.u8 r3, d5[7] @ Save final pel -+ vmov.u8 r2, d1[7] @ Save final pel -+ -+ vext.16 q2, q8, q9, #1 -+ vext.16 q3, q9, q9, #1 -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q13, #1 -+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q2, q8 -+ vadd.u16 q3, q9 -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ -+ vrshrn.u16 d4, q2, #2 -+ vrshrn.u16 d5, q3, #2 -+ vrshrn.u16 d0, q0, #2 -+ vrshrn.u16 d1, q1, #2 -+ vrshr.u16 d30, #2 -+ vmov.u8 d5[7], r3 @ Restore final pel -+ vmov.u8 d1[7], r2 @ Restore final pel -+ vdup.u8 d31, d30[0] @ d31[3] = d30[0] -+ -+10: -+ vst1.8 {q2 }, [r1] @ Up -+ vst1.8 {d31[7]}, [r12] @ Up-left -+ vst1.8 {q0 }, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_16( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_8_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" -+ -+ it cs -+ vldmcs r6, {d4, d5} -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #4 -+ vldm r5, {d6, d7} -+ bgt 1f -+ vdup.16 d7, d6[3] -+1: -+ lsls r12, r7, #AVAIL_S_L_N_DL_C -+ vdup.16 q1, d0[0] -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10], r9 -+ vld1.16 {d0[3]}, [r3], r9 -+ vld1.16 {d1[0]}, [r10], r9 -+ vld1.16 {d1[1]}, [r3], r9 -+ vld1.16 {d1[2]}, [r10] -+ vld1.16 {d1[3]}, [r3] -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vld1.16 {d2[1]}, [r4], r9 -+ cmp r12, #p_size -+ vld1.16 {d2[2]}, [r8], r9 -+ vld1.16 {d2[3]}, [r4], r9 -+ blt 2f -+ vld1.16 {d3[0]}, [r8], r9 -+ vld1.16 {d3[1]}, [r4], r9 -+ vld1.16 {d3[2]}, [r8] -+ vld1.16 {d3[3]}, [r4] -+ b 1f -+2: -+ vdup.16 d3, d2[3] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ @ Luma light filter -+ vext.16 q9, q2, q3, #7 -+ vext.16 q8, q15, q2, #7 -+ vext.16 q13, q0, q1, #7 -+ vext.16 q12, q15, q0, #7 -+ vadd.u16 q9, q3 -+ vadd.u16 q8, q2 -+ vadd.u16 q13, q1 -+ vadd.u16 q12, q0 -+ vmov.u16 r3, d7[3] @ Save final pel -+ vmov.u16 r2, d3[3] @ Save final pel -+ -+ vext.16 q2, q8, q9, #1 -+ vext.16 q3, q9, q9, #1 -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q13, #1 -+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] -+ 
vadd.u16 q2, q8 -+ vadd.u16 q3, q9 -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ -+ vrshr.u16 q2, #2 -+ vrshr.u16 q3, #2 -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, #2 -+ vrshr.u16 d30, #2 -+ vmov.u16 d7[3], r3 @ Restore final pel -+ vmov.u16 d3[3], r2 @ Restore final pel -+ vdup.u16 d31, d30[0] @ d31[3] = d30[0] -+ -+10: -+ vst1.16 {q2, q3}, [r1] @ Up -+ vst1.16 {d31[3]}, [r12] @ Up-left -+ vst1.16 {q0, q1}, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+@ int ff_hevc_rpi_intra_filter_16_neon_16( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 4 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_16_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" -+ -+ vdup.16 q9, d16[0] -+ vdup.16 q11, d20[0] -+ -+ it cs -+ vldmcs r6, {d16-d19} -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #12 -+ @ Given chroma frame layout, if UR exists then it is always legit to -+ @ load all of it even if most of it is outside the frame. -+ vldm r5, {d20-d23} -+ bgt 1f -+ bge 4f -+ cmp r12, #8 -+ bge 3f -+ vdup.16 d21, d20[3] -+3: vdup.16 d22, d21[3] -+4: vdup.16 d23, d22[3] -+ -+1: -+ lsls r7, #AVAIL_S_L_N_DL_C -+ ldr r12, [sp, #dl_size] -+ vdup.16 q1, d0[0] -+ vdup.16 q2, d0[0] -+ vdup.16 q3, d0[0] -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10], r9 -+ vld1.16 {d0[3]}, [r3], r9 -+ vld1.16 {d1[0]}, [r10], r9 -+ vld1.16 {d1[1]}, [r3], r9 -+ vld1.16 {d1[2]}, [r10], r9 -+ vld1.16 {d1[3]}, [r3], r9 -+ vld1.16 {d2[0]}, [r10], r9 -+ vld1.16 {d2[1]}, [r3], r9 -+ vld1.16 {d2[2]}, [r10], r9 -+ vld1.16 {d2[3]}, [r3], r9 -+ vld1.16 {d3[0]}, [r10], r9 -+ vld1.16 {d3[1]}, [r3], r9 -+ vld1.16 {d3[2]}, [r10] -+ vld1.16 {d3[3]}, [r3] -+1: -+ bcc 1f -+ vld1.16 {d4[1]}, [r4], r9 -+ cmp r12, #4 -+ vld1.16 {d4[2]}, [r8], r9 -+ vld1.16 {d4[3]}, [r4], r9 -+ ble 2f -+ vld1.16 {d5[0]}, [r8], r9 -+ vld1.16 {d5[1]}, [r4], r9 -+ cmp r12, #12 -+ vld1.16 {d5[2]}, [r8], r9 -+ vld1.16 {d5[3]}, [r4], r9 -+ blt 3f -+ vld1.16 {d6[0]}, [r8], r9 -+ vld1.16 {d6[1]}, [r4], r9 -+ vld1.16 {d6[2]}, [r8], r9 -+ vld1.16 {d6[3]}, [r4], r9 -+ ble 4f -+ vld1.16 {d7[0]}, [r8], r9 -+ vld1.16 {d7[1]}, [r4], r9 -+ vld1.16 {d7[2]}, [r8] -+ vld1.16 {d7[3]}, [r4] -+ b 1f -+2: vdup.16 d5, d4[3] -+3: vdup.16 d6, d5[3] -+4: vdup.16 d7, d6[3] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ vpush {q5} -+ @ Luma light filter -+ @ Left -+ vext.16 q5, q2, q3, #7 -+ vext.16 q14, q1, q2, #7 -+ vext.16 q13, q0, q1, #7 -+ vext.16 q12, q15, q0, #7 -+ -+ vadd.u16 q5, q3 -+ vadd.u16 q14, q2 -+ vadd.u16 q13, q1 -+ vadd.u16 q12, q0 -+ vmov.u16 r2, d7[3] @ Save final pel -+ -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q14, #1 -+ vext.16 q2, q14, q5, #1 -+ vext.16 q3, q5, q5, #1 -+ -+ vmov d30, d24 @ d30[0] = l[0] + ul -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ vadd.u16 q2, q14 -+ vadd.u16 q3, q5 -+ -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, #2 -+ vrshr.u16 q2, #2 -+ vrshr.u16 q3, #2 -+ -+ @ Up -+ vext.16 q5, q10, q11, #7 -+ vext.16 q14, q9, q10, #7 
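The "Luma light filter" sequences on either side of this point are the HEVC [1 2 1] reference-sample smoothing; per edge, in scalar form:

#include <stdint.h>

/* Sketch of FILTER_LIGHT over one reference edge of n pels; ul is the
 * top-left pel. The last pel stays unfiltered, matching the
 * "Save final pel" / "Restore final pel" pairs in the assembly. */
static void light_filter_ref(uint16_t *edge, unsigned int n, uint16_t ul)
{
    uint16_t prev = ul;
    for (unsigned int i = 0; i + 1 < n; i++) {
        uint16_t cur = edge[i];
        edge[i] = (prev + 2 * cur + edge[i + 1] + 2) >> 2;
        prev = cur;
    }
}

The corner pel is smoothed across both edges as (left[0] + 2*ul + top[0] + 2) >> 2, which is exactly what d30[0] accumulates above before the final vrshr.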
-+ vext.16 q13, q8, q9, #7 -+ vext.16 q12, q15, q8, #7 -+ -+ vadd.u16 q5, q11 -+ vadd.u16 q14, q10 -+ vadd.u16 q13, q9 -+ vadd.u16 q12, q8 -+ vmov.u16 r3, d23[3] @ Save final pel -+ -+ vext.16 q8, q12, q13, #1 -+ vext.16 q9, q13, q14, #1 -+ vext.16 q10, q14, q5, #1 -+ vext.16 q11, q5, q5, #1 -+ -+ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q8, q12 -+ vadd.u16 q9, q13 -+ vadd.u16 q10, q14 -+ vadd.u16 q11, q5 -+ -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 -+ vrshr.u16 q10, #2 -+ vrshr.u16 q11, #2 -+ -+ @ Misc -+ vrshr.u16 d30, #2 -+ vmov.u16 d7[3], r2 @ Restore final pel -+ vmov.u16 d23[3], r3 @ Restore final pel -+ vdup.u16 d31, d30[0] @ d31[3] = d30[0] -+ vpop {q5} -+ -+10: -+ vstm r1, {d16-d23} @ Up -+ vst1.16 {d31[3]}, [r12] @ Up-left -+ vstm r0, { d0-d7 } @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" -+ -+ it cs -+ vldmcs r6, {d4, d5} -+ it mi -+ vldmmi r5, {d6, d7} -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d0[0] -+ add r12, r0, #-pw -+ bpl 1f -+ vld1.32 {d0[0]}, [r10], r9 -+ vld1.32 {d0[1]}, [r3], r9 -+ vld1.32 {d1[0]}, [r10] -+ vld1.32 {d1[1]}, [r3] -+1: -+ bcc 1f -+ vld1.32 {d2[1]}, [r4], r9 -+ vld1.32 {d3[0]}, [r8] -+ vld1.32 {d3[1]}, [r4] -+1: -+ vst1.32 {q2, q3 }, [r1] @ Up -+ vst1.32 {d31[1]}, [r12] -+ vst1.32 {q0, q1 }, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_8_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" -+ -+ vdup.32 q9, d16[0] -+ vdup.32 q11, d20[0] -+ -+ it cs -+ vldmcs r6, {q8, q9 } -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #p_size -+ vldm r5, {q10, q11} -+ bge 1f -+ vdup.32 q11, d21[1] -+1: -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d0[0] -+ vdup.32 q2, d0[0] -+ vdup.32 q3, d0[0] -+ bpl 1f -+ vld1.32 {d0[0]}, [r10], r9 -+ vld1.32 {d0[1]}, [r3], r9 -+ vld1.32 {d1[0]}, [r10], r9 -+ vld1.32 {d1[1]}, [r3], r9 -+ vld1.32 {d2[0]}, [r10], r9 -+ vld1.32 {d2[1]}, [r3], r9 -+ vld1.32 {d3[0]}, [r10] -+ vld1.32 {d3[1]}, [r3] -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vld1.32 {d4[1]}, [r4], r9 -+ cmp r12, #p_size -+ vld1.32 {d5[0]}, [r8], r9 -+ vld1.32 {d5[1]}, [r4], r9 -+ blt 2f -+ vld1.32 {d6[0]}, [r8], r9 -+ vld1.32 {d6[1]}, [r4], r9 -+ vld1.32 {d7[0]}, 
[r8] -+ vld1.32 {d7[1]}, [r4] -+ b 1f -+2: -+ vdup.32 q3, d5[1] -+1: -+ add r12, r0, #-pw -+ vstm r1, { q8-q11} @ Up -+ vst1.32 {d31[1]}, [r12] -+ vstm r0, { q0-q3 } @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_16_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 4 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_16_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] -+ -+ @ Once we get this big we have run out of neon regs to store -+ @ everything at once so do in pieces -+ -+ @ Up (have) -+ it cs -+ vldmcs r6, { q0-q3 } -+ ldr r12, [sp, #ur_size] -+ it mi -+ vldmmi r5, { q8-q11} -+ it cs -+ vstmcs r1, { q0-q3 } -+ bpl 1f -+ cmp r12, #12 -+ add lr, r1, #(pw << log2_s) -+ bgt 2f -+ cmp r12, #8 -+ bge 3f -+ vdup.16 q9, d17[1] -+4: vdup.16 d10, d19[1] -+3: vdup.16 q11, d21[1] -+2: vstm lr, { q8-q11} -+1: -+ -+ @ Left (have) -+ add lr, r0, #-pw -+ lsls r12, r7, #AVAIL_S_L_N_DL_C -+ vst1.32 {d30[1]}, [lr] @ UL -+ bpl 1f -+ vld1.32 { d0[0]}, [r10], r9 -+ vld1.32 { d0[1]}, [r3], r9 -+ vld1.32 { d1[0]}, [r10], r9 -+ vld1.32 { d1[1]}, [r3], r9 -+ vld1.32 { d2[0]}, [r10], r9 -+ vld1.32 { d2[1]}, [r3], r9 -+ vld1.32 { d3[0]}, [r10], r9 -+ vld1.32 { d3[1]}, [r3], r9 -+ vld1.32 { d4[0]}, [r10], r9 -+ vld1.32 { d4[1]}, [r3], r9 -+ vld1.32 { d5[0]}, [r10], r9 -+ vld1.32 { d5[1]}, [r3], r9 -+ vld1.32 { d6[0]}, [r10], r9 -+ vld1.32 { d6[1]}, [r3], r9 -+ vld1.32 { d7[0]}, [r10] -+ vld1.32 { d7[1]}, [r3] -+ vstm r0, { q0-q3 } -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vdup.32 d16, d30[0] @ d16[0] = d30[0] -+ add lr, r0, #(pw << log2_s) -+ vld1.32 {d16[1]}, [r4], r9 -+ cmp r12, #4 -+ vld1.32 {d17[0]}, [r8], r9 -+ vld1.32 {d17[1]}, [r4], r9 -+ ble 2f -+ vld1.32 {d18[0]}, [r8], r9 -+ vld1.32 {d18[1]}, [r4], r9 -+ cmp r12, #12 -+ vld1.32 {d19[0]}, [r8], r9 -+ vld1.32 {d19[1]}, [r4], r9 -+ blt 3f -+ vld1.32 {d20[0]}, [r8], r9 -+ vld1.32 {d20[1]}, [r4], r9 -+ vld1.32 {d21[0]}, [r8], r9 -+ vld1.32 {d21[1]}, [r4], r9 -+ ble 4f -+ vld1.32 {d22[0]}, [r8], r9 -+ vld1.32 {d22[1]}, [r4], r9 -+ vld1.32 {d23[0]}, [r8] -+ vld1.32 {d23[1]}, [r4] -+ b 5f -+2: vdup.32 q9, d17[1] -+3: vdup.32 q10, d19[1] -+4: vdup.32 q11, d21[1] -+5: vstm lr, { q8-q11} -+1: -+ eors r7, r2 -+ beq 99f -+ -+ lsls r12, r7, #AVAIL_S_UR_N_U_C -+ vdup.32 q0, d31[0] -+ vdup.32 q1, d31[0] -+ vdup.32 q2, d31[0] -+ vdup.32 q3, d31[0] -+ add lr, r1, #(pw << log2_s) -+ vdup.32 q8, d31[1] -+ vdup.32 q9, d31[1] -+ vdup.32 q10, d31[1] -+ vdup.32 q11, d31[1] -+ it cs -+ vstmcs r1, { q0-q3 } -+ it mi -+ vstmmi lr, { q8-q11} -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q0, d30[0] -+ vdup.32 q1, d30[0] -+ vdup.32 q2, d30[0] -+ vdup.32 q3, d30[0] -+ add lr, r0, #(pw << log2_s) -+ it mi -+ vstmmi r0, { q0-q3 } -+ it cs -+ vstmcs lr, { q0-q3 } -+ -+99: -+ pop {r4-r10, pc} -+endfunc -+ -+ -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S -new file mode 100644 -index 0000000000..67192e7213 ---- /dev/null 
-+++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S -@@ -0,0 +1,911 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/* -+ * Horizontal & Vertical special cases of angular intra pred -+ * -+ * Split out because: -+ * Vertical, at least, is relatively common -+ * Much simpler code than the general angular case -+ * Luma with size < 32 has extra filtering that doesn't happen anywhere else -+ * -+ * *** Currently luma filtering is mandatory where it occurs, but there are -+ * cases where it should be turned off (rdpcm & an extension sps flag). -+ * These don't occur in the standard conformance suite for Main Profile -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ ff_hevc_rpi_pred_vertical_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.32 {d0[0]}, [r2 :32] @ Left -+ add r2, r0, r3 -+ vld1.8 {d1[]}, [r1] -+ lsl r3, #1 -+ vdup.8 d4, ip -+ vmov.i8 d2, #128 -+ vhsub.u8 d4, d0, d4 -+ veor d1, d2 -+ vld1.32 {d0[0]}, [r1 :32] @ Top -+ vqadd.s8 d1, d4 -+ vmov.i64 d3, #0xff -+ vmov d4, d0 -+ veor d5, d1, d2 -+ veor d1, d1, d2 -+ vbit d0, d1, d3 -+ vshr.u64 d5, #8 -+ vst1.32 {d0[0]}, [r0], r3 -+ vshr.u64 d1, #16 -+ vbit d4, d5, d3 -+ vshr.u64 d5, #16 -+ vst1.32 {d4[0]}, [r2], r3 -+ vbit d0, d1, d3 -+ vst1.32 {d0[0]}, [r0] -+ vbit d4, d5, d3 -+ vst1.32 {d4[0]}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {d0}, [r2 :64] @ Left -+ vmov.i8 d1, #128 -+ vld1.8 {d2[]}, [r1] -+ vld1.8 {d3}, [r1 :64] @ Top -+ vdup.8 d4, ip -+ vhsub.u8 d4, d0, d4 -+ veor d2, d1 -+ vmov.i64 d0, #0xff -+ mov r1, #8 -+ vqadd.s8 d2, d4, d2 -+ veor d1, d2, d1 -+1: -+ vbit d3, d1, d0 -+ vshr.u64 d1, #8 -+ vst1.8 {d3}, [r0 :64], r3 -+ subs r1, #2 -+ vbit d3, d1, d0 -+ vshr.u64 d1, #8 -+ vst1.8 {d3}, [r0 :64], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {q0}, [r2 :128] @ Left -+ vdup.8 q1, ip -+ vld1.8 {d4[],d5[]}, [r1] -+ vhsub.u8 q0, q1 -+ vmov.i8 q1, #128 -+ veor q2, q1 -+ vmov.i64 d16, #0xff -+ vqadd.s8 q0, q2 -+ vld1.8 {q3}, [r1 :128] @ Top -+ mov r1, #16 -+ veor q0, q1 -+ vmov q1, q3 -+ vext.8 q2, q0, q0, #1 -+1: -+ vbit d2, d0, d16 -+ vbit d6, d4, d16 -+ 
vext.8 q0, q0, q0, #2 -+ subs r1, #2 -+ vst1.8 {q1}, [r0 :128], r3 -+ vext.8 q2, q2, q2, #2 -+ vst1.8 {q3}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vert_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_32_neon_8, export=1 -+ vld1.8 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3 -+ lsl r3, #1 -+ mov r1, #16 -+1: -+ vst1.8 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.8 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1 -+ vld1.16 {d0 }, [r1 :64] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ -+ vst1.16 {d0 }, [r0 :64], r3 -+ vst1.16 {d0 }, [r2 :64], r3 -+ vst1.16 {d0 }, [r0 :64] -+ vst1.16 {d0 }, [r2 :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1 -+ vld1.16 {q0 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ mov r1, #4 -+1: -+ vst1.16 {q0 }, [r0 :128], r3 -+ subs r1, #2 -+ vst1.16 {q0 }, [r2 :128], r3 -+ vst1.16 {q0 }, [r0 :128], r3 -+ vst1.16 {q0 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1 -+ vld1.16 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ mov r1, #8 -+1: -+ vst1.16 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontalal_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+@ ? Might be faster as simple arm -+ -+function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.32 {d0[0]}, [r1 :32] @ Top -+ add r1, r2, #3 -+ vld1.8 {d1[]}, [r2]! -+ vdup.8 d2, ip -+ vmov.i8 d3, #128 -+ vhsub.u8 d0, d2 -+ veor d1, d3 -+ vld1.8 {d2[]}, [r2]! -+ add ip, r0, r3 -+ vqadd.s8 d0, d0, d1 -+ lsl r3, #1 -+ vld1.8 {d1[]}, [r2] -+ vld1.8 {d4[]}, [r1] -+ veor d0, d3 -+ vst1.32 {d0[0]}, [r0 :32], r3 -+ vst1.32 {d2[0]}, [ip :32], r3 -+ vst1.32 {d1[0]}, [r0 :32] -+ vst1.32 {d4[0]}, [ip :32] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {d0}, [r1 :64] @ Top -+ vmov.i8 d1, #128 -+ vld1.8 {d2[]}, [r2]! -+ mov r1, #8-2 -+ vdup.8 d3, ip -+ vhsub.u8 d0, d3 -+ veor d2, d1 -+ vqadd.s8 d0, d2 -+ vld1.8 {d2[]}, [r2]! -+ veor d0, d1 -+ vst1.8 {d0}, [r0], r3 -+1: -+ vld1.8 {d0[]}, [r2]! -+ subs r1, #2 -+ vst1.8 {d2}, [r0 :64], r3 -+ vld1.8 {d2[]}, [r2]! 
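Aside on the vhsub.u8 / veor #128 / vqadd.s8 sequences used by the vertical and horizontal predictors in this file: per the header comment, luma blocks below 32 fold the other edge into the first column (or row) with a halved difference, clipped to the sample range; the XOR-128 biasing lets the clip be done as a signed saturating add without leaving 8-bit registers. A scalar sketch of the vertical case, with a hypothetical helper name:

#include <stddef.h>
#include <stdint.h>

/* Vertical mode with the size<32 luma fix-up (sketch, 8-bit).
 * topleft is p[-1][-1]. */
static void vertical_pred_ref(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *top, const uint8_t *left,
                              uint8_t topleft, unsigned int size, int filter)
{
    for (unsigned int y = 0; y != size; y++)
        for (unsigned int x = 0; x != size; x++)
            dst[y * stride + x] = top[x];     /* copy top row down */

    if (filter) {                             /* luma, size < 32 */
        for (unsigned int y = 0; y != size; y++) {
            int v = top[0] + ((left[y] - topleft) >> 1);
            dst[y * stride] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }
}

Horizontal mode is the transpose: each row is a broadcast of left[y], with the same fix-up applied along the top row from top[] instead.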
-+ vst1.8 {d0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.8 {d2}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {q0}, [r1 :64] @ Top -+ mov r1, #16-2 -+ vld1.8 {d4[],d5[]}, [r2]! -+ vdup.8 q3, ip -+ vhsub.u8 q0, q3 -+ vmov.i8 q1, #128 -+ veor q2, q1 -+ vqadd.s8 q0, q2 -+ vld1.8 {d4[],d5[]}, [r2]! -+ veor q0, q1 -+ vst1.8 {q0}, [r0], r3 -+1: -+ vld1.8 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.8 {q2}, [r0 :64], r3 -+ vld1.8 {d4[],d5[]}, [r2]! -+ vst1.8 {q0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.8 {q2}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 -+ vld1.8 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ mov r1, #32-2 -+ vld1.8 {d2[],d3[]}, [r2]! -+ vst1.8 {q0}, [r0 :128], r3 -+ vst1.8 {q0}, [ip :128], r3 -+1: -+ vld1.8 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.8 {q1}, [r0 :128], r3 -+ vst1.8 {q1}, [ip :128], r3 -+ vld1.8 {d2[],d3[]}, [r2]! -+ vst1.8 {q0}, [r0 :128], r3 -+ vst1.8 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.8 {q1}, [r0 :128] -+ vst1.8 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 -+ add r1, r2, #2 -+ vld1.16 {d0[]}, [r2] -+ add r2, #4 -+ vld1.16 {d1[]}, [r1] -+ add r1, #4 -+ vld1.16 {d2[]}, [r2] -+A add r2, r0, r3, lsl #1 -+T lsl r3, #1 -+T add r2, r0, r3 -+ vld1.16 {d3[]}, [r1] -+A lsl r3, #2 -+T lsl r3, #1 -+ vst1.16 {d0}, [r0 :64], r3 -+ vst1.16 {d1}, [r2 :64], r3 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [r2 :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ lsl r3, #1 -+ vld1.16 {d2[],d3[]}, [r2]! -+ mov r1, #8-2 -+ vst1.16 {q0}, [r0 :64], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :64], r3 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ lsl r3, #1 -+ add ip, r0, #16 -+ mov r1, #16-2 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], r3 -+ vst1.16 {q1}, [ip :128], r3 -+ vld1.16 {d2[],d3[]}, [r2]! 
-+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :128] -+ vst1.16 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ 10 Bit -+@ Has clipping constants so 10-bit only but could easily be macroed up to -+@ 14-bit before we run out of bits -+ -+ -+@ ff_hevc_rpi_pred_vertical_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {d0}, [r2 :64] @ Left -+ vmov.i16 d2, #0 -+ vld1.16 {d1[]}, [r1] -+T lsl r3, #1 -+ vdup.16 d4, ip -+ vmov.i16 d3, #0x3ff -+ vld1.16 {d5}, [r1 :64] @ Top -+ vhsub.u16 d4, d0, d4 -+ vmov.i64 d0, #0xffff -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vadd.i16 d1, d1, d4 -+ vmov d6, d5 -+ vmax.s16 d1, d1, d2 -+ vmin.s16 d2, d1, d3 -+ vmin.s16 d1, d1, d3 -+ vbit d5, d1, d0 -+A lsl r3, #2 -+T lsl r3, #1 -+ vshr.u64 d2, #16 -+ vshr.u64 d1, #32 -+ vbit d6, d2, d0 -+ vst1.16 {d5}, [r0], r3 -+ vshr.u64 d2, #32 -+ vst1.16 {d6}, [r2], r3 -+ vbit d5, d1, d0 -+ vst1.16 {d5}, [r0] -+ vbit d6, d2, d0 -+ vst1.16 {d6}, [r2] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_8_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0}, [r2 :128] @ Left -+ lsl r3, #1 -+ vdup.16 q1, ip -+ vld1.16 {d4[],d5[]}, [r1] -+ vhsub.u16 q0, q0, q1 -+ vmov.i16 q1, #0 -+ vadd.i16 q0, q2 -+ vmov.i16 q2, #0x3ff -+ vld1.16 {q3}, [r1 :128] @ Top -+ mov r1, #8 -+ vmax.s16 q0, q1 -+ vmov q1, q3 -+ vmin.s16 q0, q2 -+ vmov.i64 d16, #0xffff -+ vext.16 q2, q0, q0, #1 -+1: -+ vbit d2, d0, d16 -+ vbit d6, d4, d16 -+ vext.16 q0, q0, q0, #2 -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], r3 -+ vext.16 q2, q2, q2, #2 -+ vst1.16 {q3}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0-q1}, [r2 :128] @ Left -+T lsl r3, #1 -+ vdup.16 q2, ip -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vld1.16 {d6[],d7[]}, [r1] -+A lsl r3, #2 -+T lsl r3, #1 -+ vhsub.u16 q0, q2 -+ vhsub.u16 q1, q2 -+ vadd.i16 q0, q3 -+ vadd.i16 q1, q3 -+ vmov.i16 q2, #0 -+ vld1.16 {q8-q9}, [r1 :128] @ Top -+ mov r1, #0 -+ vmov.i16 q3, #0x3ff -+ vmax.s16 q0, q2 -+ vmax.s16 q1, q2 -+ vmin.s16 q0, q3 -+ vmin.s16 q1, q3 -+ vmov q10, q8 -+ vmov q11, q9 -+ vext.16 q2, q0, q1, #1 -+ vext.16 q3, q1, q1, #1 -+ vmov.i64 d24, #0xffff -+1: -+ vbit d16, d0, d24 -+ vbit d20, d4, d24 -+ vext.16 q0, q0, q0, #2 -+ subs r1, #1<<30 -+ vst1.16 {q8-q9}, [r0 :128], r3 -+ vext.16 q2, q2, q2, #2 -+ vst1.16 {q10-q11}, [r2 :128], r3 -+ bne 1b -+1: -+ vbit d16, d2, d24 -+ vbit d20, d6, d24 -+ vext.16 q1, q1, q1, #2 -+ subs r1, #1<<30 -+ vst1.16 {q8-q9}, [r0 :128], r3 -+ vext.16 q3, q3, q3, #2 -+ vst1.16 {q10-q11}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 -+ vldm r1, { q0-q3 } @ Up -+ lsl r3, #1 -+ mov r1, #32 -+ add r2, r0, #32 -+1: -+ 
vst1.16 {q0-q1}, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q2-q3}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1 -+ vld1.16 {q0 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #2 -+ lsl r3, #3 -+ -+ vst1.16 {q0 }, [r0 :128], r3 -+ vst1.16 {q0 }, [r2 :128], r3 -+ vst1.16 {q0 }, [r0 :128] -+ vst1.16 {q0 }, [r2 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1 -+ vld1.16 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #2 -+ lsl r3, #3 -+ mov r1, #4 -+1: -+ vst1.16 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1 -+ vldm r1, { q0-q3 } @ Up -+ lsl r3, #2 -+ mov r1, #16 -+ add r2, r0, #32 -+1: -+ vst1.16 {q0-q1}, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q2-q3}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+@ ff_hevc_rpi_pred_horizontal_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {d0}, [r1 :64] @ Top -+ vmov.i16 d1, #0 -+ vld1.16 {d2[]}, [r2]! -+T lsl r3, #1 -+ vdup.16 d3, ip -+ vmov.i16 d4, #0x3ff -+ vhsub.u16 d0, d3 -+A add ip, r0, r3, lsl #1 -+T add ip, r0, r3 -+ vld1.16 {d3[]}, [r2]! -+A lsl r3, #2 -+T lsl r3, #1 -+ vadd.i16 d0, d2 -+ vld1.16 {d2[]}, [r2]! -+ vmax.s16 d0, d1 -+ vld1.16 {d1[]}, [r2] -+ vmin.s16 d0, d4 -+ vst1.16 {d0}, [r0 :64], r3 -+ vst1.16 {d3}, [ip :64], r3 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d1}, [ip :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0}, [r1 :128] @ Top -+ lsl r3, #1 -+ vdup.16 q1, ip -+ mov r1, #8-2 -+ vhsub.u16 q0, q1 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vmov.i16 q2, #0 -+ vadd.i16 q0, q1 -+ vmov.i16 q1, #0x3ff -+ vmax.s16 q0, q2 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vmin.s16 q0, q1 -+ vst1.16 {q0}, [r0 :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q2}, [r0 :128], r3 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ bne 1b -+ -+ vst1.16 {q2}, [r0 :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontalal_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0-q1}, [r1 :128] @ Top -+ lsl r3, #1 -+ vdup.16 q2, ip -+ add ip, r0, r3 -+ vhsub.u16 q0, q2 -+ add ip, #16 -+ vhsub.u16 q1, q2 -+ mov r1, #16-2 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vmov.i16 q3, #0 -+ vadd.u16 q0, q2 -+ vadd.i16 q1, q2 -+ vmov.i16 q2, #0x3ff -+ vmax.s16 q0, q3 -+ vmax.s16 q1, q3 -+ vld1.16 {d6[],d7[]}, [r2]! 
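The vmax.s16 #0 / vmin.s16 #0x3ff pair that brackets this point is the 10-bit clip flagged at the top of this section; generalised to the 14-bit limit the comment mentions, it is simply:

#include <stdint.h>

/* Sample-range clip for bit depths up to 14 (sketch; the assembly
 * hard-codes bit_depth = 10 as the 0x3ff constant). */
static inline uint16_t clip_bd(int v, unsigned int bit_depth)
{
    const int max = (1 << bit_depth) - 1;
    return v < 0 ? 0 : v > max ? max : (uint16_t)v;
}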
-+ vmin.s16 q0, q2 -+ vmin.s16 q1, q2 -+ vst1.16 {q0-q1}, [r0 :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q3}, [r0 :128], r3 -+ vst1.16 {q3}, [ip :128], r3 -+ vld1.16 {d6[],d7[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q3}, [r0 :128] -+ vst1.16 {q3}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ push {lr} -+ mov lr, #32 -+ vld1.16 {d2[],d3[]}, [r2]! -+ lsl r3, #1 -+ vst1.16 {q0}, [r0 :128], lr -+ sub r3, #32 -+ vst1.16 {q0}, [ip :128], lr -+ mov r1, #32-2 -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], lr -+ vst1.16 {q1}, [ip :128], lr -+ vst1.16 {q1}, [r0 :128], r3 -+ vst1.16 {q1}, [ip :128], r3 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], lr -+ vst1.16 {q0}, [ip :128], lr -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :128], lr -+ vst1.16 {q1}, [ip :128], lr -+ vst1.16 {q1}, [r0 :128] -+ vst1.16 {q1}, [ip :128] -+ pop {pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 -+ add r1, r2, #4 -+ vld1.32 {d0[],d1[]}, [r2] -+ add r2, #8 -+ vld1.32 {d2[],d3[]}, [r1] -+ add r1, #8 -+ vld1.32 {d4[],d5[]}, [r2] -+A add r2, r0, r3, lsl #2 -+T lsl r3, #2 -+T add r2, r0, r3 -+ vld1.32 {d6[],d7[]}, [r1] -+A lsl r3, #3 -+T lsl r3, #1 -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q1}, [r2 :128], r3 -+ vst1.32 {q2}, [r0 :128] -+ vst1.32 {q3}, [r2 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 -+ vld1.32 {d0[],d1[]}, [r2]! -+ lsl r3, #2 -+ add ip, r0, #16 -+ mov r1, #8-2 -+ vld1.32 {d2[],d3[]}, [r2]! -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+1: -+ vld1.32 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.32 {q1}, [r0 :128], r3 -+ vst1.32 {q1}, [ip :128], r3 -+ vld1.32 {d2[],d3[]}, [r2]! -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.32 {q1}, [r0 :128] -+ vst1.32 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 -+ vld1.32 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ push {lr} -+ mov lr, #32 -+ vld1.32 {d2[],d3[]}, [r2]! -+ lsl r3, #2 -+ vst1.32 {q0}, [r0 :128], lr -+ sub r3, #32 -+ vst1.32 {q0}, [ip :128], lr -+ mov r1, #16-2 -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+1: -+ vld1.32 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.32 {q1}, [r0 :128], lr -+ vst1.32 {q1}, [ip :128], lr -+ vst1.32 {q1}, [r0 :128], r3 -+ vst1.32 {q1}, [ip :128], r3 -+ vld1.32 {d2[],d3[]}, [r2]! 
-+ vst1.32 {q0}, [r0 :128], lr -+ vst1.32 {q0}, [ip :128], lr -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.32 {q1}, [r0 :128], lr -+ vst1.32 {q1}, [ip :128], lr -+ vst1.32 {q1}, [r0 :128] -+ vst1.32 {q1}, [ip :128] -+ pop {pc} -+endfunc -+ -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S -new file mode 100644 -index 0000000000..e35896a102 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S -@@ -0,0 +1,1034 @@ -+/* -+ * Copyright (c) 2017 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ Planar intra pred (8.4.4.2.4) -+@ -+@ predSamples[ x ][ y ] = -+@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] + -+@ ( x + 1 ) * p[ nTbS ][ -1 ] + -+@ ( nTbS - 1 - y ) * p[ x ][ -1 ] + -+@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 ) -+ -+@ All 10-bit functions would work with 9 -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_4_neon_8, export=1 -+ -+ vld1.8 {d0}, [r1] @ Top -+ adr ip, nb_3_0_1_4 -+ vld1.8 {d1}, [r2] @ Left -+ vmov.i64 d2, #0xffffffff -+ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4} -+ add r1, r0, r3 -+ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3} -+ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4} -+ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4} -+ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0} -+ vshll.u8 q8, d4, #2 -+ lsl r3, #1 -+ vsubl.u8 q2, d5, d4 -+ vmlal.u8 q8, d0, d3 -+ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0} -+ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1} -+ vshl.s16 q9, q2, #1 -+ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1} -+ vadd.i16 d16, d4 -+ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2} -+ vadd.i16 d17, d18 -+ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3} -+ vadd.i16 q2, q8, q9 -+ vmlal.u8 q8, d0, d6 -+ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3} -+ vmlal.u8 q2, d0, d7 -+ vrshrn.i16 d0, q8, #3 -+ vst1.32 d0[0], [r0 :32], r3 -+ vst1.32 d0[1], [r1 :32], r3 -+ vrshrn.i16 d0, q2, #3 -+ vst1.32 d0[0], [r0 :32] -+ vst1.32 d0[1], [r1 :32] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_4_neon_10, export=1 -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0}, [r1 :64] @ Top -+ adr ip, nbh_3_0_1_4 -+ vldr d2, [r2, #8] @ Left (lower) -+ vldr d3, [ip, #8] @ {1,2,3,4} -+T lsl r3, #1 -+ vshl.s16 d4, d0, #2 -+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4} -+ vldr d5, [r2] @ Left (upper) -+ 
vdup.16 d2, d2[0] @ {l4,l4,l4,l4} -+ vldr d6, [ip] @ {3,2,1,0} -+ vmla.i16 d4, d3, d1 @ Acc set up -+ vsub.i16 d0, d2, d0 @ Add set up -+ vmov d7, d6 -+ vdup.16 d2, d5[0] -+ vdup.16 d3, d5[1] -+ vdup.16 d16, d5[2] -+ vadd.i16 d18, d0, d4 -+ vshl.s16 d0, #1 @ x2 -+ vadd.i16 d19, d0, d4 -+ vdup.16 d17, d5[3] -+ vadd.i16 d4, d0, d18 -+A add r1, r0, r3, lsl #1 -+T add r1, r0, r3 -+ vadd.i16 d5, d0, d19 -+A lsl r3, #2 -+T lsl r3, #1 -+ vmla.i16 q9, q1, q3 -+ vmla.i16 q2, q8, q3 -+ vrshr.u16 q0, q9, #3 -+ vst1.16 {d0}, [r0], r3 -+ vrshr.u16 d2, d4, #3 -+ vst1.16 {d1}, [r1], r3 -+ vrshr.u16 d3, d5, #3 -+ vst1.16 {d2}, [r0] -+ vst1.16 {d3}, [r1] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_8_neon_8, export=1 -+ -+ vld1.8 {q0}, [r1] @ Top -+ adr ip, nb_7_0_1_8 -+ vldr d2, [r2, #8] @ Left (lower) -+ mov r1, #8 -+ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8} -+ vshll.u8 q2, d0, #3 -+ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8} -+ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8} -+ vldr d6, [r2] @ Left (upper) -+ vmlal.u8 q2, d3, d1 -+ vsubl.u8 q0, d2, d0 -+ vldr d7, [ip] @ {7,6,5,4,3,2,1,0} -+ -+@ u8 7..0 [1] d7 -+@ u8 left[y] [1] d6 -+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vdup.8 d2, d6[0] -+ vadd.i16 q2, q0 -+ vdup.8 d3, d6[1] -+ vadd.i16 q8, q2, q0 -+1: -+ vmlal.u8 q2, d7, d2 -+ subs r1, #2 -+ vadd.i16 q9, q8, q0 -+ vmlal.u8 q8, d7, d3 -+ vdup.8 d2, d6[2] -+ vdup.8 d3, d6[3] -+ vrshrn.i16 d20, q2, #4 -+ vshr.u64 d6, #16 -+ vmov q2, q9 -+ vst1.8 {d20}, [r0], r3 -+ vrshrn.i16 d20, q8, #4 -+ vadd.i16 q8, q2, q0 -+ vst1.8 {d20}, [r0], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_8_neon_10, export=1 -+ -+ adr ip, nb_7_0_1_8 -+ vld1.16 {q0}, [r1 :128]! @ Top (left) -+ lsl r3, #1 -+ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8} -+ add ip, r2, #16 -+ vld1.16 {d4[],d5[]}, [r1] @ Top (right) -+ mov r1, #8-2 -+ vshl.s16 q3, q0, #3 -+ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8} -+ vld1.16 {d18[],d19[]}, [ip] @ Left (lower) -+ vmla.i16 q3, q8, q2 @ Acc set up -+ vsub.i16 q0, q9, q0 @ Add set up -+ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0} -+ vadd.i16 q2, q3, q0 -+ -+@ u16 7..0 [1] q1 -+@ u32 left[y] [1] [r2] -+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.16 {d6[],d7[]}, [r2]! -+ vadd.i16 q8, q2, q0 -+ vld1.16 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+1: -+ vrshr.u16 q9, q2, #4 -+ subs r1, #2 -+ vmov q2, q3 -+ vrshr.u16 q10, q8, #4 -+ vld1.16 {d6[],d7[]}, [r2]! -+ vst1.16 {q9}, [r0 :128], r3 -+ vadd.i16 q8, q2, q0 -+ vld1.16 {d18[],d19[]}, [r2]! 
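The formula quoted at the head of this file transcribes directly to C. In the sketch below, top[size] and left[size] stand for p[nTbS][-1] and p[-1][nTbS], the "Top (right)" and "Left (lower)" pels these functions load just past the main reference arrays:

#include <stddef.h>
#include <stdint.h>

/* Literal scalar form of the planar prediction (8.4.4.2.4); the NEON
 * code computes the same values via the acc/add refactoring described
 * in its register-map comments. Sketch only. */
static void planar_pred_ref(uint16_t *dst, ptrdiff_t stride,
                            const uint16_t *top, const uint16_t *left,
                            unsigned int size, unsigned int log2_size)
{
    for (unsigned int y = 0; y != size; y++)
        for (unsigned int x = 0; x != size; x++)
            dst[y * stride + x] =
                ((size - 1 - x) * left[y] + (x + 1) * top[size] +
                 (size - 1 - y) * top[x] + (y + 1) * left[size] +
                 size) >> (log2_size + 1);
}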
-+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ vst1.16 {q10}, [r0 :128], r3 -+ bne 1b -+ -+ vrshr.u16 q9, q2, #4 -+ add r3, r0 -+ vrshr.u16 q10, q8, #4 -+ vst1.16 {q9}, [r0 :128] -+ vst1.16 {q10}, [r3 :128] -+ -+ bx lr -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ Data - has to be in two lumps to ensure we can always reach using adr -+ -+ .balign 64 -+ -+nb_31_0_1_32: -+ .byte 31, 30, 29, 28, 27, 26, 25, 24 -+ .byte 23, 22, 21, 20, 19, 18, 17, 16 -+nb_15_0_1_16: -+ .byte 15, 14, 13, 12, 11, 10, 9, 8 -+ .byte 7, 6, 5, 4, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 5, 6, 7, 8 -+ .byte 9, 10, 11, 12, 13, 14, 15, 16 -+ .byte 17, 18, 19, 20, 21, 22, 23, 24 -+ .byte 25, 26, 27, 28, 29, 30, 31, 32 -+ -+ @ should be back on a 64-byte boundary here -+ -+ @ These could be extracted from the above array, but separate out -+ @ out for better (16 byte) alignment -+nb_3_0_1_4: -+ .byte 3, 2, 1, 0, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 1, 2, 3, 4 -+nb_7_0_1_8: -+ .byte 7, 6, 5, 4, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 5, 6, 7, 8 -+nbh_3_0_1_4: -+ .short 3, 2, 1, 0, 1, 2, 3, 4 -+ -+@------------------------------------------------------------------------------ -+ -+ -+@ ff_hevc_rpi_pred_planar_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_16_neon_8, export=1 -+ -+ adr ip, nb_15_0_1_16 + 16 -+ vld1.8 {q0}, [r1 :128]! @ Top (left) -+ add r2, #16 -+ vld1.8 {q1}, [ip: 128] @ {1,2,3...16} -+ vld1.8 {d4[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vshll.u8 q3, d0, #4 -+ mov r1, #16 -+ vshll.u8 q8, d1, #4 -+ vld1.8 {d5[]}, [r2] @ Left (lower) -+ sub r2, #16 -+ vmlal.u8 q3, d2, d4 -+ vmlal.u8 q8, d3, d4 @ Acc set up -+ vsubl.u8 q1, d5, d0 -+ vsubl.u8 q0, d5, d1 @ Add set up -+ vld1.8 {q2}, [ip :128] @ {15,14,13...0} -+ -+@ u8 15..0 [1] q2 -+@ u8 left[y] [1] [r2] -+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q3, q1 -+ vadd.i16 q8, q0 -+1: -+ vadd.i16 q10, q3, q1 -+ subs r1, #2 -+ vld1.8 {d18[]}, [r2]! -+ vadd.i16 q11, q8, q0 -+ vld1.8 {d19[]}, [r2]! -+ vmlal.u8 q3, d4, d18 -+ vmlal.u8 q8, d5, d18 -+ vadd.i16 q12, q10, q1 -+ vmlal.u8 q10, d4, d19 -+ vadd.i16 q13, q11, q0 -+ vmlal.u8 q11, d5, d19 -+ vrshrn.u16 d18, q3, #5 -+ vrshrn.u16 d19, q8, #5 -+ vmov q3, q12 -+ vst1.8 {q9}, [r0 :128], r3 -+ vrshrn.u16 d18, q10, #5 -+ vrshrn.u16 d19, q11, #5 -+ vmov q8, q13 -+ vst1.8 {q9}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_16_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ adr ip, nb_15_0_1_16 + 16 -+ vld1.16 {q0-q1}, [r1 :128]! 
@ Top (left) -+ add r2, #32 -+ vld1.8 {q2}, [ip :128] @ {1,2,3...16} -+ lsl r3, #1 -+ vld1.16 {d6[],d7[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vmovl.u8 q8, d4 -+ mov r1, #16 -+ vshl.i16 q9, q0, #4 -+ vmovl.u8 q2, d5 -+ vshl.i16 q10, q1, #4 -+ vld1.16 {d22[],d23[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vld1.8 {q12}, [ip] @ {15,14,13...0} -+ vmla.i16 q9, q8, q3 -+ vmla.i16 q10, q2, q3 @ Acc set up -+ vsub.i16 q0, q11, q0 -+ vsub.i16 q1, q11, q1 @ Add set up -+ vadd.i16 q2, q9, q0 -+ vadd.i16 q3, q10, q1 -+ vmovl.u8 q8, d24 -+ vmovl.u8 q9, d25 -+ -+@ u16 15..0 [2] q8,q9 -+@ u32 left[y] [2] [r2] -+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] -+ -+1: -+ vadd.i16 q10, q2, q0 -+ subs r1, #2 -+ vld1.16 {d24[],d25[]}, [r2]! -+ vadd.i16 q11, q3, q1 -+ vld1.16 {d28[],d29[]}, [r2]! -+ vmla.i16 q2, q8, q12 -+ vmla.i16 q3, q9, q12 -+ vadd.i16 q12, q10, q0 -+ vmla.i16 q10, q8, q14 -+ vadd.i16 q13, q11, q1 -+ vmla.i16 q11, q9, q14 -+ vrshr.u16 q14, q2, #5 -+ vrshr.u16 q15, q3, #5 -+ vmov q2, q12 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ vrshr.u16 q14, q10, #5 -+ vrshr.u16 q15, q11, #5 -+ vmov q3, q13 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_32_neon_8, export=1 -+ -+ vld1.8 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nb_31_0_1_32 + 32 -+ vpush {d8-d12} -+ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32} -+ add r2, #32 -+ vld1.8 {d8[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vshll.u8 q8, d0, #5 -+ mov r1, #32 -+ vld1.8 {d9[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vshll.u8 q9, d1, #5 -+ vshll.u8 q10, d2, #5 -+ vshll.u8 q11, d3, #5 -+ vmlal.u8 q8, d4, d8 -+ vsubl.u8 q12, d9, d0 -+ vmlal.u8 q9, d5, d8 -+ vsubl.u8 q13, d9, d1 -+ vmlal.u8 q10, d6, d8 -+ vsubl.u8 q14, d9, d2 -+ vmlal.u8 q11, d7, d8 @ Acc set up -+ vsubl.u8 q15, d9, d3 @ Add set up -+ vadd.i16 q8, q12 -+ vadd.i16 q9, q13 -+ vadd.i16 q10, q14 -+ vadd.i16 q11, q15 -+ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0} -+ -+@ u8 31..0 [2] q4,q5 -+@ u8 left[y] [2] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.8 {d12[]}, [r2]! -+ vadd.i16 q0, q8, q12 -+ b 2f -+1: -+ vld1.8 {d12[]}, [r2]! -+ vrshrn.u16 d3, q1, #6 -+ vrshrn.u16 d2, q0, #6 -+ vadd.i16 q0, q8, q12 -+ vrshrn.u16 d4, q2, #6 -+ vrshrn.u16 d5, q3, #6 -+ vst1.8 {q1-q2}, [r0 :128], r3 -+2: vadd.i16 q1, q9, q13 -+ subs r1, #2 -+ vadd.i16 q2, q10, q14 -+ vadd.i16 q3, q11, q15 -+ vmlal.u8 q8, d8, d12 -+ vmlal.u8 q9, d9, d12 -+ vmlal.u8 q10, d10, d12 -+ vmlal.u8 q11, d11, d12 -+ vld1.8 {d12[]}, [r2]! 
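The "acc"/"add" register maps in these planar loops encode a row recurrence: the two y-dependent terms of the formula change by the constant p[-1][nTbS] - p[x][-1] from one row to the next, so each row needs only one multiply-accumulate against the (size-1-x) weight table plus an add. (The "32*p[x][-1]" wording in the copied comments is the 32-pel case; the initial vshll/vshl shift of log2(size) is what actually scales it.) Scalar equivalent, sketch:

#include <stddef.h>
#include <stdint.h>

/* Row recurrence behind the planar loops: advance acc by a constant
 * per row, multiply in only the left[y] term. Sketch. */
static void planar_pred_rows_ref(uint16_t *dst, ptrdiff_t stride,
                                 const uint16_t *top, const uint16_t *left,
                                 unsigned int size, unsigned int log2_size)
{
    for (unsigned int x = 0; x != size; x++) {
        int acc = (x + 1) * top[size] + (top[x] << log2_size);
        const int add = (int)left[size] - (int)top[x];
        for (unsigned int y = 0; y != size; y++) {
            acc += add;  /* = (x+1)*TR + (size-1-y)*top[x] + (y+1)*BL */
            dst[y * stride + x] =
                (acc + (size - 1 - x) * left[y] + size) >> (log2_size + 1);
        }
    }
}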
-+ vrshrn.u16 d19, q9, #6 -+ vrshrn.u16 d18, q8, #6 -+ vadd.i16 q8, q0, q12 -+ vrshrn.u16 d20, q10, #6 -+ vrshrn.u16 d21, q11, #6 -+ vst1.8 {q9-q10}, [r0 :128], r3 -+ vadd.i16 q9, q1, q13 -+ vadd.i16 q10, q2, q14 -+ vadd.i16 q11, q3, q15 -+ vmlal.u8 q0, d8, d12 -+ vmlal.u8 q1, d9, d12 -+ vmlal.u8 q2, d10, d12 -+ vmlal.u8 q3, d11, d12 -+ -+ bne 1b -+ -+ vpop {d8-d12} -+ -+ vrshrn.u16 d3, q1, #6 -+ vrshrn.u16 d2, q0, #6 -+ vrshrn.u16 d4, q2, #6 -+ vrshrn.u16 d5, q3, #6 -+ vst1.8 {q1-q2}, [r0 :128] -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_32_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nb_31_0_1_32 + 32 -+ vpush {q4-q7} -+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) -+ add r2, #64 -+ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32} -+T lsl r3, #1 -+ vld1.16 {d8[],d9[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vmovl.u8 q12, d28 -+ mov r1, #32 -+ vmovl.u8 q13, d29 -+ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0} -+ vmovl.u8 q14, d30 -+ vmovl.u8 q15, d31 -+ vld1.16 {d10[],d11[]}, [r2] @ Left (lower) -+ sub r2, #64 -+ vshl.i16 q8, q0, #5 -+ vshl.i16 q9, q1, #5 -+ vshl.i16 q10, q2, #5 -+ vshl.i16 q11, q3, #5 -+ vmla.i16 q8, q12, q4 -+ vsub.i16 q0, q5, q0 -+ vmla.i16 q9, q13, q4 -+ vsub.i16 q1, q5, q1 -+ vmla.i16 q10, q14, q4 -+ vmov.u16 ip, d0[0] -+ vsub.i16 q2, q5, q2 -+ vmla.i16 q11, q15, q4 @ Acc set up -+ vsub.i16 q3, q5, q3 @ Add set up -+ vadd.i16 q8, q0 -+ vadd.i16 q9, q1 -+ vadd.i16 q10, q2 -+ vadd.i16 q11, q3 -+ vmovl.u8 q4, d12 -+ vmovl.u8 q5, d13 -+ vmovl.u8 q6, d14 -+ vmovl.u8 q7, d15 -+ -+@ u16 31..0 [4] q4-q7 -+@ u16 left[y] [4] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q12, q8, q0 -+A sub r0, r0, r3, lsl #1 -+T sub r0, r3 -+1: -+ vld1.16 {d0[0]}, [r2]! -+A add r0, r0, r3, lsl #1 -+T add r0, r3 -+ vadd.i16 q13, q9, q1 -+ subs r1, #2 -+ vadd.i16 q14, q10, q2 -+ vadd.i16 q15, q11, q3 -+ vmla.i16 q8, q4, d0[0] -+ vmla.i16 q9, q5, d0[0] -+ vmla.i16 q10, q6, d0[0] -+ vmla.i16 q11, q7, d0[0] -+ vmov.16 d0[0], ip -+ vrshr.u16 q8, #6 -+ vrshr.u16 q9, #6 -+ vrshr.u16 q10, #6 -+ vrshr.u16 q11, #6 -+ vstm r0, {q8-q11} -+ vadd.i16 q8, q12, q0 -+A add r0, r0, r3, lsl #1 -+T add r0, r3 -+ vld1.16 {d0[0]}, [r2]! 
-+ vadd.i16 q9, q13, q1 -+ vadd.i16 q10, q14, q2 -+ vadd.i16 q11, q15, q3 -+ vmla.i16 q12, q4, d0[0] -+ vmla.i16 q13, q5, d0[0] -+ vmla.i16 q14, q6, d0[0] -+ vmla.i16 q15, q7, d0[0] -+ vmov.16 d0[0], ip -+ vrshr.u16 q12, #6 -+ vrshr.u16 q13, #6 -+ vrshr.u16 q14, #6 -+ vrshr.u16 q15, #6 -+ vstm r0, {q12-q15} -+ vadd.i16 q12, q8, q0 -+ bne 1b -+ -+ vpop {q4-q7} -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 -+ -+ vld1.8 {q0}, [r1] @ Top -+ adr ip, nbx2_3_0_1_4 -+ vldr d2, [r2, #8] @ Left (lower) -+ mov r1, #4 -+ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4} -+ lsl r3, #1 -+ vshll.u8 q2, d0, #2 -+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4} -+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4} -+ vldr d6, [r2] @ Left (upper) -+ vmlal.u8 q2, d3, d1 -+ vsubl.u8 q0, d2, d0 -+ vldr d7, [ip] @ {3,3,2,2,1,1,0,0} -+ -+@ u8 3..0 [1] d7 -+@ u8 left[y] [1] d6 -+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vdup.16 d2, d6[0] -+ vadd.i16 q2, q0 -+ vdup.16 d3, d6[1] -+ vadd.i16 q8, q2, q0 -+1: -+ vmlal.u8 q2, d7, d2 -+ subs r1, #2 -+ vadd.i16 q9, q8, q0 -+ vmlal.u8 q8, d7, d3 -+ vdup.16 d2, d6[2] -+ vdup.16 d3, d6[3] -+ vrshrn.i16 d20, q2, #3 -+ vmov q2, q9 -+ vst1.8 {d20}, [r0], r3 -+ vrshrn.i16 d20, q8, #3 -+ vadd.i16 q8, q2, q0 -+ vst1.8 {d20}, [r0], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 -+ -+ adr ip, nbx2_3_0_1_4 -+ vld1.16 {q0}, [r1 :128]! @ Top (left) -+ lsl r3, #2 -+ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4} -+ add ip, r2, #16 -+ vld1.32 {d4[],d5[]}, [r1] @ Top (right) -+ vshl.s16 q3, q0, #2 -+ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4} -+ vld1.32 {d18[],d19[]}, [ip] @ Left (lower) -+ vmla.i16 q3, q8, q2 @ Acc set up -+ vsub.i16 q0, q9, q0 @ Add set up -+ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0} -+ vadd.i16 q2, q3, q0 -+ -+@ u16 3..0 [1] q1 -+@ u32 left[y] [1] [r2] -+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.32 {d6[],d7[]}, [r2]! -+ vadd.i16 q8, q2, q0 -+ vld1.32 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ -+ vrshr.u16 q9, q2, #3 -+ vmov q2, q3 -+ vrshr.u16 q10, q8, #3 -+ vld1.32 {d6[],d7[]}, [r2]! -+ vst1.16 {q9}, [r0 :128], r3 -+ vadd.i16 q8, q2, q0 -+ vld1.32 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ vst1.16 {q10}, [r0 :128], r3 -+ -+ vrshr.u16 q9, q2, #3 -+ add r3, r0 -+ vrshr.u16 q10, q8, #3 -+ vst1.16 {q9}, [r0 :128] -+ vst1.16 {q10}, [r3 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 -+ -+ adr ip, nbx2_7_0_1_8 + 16 -+ vld1.8 {q0}, [r1 :128]! 
@ Top (left) -+ add r2, #16 -+ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8} -+ lsl r3, #1 -+ vld1.16 {d4[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vshll.u8 q3, d0, #3 -+ mov r1, #8 -+ vshll.u8 q8, d1, #3 -+ vld1.16 {d5[]}, [r2] @ Left (lower) -+ sub r2, #16 -+ vmlal.u8 q3, d2, d4 -+ vmlal.u8 q8, d3, d4 @ Acc set up -+ vsubl.u8 q1, d5, d0 -+ vsubl.u8 q0, d5, d1 @ Add set up -+ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0} -+ -+@ u8 7..0 [1] q2 -+@ u8 left[y] [1] [r2] -+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q3, q1 -+ vadd.i16 q8, q0 -+1: -+ vadd.i16 q10, q3, q1 -+ subs r1, #2 -+ vld1.16 {d18[]}, [r2]! -+ vadd.i16 q11, q8, q0 -+ vld1.16 {d19[]}, [r2]! -+ vmlal.u8 q3, d4, d18 -+ vmlal.u8 q8, d5, d18 -+ vadd.i16 q12, q10, q1 -+ vmlal.u8 q10, d4, d19 -+ vadd.i16 q13, q11, q0 -+ vmlal.u8 q11, d5, d19 -+ vrshrn.u16 d18, q3, #4 -+ vrshrn.u16 d19, q8, #4 -+ vmov q3, q12 -+ vst1.8 {q9}, [r0 :128], r3 -+ vrshrn.u16 d18, q10, #4 -+ vrshrn.u16 d19, q11, #4 -+ vmov q8, q13 -+ vst1.8 {q9}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ Data - has to be in two lumps to ensure we can always reach using adr -+ -+ .balign 64 -+ -+nbx2_15_0_1_16: -+ .byte 15, 15, 14, 14, 13, 13, 12, 12 -+ .byte 11, 11, 10, 10, 9, 9, 8, 8 -+nbx2_7_0_1_8: -+ .byte 7, 7, 6, 6, 5, 5, 4, 4 -+ .byte 3, 3, 2, 2, 1, 1, 0, 0 -+ .byte 1, 1, 2, 2, 3, 3, 4, 4 -+ .byte 5, 5, 6, 6, 7, 7, 8, 8 -+ .byte 9, 9, 10, 10, 11, 11, 12, 12 -+ .byte 13, 13, 14, 14, 15, 15, 16, 16 -+ -+ @ should be back on a 64-byte boundary here -+ -+nbx2_3_0_1_4: -+ .byte 3, 3, 2, 2, 1, 1, 0, 0 -+ .byte 1, 1, 2, 2, 3, 3, 4, 4 -+ -+@------------------------------------------------------------------------------ -+ -+ -+@ ff_hevc_rpi_pred_planar_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ adr ip, nbx2_7_0_1_8 + 16 -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ add r2, #32 -+ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8} -+ lsl r3, #2 -+ vld1.32 {d6[],d7[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vmovl.u8 q8, d4 -+ mov r1, #8 -+ vshl.i16 q9, q0, #3 -+ vmovl.u8 q2, d5 -+ vshl.i16 q10, q1, #3 -+ vld1.32 {d22[],d23[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0} -+ vmla.i16 q9, q8, q3 -+ vmla.i16 q10, q2, q3 @ Acc set up -+ vsub.i16 q0, q11, q0 -+ vsub.i16 q1, q11, q1 @ Add set up -+ vadd.i16 q2, q9, q0 -+ vadd.i16 q3, q10, q1 -+ vmovl.u8 q8, d24 -+ vmovl.u8 q9, d25 -+ -+@ u16 7..0 [2] q8,q9 -+@ u32 left[y] [2] [r2] -+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] -+ -+1: -+ vadd.i16 q10, q2, q0 -+ subs r1, #2 -+ vld1.32 {d24[],d25[]}, [r2]! -+ vadd.i16 q11, q3, q1 -+ vld1.32 {d28[],d29[]}, [r2]! 
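-+@ q12/q14 now hold left[y] for the even/odd row of this pair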
-+ vmla.i16 q2, q8, q12 -+ vmla.i16 q3, q9, q12 -+ vadd.i16 q12, q10, q0 -+ vmla.i16 q10, q8, q14 -+ vadd.i16 q13, q11, q1 -+ vmla.i16 q11, q9, q14 -+ vrshr.u16 q14, q2, #4 -+ vrshr.u16 q15, q3, #4 -+ vmov q2, q12 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ vrshr.u16 q14, q10, #4 -+ vrshr.u16 q15, q11, #4 -+ vmov q3, q13 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 -+ -+ vld1.8 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nbx2_15_0_1_16 + 32 -+ vpush {d8-d12} -+ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16} -+ add r2, #32 -+ vld1.16 {d8[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vshll.u8 q8, d0, #4 -+ mov r1, #16 -+ vld1.16 {d9[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vshll.u8 q9, d1, #4 -+ lsl r3, #1 -+ vshll.u8 q10, d2, #4 -+ vshll.u8 q11, d3, #4 -+ vmlal.u8 q8, d4, d8 -+ vsubl.u8 q12, d9, d0 -+ vmlal.u8 q9, d5, d8 -+ vsubl.u8 q13, d9, d1 -+ vmlal.u8 q10, d6, d8 -+ vsubl.u8 q14, d9, d2 -+ vmlal.u8 q11, d7, d8 @ Acc set up -+ vsubl.u8 q15, d9, d3 @ Add set up -+ vadd.i16 q8, q12 -+ vadd.i16 q9, q13 -+ vadd.i16 q10, q14 -+ vadd.i16 q11, q15 -+ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0} -+ -+@ u8 15..0 [2] q4,q5 -+@ u8 left[y] [2] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.16 {d12[]}, [r2]! -+ vadd.i16 q0, q8, q12 -+ b 2f -+1: -+ vld1.16 {d12[]}, [r2]! -+ vrshrn.u16 d3, q1, #5 -+ vrshrn.u16 d2, q0, #5 -+ vadd.i16 q0, q8, q12 -+ vrshrn.u16 d4, q2, #5 -+ vrshrn.u16 d5, q3, #5 -+ vst1.8 {q1-q2}, [r0 :128], r3 -+2: vadd.i16 q1, q9, q13 -+ subs r1, #2 -+ vadd.i16 q2, q10, q14 -+ vadd.i16 q3, q11, q15 -+ vmlal.u8 q8, d8, d12 -+ vmlal.u8 q9, d9, d12 -+ vmlal.u8 q10, d10, d12 -+ vmlal.u8 q11, d11, d12 -+ vld1.16 {d12[]}, [r2]! -+ vrshrn.u16 d19, q9, #5 -+ vrshrn.u16 d18, q8, #5 -+ vadd.i16 q8, q0, q12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vst1.8 {q9-q10}, [r0 :128], r3 -+ vadd.i16 q9, q1, q13 -+ vadd.i16 q10, q2, q14 -+ vadd.i16 q11, q3, q15 -+ vmlal.u8 q0, d8, d12 -+ vmlal.u8 q1, d9, d12 -+ vmlal.u8 q2, d10, d12 -+ vmlal.u8 q3, d11, d12 -+ -+ bne 1b -+ -+ vpop {d8-d12} -+ -+ vrshrn.u16 d3, q1, #5 -+ vrshrn.u16 d2, q0, #5 -+ vrshrn.u16 d4, q2, #5 -+ vrshrn.u16 d5, q3, #5 -+ vst1.8 {q1-q2}, [r0 :128] -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nbx2_15_0_1_16 + 32 -+ vpush {q4-q7} -+ vld1.16 {q2-q3}, [r1 :128]! 
@ Top (centre)
-+ add r2, #64
-+ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16}
-+T lsl r3, #2
-+ vld1.32 {d8[],d9[]}, [r1] @ Top (right)
-+ sub ip, #32
-+ vmovl.u8 q12, d28
-+ mov r1, #16
-+ vmovl.u8 q13, d29
-+ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0}
-+ vmovl.u8 q14, d30
-+ vmovl.u8 q15, d31
-+ vld1.32 {d10[],d11[]}, [r2] @ Left (lower)
-+ sub r2, #64
-+ vshl.i16 q8, q0, #4
-+ vshl.i16 q9, q1, #4
-+ vshl.i16 q10, q2, #4
-+ vshl.i16 q11, q3, #4
-+ vmla.i16 q8, q12, q4
-+ vsub.i16 q0, q5, q0
-+ vmla.i16 q9, q13, q4
-+ vpush {q0}
-+ vsub.i16 q1, q5, q1
-+ vmla.i16 q10, q14, q4
-+ vsub.i16 q2, q5, q2
-+ vmla.i16 q11, q15, q4 @ Acc set up
-+ vsub.i16 q3, q5, q3 @ Add set up
-+ vadd.i16 q8, q0
-+ vadd.i16 q9, q1
-+ vadd.i16 q10, q2
-+ vadd.i16 q11, q3
-+ vmovl.u8 q4, d12
-+ vmovl.u8 q5, d13
-+ vmovl.u8 q6, d14
-+ vmovl.u8 q7, d15
-+
-+@ u16 31..0 [4] q4-q7
-+@ u16 left[y] [4] [r2]
-+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
-+@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1]
-+
-+ vadd.i16 q12, q8, q0
-+A sub r0, r0, r3, lsl #2
-+T sub r0, r3
-+1:
-+ vld1.32 {d0[],d1[]}, [r2]!
-+A add r0, r0, r3, lsl #2
-+T add r0, r3
-+ vadd.i16 q13, q9, q1
-+ subs r1, #2
-+ vadd.i16 q14, q10, q2
-+ vadd.i16 q15, q11, q3
-+ vmla.i16 q8, q4, q0
-+ vmla.i16 q9, q5, q0
-+ vmla.i16 q10, q6, q0
-+ vmla.i16 q11, q7, q0
-+ vld1.16 {q0}, [sp]
-+ vrshr.u16 q8, #5
-+ vrshr.u16 q9, #5
-+ vrshr.u16 q10, #5
-+ vrshr.u16 q11, #5
-+ vstm r0, {q8-q11}
-+ vadd.i16 q8, q12, q0
-+A add r0, r0, r3, lsl #2
-+T add r0, r3
-+ vld1.32 {d0[],d1[]}, [r2]!
-+ vadd.i16 q9, q13, q1
-+ vadd.i16 q10, q14, q2
-+ vadd.i16 q11, q15, q3
-+ vmla.i16 q12, q4, q0
-+ vmla.i16 q13, q5, q0
-+ vmla.i16 q14, q6, q0
-+ vmla.i16 q15, q7, q0
-+ vld1.16 {q0}, [sp]
-+ vrshr.u16 q12, #5
-+ vrshr.u16 q13, #5
-+ vrshr.u16 q14, #5
-+ vrshr.u16 q15, #5
-+ vstm r0, {q12-q15}
-+ vadd.i16 q12, q8, q0
-+ bne 1b
-+
-+ vpop {q3-q7}
-+ bx lr
-+
-+endfunc
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index d234271c5b..a36b675fba 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -3273,7 +3273,13 @@ typedef struct AVCodecContext {
- #endif
-
- /**
-- * Audio only. The amount of padding (in samples) appended by the encoder to
-+ * Opaque pointer for use by replacement get_buffer2 code
-+ *
-+ * @author jc (08/02/2016)
-+ */
-+ void * get_buffer_context;
-+
-+ /* Audio only. The amount of padding (in samples) appended by the encoder to
- * the end of the audio. I.e. this number of decoded samples must be
- * discarded by the caller from the end of the stream to get the original
- * audio without any trailing padding.
-@@ -4666,6 +4672,17 @@ void av_packet_rescale_ts(AVPacket *pkt, AVRational tb_src, AVRational tb_dst);
- */
- AVCodec *avcodec_find_decoder(enum AVCodecID id);
-
-+/**
-+ * Find a registered decoder with a matching codec ID and pix_fmt.
-+ * A decoder with pix_fmt set to NULL will match any fmt.
-+ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL.
-+ *
-+ * @param id AVCodecID of the requested decoder
-+ * @param fmt AVPixelFormat that must be supported by decoder
-+ * @return A decoder if one was found, NULL otherwise.
-+ */
-+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
-+
- /**
- * Find a registered decoder with the specified name.
- * -diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h -index 1bf1c620d6..ccfa991f60 100644 ---- a/libavcodec/cabac.h -+++ b/libavcodec/cabac.h -@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; - typedef struct CABACContext{ - int low; - int range; -- int outstanding_count; -+ union -+ { -+ int outstanding_count; -+ struct { -+ uint16_t bits; -+ uint16_t range; -+ } by22; -+ }; - const uint8_t *bytestream_start; - const uint8_t *bytestream; - const uint8_t *bytestream_end; -diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 647a22ef7c..4ed35d1126 100644 ---- a/libavcodec/mmaldec.c -+++ b/libavcodec/mmaldec.c -@@ -24,6 +24,9 @@ - * MMAL Video Decoder - */ - -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" - #include - #include - #include -@@ -31,6 +34,7 @@ - #include - #include - #include -+#pragma GCC diagnostic pop - #include - - #include "avcodec.h" -diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index b6fb91c1c6..e99dacbb1c 100644 ---- a/libavcodec/raw.c -+++ b/libavcodec/raw.c -@@ -289,6 +289,10 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { - { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, - { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, - -+ /* RPI (Might as well define for everything) */ -+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, -+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, -+ - /* special */ - { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ - { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ -diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d181b74570..c52c450956 100644 ---- a/libavcodec/rawenc.c -+++ b/libavcodec/rawenc.c -@@ -24,6 +24,7 @@ - * Raw Video Encoder - */ - -+#include "config.h" - #include "avcodec.h" - #include "raw.h" - #include "internal.h" -@@ -31,6 +32,10 @@ - #include "libavutil/intreadwrite.h" - #include "libavutil/imgutils.h" - #include "libavutil/internal.h" -+#include "libavutil/avassert.h" -+#if CONFIG_SAND -+#include "libavutil/rpi_sand_fns.h" -+#endif - - static av_cold int raw_encode_init(AVCodecContext *avctx) - { -@@ -49,6 +54,55 @@ FF_ENABLE_DEPRECATION_WARNINGS - return 0; - } - -+#if CONFIG_SAND -+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3 / 2; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); -+ dst += width * height; -+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); -+ return 0; -+} -+ -+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) -+ return 
ret;
-+
-+ dst = pkt->data;
-+
-+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
-+ dst += width * height * 2;
-+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
-+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
-+ return 0;
-+}
-+#endif
-+
-+
- static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
- const AVFrame *frame, int *got_packet)
- {
-@@ -58,6 +112,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
- if (ret < 0)
- return ret;
-
-+#if CONFIG_SAND
-+ if (av_rpi_is_sand_frame(frame)) {
-+ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame);
-+ *got_packet = (ret == 0);
-+ return ret;
-+ }
-+#endif
-+
- if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
- return ret;
- if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
-diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c
-new file mode 100644
-index 0000000000..552c2e349e
---- /dev/null
-+++ b/libavcodec/rpi_hevc_cabac.c
-@@ -0,0 +1,2255 @@
-+/*
-+ * HEVC CABAC decoding
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#define UNCHECKED_BITSTREAM_READER 1
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/common.h"
-+
-+#include "cabac_functions.h"
-+#include "rpi_hevc_data.h"
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+#include "rpi_hevc_cabac_fns.h"
-+
-+#include "libavutil/rpi_sand_fns.h"
-+
-+// BY22 is probably faster than simple bypass if the processor has
-+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
-+// x86 has fast int divide
-+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
-+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
-+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
-+// Use native divide if we have a fast one - otherwise use mpy 1/x
-+// x86 has a fast integer divide - arm doesn't - unsure about other
-+// architectures
-+#define USE_BY22_DIV ARCH_X86
-+
-+// Special case blocks with a single significant coeff
-+// Decreases the complexity of the code for a common case but increases the
-+// code size.
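-+// (blocks where only a single coefficient survives quantisation are
-+// common in smooth, well-predicted areas, so the special case pays off)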
-+#define USE_N_END_1 1 -+ -+#if !USE_BY22_DIV -+// * 1/x @ 32 bits gets us 22 bits of accuracy -+#define CABAC_BY22_PEEK_BITS 22 -+#else -+// A real 32-bit divide gets us another bit -+// If we have a 64 bit int & a unit time divider then we should get a lot -+// of bits (55) but that is untested and it is unclear if it would give -+// us a large advantage -+#define CABAC_BY22_PEEK_BITS 23 -+#endif -+ -+#define CABAC_MAX_BIN 31 -+ -+ -+#if USE_BY22 && !USE_BY22_DIV -+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) -+ -+static const uint32_t cabac_by22_inv_range[256] = { -+ 0, I(257), I(258), I(259), -+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), -+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), -+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), -+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), -+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), -+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), -+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), -+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), -+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), -+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), -+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), -+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), -+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), -+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), -+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), -+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), -+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429), -+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), -+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449), -+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), -+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), -+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), -+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), -+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), -+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), -+ I(510), I(511) -+}; -+#undef I -+#endif // USE_BY22 -+ -+#if ARCH_ARM -+#include "arm/rpi_hevc_cabac.h" -+#endif -+ -+/** -+ * number of bin by SyntaxElement. 
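-+ * Entries are in the same SyntaxElement order as elem_offset[] below.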
-+ */ -+static const int8_t num_bins_in_se[] = { -+ 1, // sao_merge_flag -+ 1, // sao_type_idx -+ 0, // sao_eo_class -+ 0, // sao_band_position -+ 0, // sao_offset_abs -+ 0, // sao_offset_sign -+ 0, // end_of_slice_flag -+ 3, // split_coding_unit_flag -+ 1, // cu_transquant_bypass_flag -+ 3, // skip_flag -+ 3, // cu_qp_delta -+ 1, // pred_mode -+ 4, // part_mode -+ 0, // pcm_flag -+ 1, // prev_intra_luma_pred_mode -+ 0, // mpm_idx -+ 0, // rem_intra_luma_pred_mode -+ 2, // intra_chroma_pred_mode -+ 1, // merge_flag -+ 1, // merge_idx -+ 5, // inter_pred_idc -+ 2, // ref_idx_l0 -+ 2, // ref_idx_l1 -+ 2, // abs_mvd_greater0_flag -+ 2, // abs_mvd_greater1_flag -+ 0, // abs_mvd_minus2 -+ 0, // mvd_sign_flag -+ 1, // mvp_lx_flag -+ 1, // no_residual_data_flag -+ 3, // split_transform_flag -+ 2, // cbf_luma -+ 4, // cbf_cb, cbf_cr -+ 2, // transform_skip_flag[][] -+ 2, // explicit_rdpcm_flag[][] -+ 2, // explicit_rdpcm_dir_flag[][] -+ 18, // last_significant_coeff_x_prefix -+ 18, // last_significant_coeff_y_prefix -+ 0, // last_significant_coeff_x_suffix -+ 0, // last_significant_coeff_y_suffix -+ 4, // significant_coeff_group_flag -+ 44, // significant_coeff_flag -+ 24, // coeff_abs_level_greater1_flag -+ 6, // coeff_abs_level_greater2_flag -+ 0, // coeff_abs_level_remaining -+ 0, // coeff_sign_flag -+ 8, // log2_res_scale_abs -+ 2, // res_scale_sign_flag -+ 1, // cu_chroma_qp_offset_flag -+ 1, // cu_chroma_qp_offset_idx -+}; -+ -+/** -+ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement. -+ */ -+static const int elem_offset[sizeof(num_bins_in_se)] = { -+ 0, // sao_merge_flag -+ 1, // sao_type_idx -+ 2, // sao_eo_class -+ 2, // sao_band_position -+ 2, // sao_offset_abs -+ 2, // sao_offset_sign -+ 2, // end_of_slice_flag -+ 2, // split_coding_unit_flag -+ 5, // cu_transquant_bypass_flag -+ 6, // skip_flag -+ 9, // cu_qp_delta -+ 12, // pred_mode -+ 13, // part_mode -+ 17, // pcm_flag -+ 17, // prev_intra_luma_pred_mode -+ 18, // mpm_idx -+ 18, // rem_intra_luma_pred_mode -+ 18, // intra_chroma_pred_mode -+ 20, // merge_flag -+ 21, // merge_idx -+ 22, // inter_pred_idc -+ 27, // ref_idx_l0 -+ 29, // ref_idx_l1 -+ 31, // abs_mvd_greater0_flag -+ 33, // abs_mvd_greater1_flag -+ 35, // abs_mvd_minus2 -+ 35, // mvd_sign_flag -+ 35, // mvp_lx_flag -+ 36, // no_residual_data_flag -+ 37, // split_transform_flag -+ 40, // cbf_luma -+ 42, // cbf_cb, cbf_cr -+ 46, // transform_skip_flag[][] -+ 48, // explicit_rdpcm_flag[][] -+ 50, // explicit_rdpcm_dir_flag[][] -+ 52, // last_significant_coeff_x_prefix -+ 70, // last_significant_coeff_y_prefix -+ 88, // last_significant_coeff_x_suffix -+ 88, // last_significant_coeff_y_suffix -+ 88, // significant_coeff_group_flag -+ 92, // significant_coeff_flag -+ 136, // coeff_abs_level_greater1_flag -+ 160, // coeff_abs_level_greater2_flag -+ 166, // coeff_abs_level_remaining -+ 166, // coeff_sign_flag -+ 166, // log2_res_scale_abs -+ 174, // res_scale_sign_flag -+ 176, // cu_chroma_qp_offset_flag -+ 177, // cu_chroma_qp_offset_idx -+}; -+ -+#define CNU 154 -+/** -+ * Indexed by init_type -+ */ -+static const uint8_t init_values[3][HEVC_CONTEXTS] = { -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 200, -+ // split_coding_unit_flag -+ 139, 141, 157, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ CNU, CNU, CNU, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ CNU, -+ // part_mode -+ 184, CNU, CNU, CNU, -+ // prev_intra_luma_pred_mode -+ 184, -+ // intra_chroma_pred_mode -+ 63, 139, -+ // merge_flag -+ CNU, -+ // merge_idx -+ 
CNU, -+ // inter_pred_idc -+ CNU, CNU, CNU, CNU, CNU, -+ // ref_idx_l0 -+ CNU, CNU, -+ // ref_idx_l1 -+ CNU, CNU, -+ // abs_mvd_greater1_flag -+ CNU, CNU, -+ // abs_mvd_greater1_flag -+ CNU, CNU, -+ // mvp_lx_flag -+ CNU, -+ // no_residual_data_flag -+ CNU, -+ // split_transform_flag -+ 153, 138, 138, -+ // cbf_luma -+ 111, 141, -+ // cbf_cb, cbf_cr -+ 94, 138, 182, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, -+ 79, 108, 123, 63, -+ // last_significant_coeff_y_prefix -+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, -+ 79, 108, 123, 63, -+ // significant_coeff_group_flag -+ 91, 171, 134, 141, -+ // significant_coeff_flag -+ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, -+ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, -+ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, -+ 141, 111, -+ // coeff_abs_level_greater1_flag -+ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, -+ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, -+ // coeff_abs_level_greater2_flag -+ 138, 153, 136, 167, 152, 152, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 185, -+ // split_coding_unit_flag -+ 107, 139, 126, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ 197, 185, 201, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ 149, -+ // part_mode -+ 154, 139, 154, 154, -+ // prev_intra_luma_pred_mode -+ 154, -+ // intra_chroma_pred_mode -+ 152, 139, -+ // merge_flag -+ 110, -+ // merge_idx -+ 122, -+ // inter_pred_idc -+ 95, 79, 63, 31, 31, -+ // ref_idx_l0 -+ 153, 153, -+ // ref_idx_l1 -+ 153, 153, -+ // abs_mvd_greater1_flag -+ 140, 198, -+ // abs_mvd_greater1_flag -+ 140, 198, -+ // mvp_lx_flag -+ 168, -+ // no_residual_data_flag -+ 79, -+ // split_transform_flag -+ 124, 138, 94, -+ // cbf_luma -+ 153, 111, -+ // cbf_cb, cbf_cr -+ 149, 107, 167, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, -+ 94, 108, 123, 108, -+ // last_significant_coeff_y_prefix -+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, -+ 94, 108, 123, 108, -+ // significant_coeff_group_flag -+ 121, 140, 61, 154, -+ // significant_coeff_flag -+ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, -+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, -+ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, -+ 140, 140, -+ // coeff_abs_level_greater1_flag -+ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, -+ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, -+ // coeff_abs_level_greater2_flag -+ 107, 167, 91, 122, 107, 167, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 160, -+ // split_coding_unit_flag -+ 107, 139, 126, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ 197, 185, 
201, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ 134, -+ // part_mode -+ 154, 139, 154, 154, -+ // prev_intra_luma_pred_mode -+ 183, -+ // intra_chroma_pred_mode -+ 152, 139, -+ // merge_flag -+ 154, -+ // merge_idx -+ 137, -+ // inter_pred_idc -+ 95, 79, 63, 31, 31, -+ // ref_idx_l0 -+ 153, 153, -+ // ref_idx_l1 -+ 153, 153, -+ // abs_mvd_greater1_flag -+ 169, 198, -+ // abs_mvd_greater1_flag -+ 169, 198, -+ // mvp_lx_flag -+ 168, -+ // no_residual_data_flag -+ 79, -+ // split_transform_flag -+ 224, 167, 122, -+ // cbf_luma -+ 153, 111, -+ // cbf_cb, cbf_cr -+ 149, 92, 167, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, -+ 79, 108, 123, 93, -+ // last_significant_coeff_y_prefix -+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, -+ 79, 108, 123, 93, -+ // significant_coeff_group_flag -+ 121, 140, 61, 154, -+ // significant_coeff_flag -+ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, -+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, -+ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, -+ 140, 140, -+ // coeff_abs_level_greater1_flag -+ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, -+ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, -+ // coeff_abs_level_greater2_flag -+ 107, 167, 91, 107, 107, 167, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+}; -+ -+static const uint8_t scan_1x1[1] = { -+ 0, -+}; -+ -+static const uint8_t horiz_scan2x2_x[4] = { -+ 0, 1, 0, 1, -+}; -+ -+static const uint8_t horiz_scan2x2_y[4] = { -+ 0, 0, 1, 1 -+}; -+ -+static const uint8_t horiz_scan4x4_x[16] = { -+ 0, 1, 2, 3, -+ 0, 1, 2, 3, -+ 0, 1, 2, 3, -+ 0, 1, 2, 3, -+}; -+ -+static const uint8_t horiz_scan4x4_y[16] = { -+ 0, 0, 0, 0, -+ 1, 1, 1, 1, -+ 2, 2, 2, 2, -+ 3, 3, 3, 3, -+}; -+ -+static const uint8_t horiz_scan8x8_inv[8][8] = { -+ { 0, 1, 2, 3, 16, 17, 18, 19, }, -+ { 4, 5, 6, 7, 20, 21, 22, 23, }, -+ { 8, 9, 10, 11, 24, 25, 26, 27, }, -+ { 12, 13, 14, 15, 28, 29, 30, 31, }, -+ { 32, 33, 34, 35, 48, 49, 50, 51, }, -+ { 36, 37, 38, 39, 52, 53, 54, 55, }, -+ { 40, 41, 42, 43, 56, 57, 58, 59, }, -+ { 44, 45, 46, 47, 60, 61, 62, 63, }, -+}; -+ -+static const uint8_t diag_scan2x2_x[4] = { -+ 0, 0, 1, 1, -+}; -+ -+static const uint8_t diag_scan2x2_y[4] = { -+ 0, 1, 0, 1, -+}; -+ -+static const uint8_t diag_scan2x2_inv[2][2] = { -+ { 0, 2, }, -+ { 1, 3, }, -+}; -+ -+static const uint8_t diag_scan4x4_inv[4][4] = { -+ { 0, 2, 5, 9, }, -+ { 1, 4, 8, 12, }, -+ { 3, 7, 11, 14, }, -+ { 6, 10, 13, 15, }, -+}; -+ -+static const uint8_t diag_scan8x8_inv[8][8] = { -+ { 0, 2, 5, 9, 14, 20, 27, 35, }, -+ { 1, 4, 8, 13, 19, 26, 34, 42, }, -+ { 3, 7, 12, 18, 25, 33, 41, 48, }, -+ { 6, 11, 17, 24, 32, 40, 47, 53, }, -+ { 10, 16, 23, 31, 39, 46, 52, 57, }, -+ { 15, 22, 30, 38, 45, 51, 56, 60, }, -+ { 21, 29, 37, 44, 50, 55, 59, 62, }, -+ { 28, 36, 43, 49, 54, 58, 61, 63, }, -+}; -+ -+ -+typedef struct -+{ -+ uint16_t coeff; -+ uint16_t scale; -+} xy_off_t; -+ -+#define XYT_C(x,y,t) ((x) + ((y) << (t))) -+#define SCALE_TRAFO(t) ((t) > 3 ? 
3 : (t)) -+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) -+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) -+ -+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} -+ -+#define OFF_DIAG(t) {\ -+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ -+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ -+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ -+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ -+} -+ -+#define OFF_HORIZ(t) {\ -+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ -+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ -+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ -+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ -+} -+ -+#define OFF_VERT(t) {\ -+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ -+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ -+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ -+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ -+} -+ -+static const xy_off_t off_xys[3][4][16] = -+{ -+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, -+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, -+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} -+}; -+ -+ -+// Helper fns -+#ifndef hevc_mem_bits32 -+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) -+{ -+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); -+} -+#endif -+ -+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) -+#define hevc_clz32 hevc_clz32_builtin -+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) -+{ -+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long -+ return __builtin_clz(x) - (sizeof(int) * 8 - 32); -+} -+#endif -+ -+// It is unlikely that we will ever need this but include for completeness -+#ifndef hevc_clz32 -+static inline unsigned int hevc_clz32(unsigned int x) -+{ -+ unsigned int n = 1; -+ if ((x & 0xffff0000) == 0) { -+ n += 16; -+ x <<= 16; -+ } -+ if ((x & 0xff000000) == 0) { -+ n += 8; -+ x <<= 8; -+ } -+ if ((x & 0xf0000000) == 0) { -+ n += 4; -+ x <<= 4; -+ } -+ if ((x & 0xc0000000) == 0) { -+ n += 2; -+ x <<= 2; -+ } -+ return n - ((x >> 31) & 1); -+} -+#endif -+ -+static inline int cabac_overflow(const CABACContext * const cc) -+{ -+ av_assert0(cc->bytestream >= cc->bytestream_start); -+ return cc->bytestream >= cc->bytestream_end + 4; -+} -+ -+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc) -+{ -+ return cabac_overflow(&lc->cc); -+} -+ -+#if !USE_BY22 -+// If no by22 then _by22 functions will revert to normal and so _peek/_flush -+// will no longer be called but the setup calls will still exist and we want -+// to null them out -+#define bypass_start(s) -+#define bypass_finish(s) -+#else -+// Use BY22 for residual bypass block -+ -+#define bypass_start(cc) get_cabac_by22_start(cc) -+#define bypass_finish(cc) get_cabac_by22_finish(cc) -+ -+// BY22 notes that bypass is simply a divide into the bitstream and so we -+// can peek out large quantities of bits at once and treat the result as if -+// it was VLC. 
In many cases this will lead to O(1) processing rather than -+// O(n) though the setup and teardown is sufficiently expensive that it is -+// only worth using if we expect to be dealing with more than a few bits -+// The definition of "a few bits" will vary from platform to platform but -+// tests on ARM show that it probably isn't worth it for a single coded -+// residual, but is for >1 - it also seems likely that if there are -+// more residuals then they are likely to be bigger and this will make the -+// O(1) nature of the code more worthwhile. -+ -+ -+// Bypass block start -+// Must be called before _by22_peek is used as it sets the CABAC environment -+// into the correct state. _by22_finish must be called to return to 'normal' -+// (i.e. non-bypass) cabac decoding -+#ifndef get_cabac_by22_start -+static inline void get_cabac_by22_start(CABACContext * const c) -+{ -+ const unsigned int bits = __builtin_ctz(c->low); -+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); -+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); -+#if !USE_BY22_DIV -+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; -+#endif -+ -+ c->bytestream -= (CABAC_BITS / 8); -+ c->by22.bits = bits; -+#if !USE_BY22_DIV -+ c->by22.range = c->range; -+ c->range = inv; -+#endif -+ c->low = x; -+} -+#endif -+ -+// Bypass block finish -+// Must be called at the end of the bypass block to return to normal operation -+static inline void get_cabac_by22_finish(CABACContext * const c) -+{ -+ unsigned int used = c->by22.bits; -+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); -+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7); -+ -+ c->bytestream += bytes_used + (CABAC_BITS / 8); -+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used; -+#if !USE_BY22_DIV -+ c->range = c->by22.range; -+#endif -+} -+ -+// Peek bypass bits -+// _by22_start must be called before _by22_peek is called and _by22_flush -+// must be called afterwards to flush any used bits -+// The actual number of valid bits returned is -+// min(, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS -+// will be at least 22 which should be long enough for any prefix or suffix -+// though probably not long enough for the worst case combination -+#ifndef get_cabac_by22_peek -+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) -+{ -+#if USE_BY22_DIV -+ return ((unsigned int)c->low / (unsigned int)c->range) << 9; -+#else -+ uint32_t x = c->low & ~1U; -+ const uint32_t inv = c->range; -+ -+ if (inv != 0) -+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); -+ -+ return x << 1; -+#endif -+} -+#endif -+ -+// Flush bypass bits peeked by _by22_peek -+// Flush n bypass bits. 
n must be >= 1 to guarantee correct operation -+// val is an unmodified copy of whatever _by22_peek returned -+#ifndef get_cabac_by22_flush -+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) -+{ -+ // Subtract the bits used & reshift up to the top of the word -+#if USE_BY22_DIV -+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); -+#else -+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); -+#endif -+ -+ // and refill lower bits -+ // We will probably OR over some existing bits but that doesn't matter -+ c->by22.bits += n; -+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); -+} -+#endif -+ -+#endif // USE_BY22 -+ -+ -+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc) -+{ -+ memcpy(s->cabac_save->rice, lc->stat_coeff, 4); -+ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS); -+} -+ -+static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ memcpy(lc->stat_coeff, s->cabac_save->rice, 4); -+ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS); -+} -+ -+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc) -+{ -+ GetBitContext * const gb = &lc->gb; -+ skip_bits(gb, 1); -+ align_get_bits(gb); -+ return ff_init_cabac_decoder(&lc->cc, -+ gb->buffer + get_bits_count(gb) / 8, -+ (get_bits_left(gb) + 7) / 8); -+} -+ -+static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int init_type = 2 - s->sh.slice_type; -+ int i; -+ -+ if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) -+ init_type ^= 3; -+ -+ for (i = 0; i < HEVC_CONTEXTS; i++) { -+ int init_value = init_values[init_type][i]; -+ int m = (init_value >> 4) * 5 - 45; -+ int n = ((init_value & 15) << 3) - 16; -+ int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127; -+ -+ pre ^= pre >> 31; -+ if (pre > 124) -+ pre = 124 + (pre & 1); -+ lc->cabac_state[i] = pre; -+ } -+ -+ for (i = 0; i < 4; i++) -+ lc->stat_coeff[i] = 0; -+} -+ -+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags) -+{ -+ if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0) -+ { -+ lc->qPy_pred = s->sh.slice_qp; -+ cabac_init_state(s, lc); -+ } -+ else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0) -+ { -+ lc->qPy_pred = s->sh.slice_qp; -+ load_states(s, lc); -+ } -+ lc->cabac_init_req = 0; -+} -+ -+#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) -+ -+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state) -+{ -+ return get_cabac_inline(c, state); -+} -+ -+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c) -+{ -+ return get_cabac_terminate(c); -+} -+ -+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc) -+{ -+ if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX])) -+ return 0; -+ -+ if (!get_cabac_bypass(&lc->cc)) -+ return SAO_BAND; -+ return SAO_EDGE; -+} -+ -+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 0; i < 4; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int i = 0; -+ int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; -+ -+ while (i < length 
&& get_cabac_bypass(&lc->cc)) -+ i++; -+ return i; -+} -+ -+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc) -+{ -+ return get_cabac_bypass(&lc->cc); -+} -+ -+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret = get_cabac_bypass(&lc->cc) << 1; -+ ret |= get_cabac_bypass(&lc->cc); -+ return ret; -+} -+ -+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc) -+{ -+ int val = 1; -+ -+ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0) -+ return 0; -+ -+ while (val < 5 && -+ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0) -+ val++; -+ -+ if (val >= 5) { -+ unsigned int k = 0; -+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { -+ val += 1 << k; -+ k++; -+ } -+// if (k == CABAC_MAX_BIN) -+// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); -+ -+ while (k--) -+ val += get_cabac_bypass(&lc->cc) << k; -+ } -+ return get_cabac_bypass(&lc->cc) ? -val : val; -+} -+ -+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1); -+ int i = 0; -+ -+ while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) -+ i++; -+ -+ return i; -+} -+ -+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size) -+{ -+ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1 -+ return PART_2Nx2N; -+ if (log2_cb_size == s->ps.sps->log2_min_cb_size) { -+ if (lc->cu.pred_mode == MODE_INTRA) // 0 -+ return PART_NxN; -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 -+ return PART_2NxN; -+ if (log2_cb_size == 3) // 00 -+ return PART_Nx2N; -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001 -+ return PART_Nx2N; -+ return PART_NxN; // 000 -+ } -+ -+ if (!s->ps.sps->amp_enabled_flag) { -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 -+ return PART_2NxN; -+ return PART_Nx2N; -+ } -+ -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011 -+ return PART_2NxN; -+ if (get_cabac_bypass(&lc->cc)) // 0101 -+ return PART_2NxnD; -+ return PART_2NxnU; // 0100 -+ } -+ -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001 -+ return PART_Nx2N; -+ if (get_cabac_bypass(&lc->cc)) // 0001 -+ return PART_nRx2N; -+ return PART_nLx2N; // 0000 -+} -+ -+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i = 0; -+ while (i < 2 && get_cabac_bypass(&lc->cc)) -+ i++; -+ return i; -+} -+ -+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 0; i < 4; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret; -+ if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) -+ return 4; -+ -+ ret = get_cabac_bypass(&lc->cc) << 1; -+ ret |= get_cabac_bypass(&lc->cc); -+ return ret; -+} -+ -+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int i = GET_CABAC_LC(elem_offset[MERGE_IDX]); -+ -+ if (i != 0) { -+ while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc)) -+ i++; -+ } -+ return i; -+} -+ -+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH) -+{ -+ if (nPbW + nPbH == 12) -+ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); -+ if 
(GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth)) -+ return PRED_BI; -+ -+ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); -+} -+ -+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx) -+{ -+ int i = 0; -+ int max = num_ref_idx_lx - 1; -+ int max_ctx = FFMIN(max, 2); -+ -+ while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i)) -+ i++; -+ if (i == 2) { -+ while (i < max && get_cabac_bypass(&lc->cc)) -+ i++; -+ } -+ -+ return i; -+} -+ -+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]); -+} -+ -+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); -+} -+ -+#if !USE_BY22 -+static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret = 2; -+ int k = 1; -+ -+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { -+ ret += 1U << k; -+ k++; -+ } -+ if (k == CABAC_MAX_BIN) { -+ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); -+ return 0; -+ } -+ -+ while (k--) -+ ret += get_cabac_bypass(&lc->cc) << k; -+ return get_cabac_bypass_sign(&lc->cc, -ret); -+} -+#endif -+ -+static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return get_cabac_bypass_sign(&lc->cc, -1); -+} -+ -+static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); -+} -+ -+static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); -+} -+ -+static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); -+} -+ -+ -+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) { -+ int i =0; -+ -+ while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) -+ i++; -+ -+ return i; -+} -+ -+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, -+ int log2_size, int *last_scx_prefix, int *last_scy_prefix) -+{ -+ int i = 0; -+ int max = (log2_size << 1) - 1; -+ int ctx_offset, ctx_shift; -+ -+ if (!c_idx_nz) { -+ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); -+ ctx_shift = (log2_size + 1) >> 2; -+ } else { -+ ctx_offset = 15; -+ ctx_shift = log2_size - 2; -+ } -+ while (i < max && -+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) -+ i++; -+ *last_scx_prefix = i; -+ -+ i = 0; -+ while (i < max && -+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) -+ i++; -+ *last_scy_prefix = i; -+} -+ -+static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc, -+ int last_significant_coeff_prefix) -+{ -+ int i; -+ int length = (last_significant_coeff_prefix >> 1) - 1; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 1; i < length; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg) -+{ -+ int inc; -+ -+ inc = (ctx_cg != 0) + (c_idx_nz << 1); -+ -+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); -+} -+ -+static 
av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset) -+{ -+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); -+} -+ -+#if !USE_BY22 -+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) -+#endif -+ -+ -+#ifndef coeff_abs_level_remaining_decode_bypass -+static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param) -+{ -+ uint32_t y; -+ unsigned int prefix; -+ unsigned int last_coeff_abs_level_remaining; -+ unsigned int n; -+ -+ y = get_cabac_by22_peek(c); -+ prefix = hevc_clz32(~y); -+ // y << prefix will always have top bit 0 -+ -+ if (prefix < 3) { -+ const unsigned int suffix = (y << prefix) >> (31 - rice_param); -+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; -+ n = prefix + 1 + rice_param; -+ } -+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) -+ { -+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); -+ -+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; -+ n = prefix * 2 + rice_param - 2; -+ } -+ else { -+ unsigned int suffix; -+ -+ get_cabac_by22_flush(c, prefix, y); -+ y = get_cabac_by22_peek(c); -+ -+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); -+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; -+ n = prefix + rice_param - 2; -+ } -+ -+ get_cabac_by22_flush(c, n, y); -+ -+ return last_coeff_abs_level_remaining; -+} -+#endif -+ -+static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param) -+{ -+ int prefix = 0; -+ int suffix = 0; -+ int last_coeff_abs_level_remaining; -+ int i; -+ -+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) -+ prefix++; -+ if (prefix == CABAC_MAX_BIN) { -+// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); -+ return 0; -+ } -+ -+ if (prefix < 3) { -+ for (i = 0; i < rc_rice_param; i++) -+ suffix = (suffix << 1) | get_cabac_bypass(c); -+ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; -+ } else { -+ int prefix_minus3 = prefix - 3; -+ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) -+ suffix = (suffix << 1) | get_cabac_bypass(c); -+ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) -+ << rc_rice_param) + suffix; -+ } -+ -+ return last_coeff_abs_level_remaining; -+} -+ -+#if !USE_BY22 -+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode -+static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb) -+{ -+ unsigned int i; -+ uint32_t ret = 0; -+ -+ for (i = 0; i < nb; i++) -+ ret = (ret << 1) | get_cabac_bypass(c); -+ -+ return ret << (32 - nb); -+} -+#endif -+ -+#ifndef coeff_sign_flag_decode_bypass -+static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb) -+{ -+ uint32_t y; -+ y = get_cabac_by22_peek(c); -+ get_cabac_by22_flush(c, nb, y); -+ return y & ~(0xffffffffU >> nb); -+} -+#endif -+ -+ -+#ifndef get_cabac_greater1_bits -+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, -+ uint8_t * const state0) -+{ -+ unsigned int i; -+ unsigned int rv = 0; -+ for (i = 0; i != n; ++i) { -+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; -+ const unsigned int b = get_cabac(c, state0 + idx); -+ rv = (rv << 1) | b; -+ } -+ return rv; -+} -+#endif -+ -+ -+// N.B. levels returned are the values assuming coeff_abs_level_remaining -+// is uncoded, so 1 must be added if it is coded. 
sum_abs also reflects
-+// this version of events.
-+static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
-+ int * const pprev_subset_coded, int * const psum,
-+ const unsigned int idx0_gt1, const unsigned int idx_gt2)
-+{
-+ CABACContext * const c = &lc->cc;
-+ uint8_t * const state0 = lc->cabac_state + idx0_gt1;
-+ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
-+ unsigned int rv;
-+ unsigned int i;
-+ const unsigned int n = FFMIN(n_end, 8);
-+
-+ // Really this is i != n but the simple unconditional loop is cheaper
-+ // and faster
-+ for (i = 0; i != 8; ++i)
-+ levels[i] = 1;
-+
-+ rv = get_cabac_greater1_bits(c, n, state0);
-+
-+ *pprev_subset_coded = 0;
-+ *psum = n;
-+
-+ rv <<= (32 - n);
-+ if (rv != 0)
-+ {
-+ *pprev_subset_coded = 1;
-+ *psum = n + 1;
-+ i = hevc_clz32(rv);
-+ levels[i] = 2;
-+ if (get_cabac(c, state_gt2) == 0)
-+ {
-+ // Unset first coded bit
-+ rv &= ~(0x80000000U >> i);
-+ }
-+ }
-+
-+ if (n_end > 8) {
-+ const unsigned int g8 = n_end - 8;
-+ rv |= ((1 << g8) - 1) << (24 - g8);
-+ for (i = 0; i != g8; ++i) {
-+ levels[i + 8] = 0;
-+ }
-+ }
-+
-+ return rv;
-+}
-+
-+// extended_precision_processing_flag must be false given we are
-+// putting the result into a 16-bit array
-+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
-+// scale_m is uint8_t
-+//
-+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
-+// or it can be 2 (if we have transquant_bypass)
-+// shift is set to one less than we really want but would normally be
-+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
-+// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
-+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
-+// to achieve it
-+
-+#ifndef trans_scale_sat
-+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-+{
-+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
-+}
-+#endif
-+
-+
-+#ifndef update_rice
-+static inline void update_rice(uint8_t * const stat_coeff,
-+ const unsigned int last_coeff_abs_level_remaining,
-+ const unsigned int c_rice_param)
-+{
-+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
-+ if (x >= 6)
-+ (*stat_coeff)++;
-+ else if (x == 0 && *stat_coeff > 0)
-+ (*stat_coeff)--;
-+}
-+#endif
-+
-+
-+// n must be > 0 on entry
-+#ifndef get_cabac_sig_coeff_flag_idxs
-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-+ unsigned int n,
-+ const uint8_t const * ctx_map,
-+ uint8_t * p)
-+{
-+ do {
-+ if (get_cabac(c, state0 + ctx_map[n]))
-+ *p++ = n;
-+ } while (--n != 0);
-+ return p;
-+}
-+#endif
-+
-+
-+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-+ unsigned int n,
-+ const uint8_t const * ctx_map,
-+ uint8_t * const flag_idx)
-+{
-+ int rv;
-+
-+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
-+
-+ return rv;
-+}
-+
-+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+ x0, x1, x2, x3,\
-+ x4, x5, x6, x7,\
-+ x8, x9, x10, x11,\
-+ x12, x13, x14, x15}
-+
-+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+ x0, x4, x8, x12,\
-+ x1, x5, x9, x13,\
-+ x2, x6, x10,
x14,\ -+ x3, x7, x11, x15} -+ -+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ -+ x0, x4, x1, x8,\ -+ x5, x2, x12, x9,\ -+ x6, x3, x13, x10,\ -+ x7, x14, x11, x15} -+ -+ -+static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz, -+ uint8_t * const significant_coeff_group_flag, -+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, -+ int * const pPrev_sig) -+{ -+ while (--i >= 0) { -+ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; -+ const unsigned int x_cg = scan_x_cg[i]; -+ -+ // For the flag decode we only care about Z/NZ but -+ // we use the full Right * 2 + Down when calculating -+ // significant coeff flags so we obtain it here. -+ // -+ // The group flag array is one longer than it needs to -+ // be so we don't need to check for y_cg limits -+ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); -+ -+ if (i == 0 || -+ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig)) -+ { -+ gf_y[0] |= (1 << x_cg); -+ *pPrev_sig = prev_sig; -+ break; -+ } -+ } -+ -+ return i; -+} -+ -+static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb, -+ const unsigned int log2_trafo_size, const unsigned int c_idx, -+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) -+{ -+ const AVFrame * const frame = s->frame; -+ const unsigned int stride = frame_stride1(s->frame, c_idx); -+ const unsigned int x = x0 >> ctx_hshift(s, c_idx); -+ const unsigned int y = y0 >> ctx_vshift(s, c_idx); -+ const int is_sliced = 1; // av_rpi_is_sand_frame(frame); -+ uint8_t * const dst = !is_sliced ? -+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(frame, x, y) : -+ av_rpi_sand_frame_pos_c(frame, x, y); -+ -+ const unsigned int i = jb->intra.n; -+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; -+ -+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && -+ pc->ta.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->ta.stride == stride); -+ -+ pc->type = RPI_PRED_ADD_RESIDUAL_C; -+ } -+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && -+ pc->dc.dst == dst) -+ { -+ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->dc.stride == stride); -+ -+ // Rewrite as add residual - must rewrite all fields as different union member -+ pc->type = RPI_PRED_ADD_RESIDUAL_V; -+ pc->ta.buf = coeffs; -+ pc->ta.dst = dst; -+ pc->ta.stride = stride; -+ pc->ta.dc = dc; -+ } -+ else -+ { -+ HEVCPredCmd * const cmd = pc + 1; -+ jb->intra.n = i + 1; -+ -+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); -+ cmd->size = log2_trafo_size; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; -+ cmd->ta.dc = 0; -+ } -+} -+ -+ -+static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const unsigned int log2_trafo_size, const unsigned int c_idx, -+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) -+{ -+ const AVFrame * const frame = s->frame; -+ const unsigned int stride = frame_stride1(s->frame, c_idx); -+ const unsigned int x = x0 >> ctx_hshift(s, c_idx); -+ const unsigned int y = y0 >> ctx_vshift(s, c_idx); -+ const int is_sliced = 1; -+ uint8_t * const dst = !is_sliced ? -+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : -+ c_idx == 0 ? 
-+ av_rpi_sand_frame_pos_y(frame, x, y) : -+ av_rpi_sand_frame_pos_c(frame, x, y); -+ -+ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); -+ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); -+ -+ const unsigned int i = jb->intra.n; -+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; -+ -+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && -+ pc->ta.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->ta.stride == stride); -+ -+ pc->ta.dc = (int16_t)coeff; -+ } -+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && -+ pc->dc.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->dc.stride == stride && -+ (pc->dc.dc & ~0xffff) == 0); -+ -+ pc->dc.dc |= (coeff << 16); -+ } -+ else -+ { -+ HEVCPredCmd * const cmd = pc + 1; -+ jb->intra.n = i + 1; -+ -+ cmd->type = RPI_PRED_ADD_DC + c_idx; -+ cmd->size = log2_trafo_size; -+ cmd->dc.dst = dst; -+ cmd->dc.stride = stride; -+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; -+ } -+} -+ -+ -+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, -+ const int log2_trafo_size, const enum ScanType scan_idx, -+ const int c_idx) -+{ -+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; -+ -+ int last_significant_coeff_x, last_significant_coeff_y; -+ int num_coeff = 0; -+ int prev_subset_coded = 0; -+ -+ int num_last_subset; -+ int x_cg_last_sig, y_cg_last_sig; -+ -+ const uint8_t *scan_x_cg, *scan_y_cg; -+ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; -+ -+ int use_vpu; -+#if RPI_COMPRESS_COEFFS -+ int num_nonzero = 0; -+ int use_compress = 0; -+ int *coeffs32; -+#endif -+ int use_dc = 0; -+ int16_t *coeffs; -+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero -+ int explicit_rdpcm_flag = 0; -+ int explicit_rdpcm_dir_flag; -+ -+ int i; -+ int shift,scale; -+ const uint8_t *scale_matrix = NULL; -+ uint8_t dc_scale; -+ const int c_idx_nz = (c_idx != 0); -+ const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ int prev_sig = 0; -+ int may_hide_sign; -+ -+ int16_t dummy_coeffs[16]; -+ -+ // Derive QP for dequant -+ if (!lc->cu.cu_transquant_bypass_flag) { -+ may_hide_sign = s->ps.pps->sign_data_hiding_flag; -+ -+ if (s->ps.pps->transform_skip_enabled_flag && -+ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { -+ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz); -+ if (transform_skip_flag) { -+ trans_skip_or_bypass = 1; -+ if (lc->cu.pred_mode == MODE_INTRA && -+ s->ps.sps->implicit_rdpcm_enabled_flag && -+ (pred_mode_intra == 10 || pred_mode_intra == 26)) { -+ may_hide_sign = 0; -+ } -+ } -+ } -+ -+ { -+ static const uint8_t level_scale[8] = { -+ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8 -+ }; -+ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y]; -+ -+ // Shift is set to one less than will actually occur as the scale -+ // and saturate step adds 1 and then shifts right again -+ scale = level_scale[qp6 & 7]; -+// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3); -+ shift = log2_trafo_size - (qp6 >> 3); -+ -+ if (shift < 0) { -+ scale <<= -shift; -+ shift = 0; -+ } -+ } -+ -+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { -+ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ? 
-+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; -+ const unsigned int matrix_id = -+ lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx; -+ -+ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; -+ dc_scale = scale_matrix[0]; -+ if (log2_trafo_size >= 4) -+ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; -+ } -+ else -+ { -+ static const uint8_t sixteen_scale[64] = { -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16 -+ }; -+ scale_matrix = sixteen_scale; -+ dc_scale = 16; -+ } -+ } else { -+ static const uint8_t unit_scale[64] = { -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ }; -+ scale_matrix = unit_scale; -+ shift = 0; -+ scale = 2; // We will shift right to kill this -+ dc_scale = 1; -+ -+ may_hide_sign = 0; -+ } -+ -+ -+ -+ -+ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && -+ trans_skip_or_bypass) { -+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz); -+ if (explicit_rdpcm_flag) { -+ may_hide_sign = 0; -+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz); -+ } -+ } -+ -+ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size, -+ &last_significant_coeff_x, &last_significant_coeff_y); -+ -+ if (last_significant_coeff_x > 3) { -+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x); -+ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * -+ (2 + (last_significant_coeff_x & 1)) + -+ suffix; -+ } -+ -+ if (last_significant_coeff_y > 3) { -+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y); -+ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * -+ (2 + (last_significant_coeff_y & 1)) + -+ suffix; -+ } -+ -+ if (scan_idx == SCAN_VERT) -+ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y); -+ -+ x_cg_last_sig = last_significant_coeff_x >> 2; -+ y_cg_last_sig = last_significant_coeff_y >> 2; -+ -+ switch (scan_idx) { -+ case SCAN_DIAG: { -+ int last_x_c = last_significant_coeff_x & 3; -+ int last_y_c = last_significant_coeff_y & 3; -+ -+ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; -+ -+ switch (log2_trafo_size) { -+ case 2: -+ scan_x_cg = scan_1x1; -+ scan_y_cg = scan_1x1; -+ break; -+ case 3: -+ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = diag_scan2x2_x; -+ scan_y_cg = diag_scan2x2_y; -+ break; -+ case 4: -+ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x; -+ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y; -+ break; -+ case 5: -+ default: -+ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x; -+ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y; -+ break; -+ } -+ break; -+ } -+ case SCAN_HORIZ: -+ scan_x_cg = horiz_scan2x2_x; -+ scan_y_cg = horiz_scan2x2_y; -+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; -+ break; -+ default: //SCAN_VERT -+ scan_x_cg = horiz_scan2x2_y; -+ scan_y_cg = horiz_scan2x2_x; -+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; -+ break; -+ } -+ 
num_coeff++; -+ num_last_subset = (num_coeff - 1) >> 4; -+ -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant -+ -+ { -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing -+ use_vpu = 0; -+ use_dc = (num_coeff == 1) && !special && -+ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); -+ -+ if (use_dc) { -+ // Just need a little empty space -+ coeffs = dummy_coeffs; -+ // No need to clear -+ } -+ else -+ { -+ use_vpu = !special && log2_trafo_size >= 4; -+#if RPI_COMPRESS_COEFFS -+ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed; -+#endif -+ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount); -+#if RPI_COMPRESS_COEFFS -+ coeffs32 = (int*)coeffs; -+ if (!use_compress) -+#endif -+#if HAVE_NEON -+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); -+#else -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+#endif -+ } -+ } -+ -+ i = num_last_subset; -+ do { -+ int implicit_non_zero_coeff = 0; -+ int n_end; -+ -+ uint8_t significant_coeff_flag_idx[16]; -+ unsigned int nb_significant_coeff_flag = 0; -+ -+ if (i == num_last_subset) { -+ // First time through -+ int last_scan_pos = num_coeff - (i << 4) - 1; -+ n_end = last_scan_pos - 1; -+ significant_coeff_flag_idx[0] = last_scan_pos; -+ nb_significant_coeff_flag = 1; -+ } else { -+ n_end = 15; -+ implicit_non_zero_coeff = (i != 0); -+ } -+ -+ if (n_end >= 0) { -+ static const uint8_t ctx_idx_maps_ts2[3][16] = { -+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 -+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 -+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 -+ }; -+ // N.B. prev_sig = Right * 2 + Down -+ static const uint8_t ctx_idx_maps[3][4][16] = { -+ { -+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ }, -+ { -+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ }, -+ { -+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ } -+ }; -+ const uint8_t *ctx_idx_map_p; -+ int scf_offset = 0; -+ -+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { -+ ctx_idx_map_p = ctx_idx_maps[0][3]; -+ scf_offset = 40 + c_idx_nz; -+ } else { -+ if (c_idx_nz != 0) -+ scf_offset = 27; -+ -+ if (log2_trafo_size == 2) { -+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; -+ } else { -+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; -+ if (!c_idx_nz) { -+ if (i != 0) -+ scf_offset += 3; -+ -+ if (log2_trafo_size == 3) { -+ scf_offset += (scan_idx == SCAN_DIAG) ? 
9 : 15; -+ } else { -+ scf_offset += 21; -+ } -+ } else { -+ if (log2_trafo_size == 3) -+ scf_offset += 9; -+ else -+ scf_offset += 12; -+ } -+ } -+ } -+ -+ if (n_end > 0) { -+ int cnt = get_sig_coeff_flag_idxs(&lc->cc, -+ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, -+ n_end, ctx_idx_map_p, -+ significant_coeff_flag_idx + nb_significant_coeff_flag); -+ -+ nb_significant_coeff_flag += cnt; -+ if (cnt != 0) { -+ implicit_non_zero_coeff = 0; -+ } -+ } -+ -+ if (implicit_non_zero_coeff == 0) { -+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { -+ scf_offset = 42 + c_idx_nz; -+ } else { -+ if (i == 0) { -+ scf_offset = c_idx_nz ? 27 : 0; -+ } else { -+ scf_offset = 2 + scf_offset; -+ } -+ } -+ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) { -+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; -+ nb_significant_coeff_flag++; -+ } -+ } else { -+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; -+ nb_significant_coeff_flag++; -+ } -+ } -+#if RPI_COMPRESS_COEFFS -+ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full! -+ int16_t temp[32*32]; -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0; -+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer -+ memcpy(temp, coeffs, sizeof(int)*num_nonzero); -+ coeffs32 = (int *)temp; -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+ num_nonzero--; -+ while (num_nonzero >= 0) { -+ const unsigned int res = coeffs32[num_nonzero]; -+ const unsigned int offset = res & 0xffff; -+ coeffs[ offset ] = res >> 16; -+ num_nonzero--; -+ } -+ use_compress = 0; -+ } -+#endif -+ -+ if (nb_significant_coeff_flag != 0) { -+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | -+ ((i != 0 && !c_idx_nz) ? 2 : 0) | -+ prev_subset_coded; -+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + -+ (gt1_idx_delta << 2); -+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + -+ gt1_idx_delta; -+ -+ const unsigned int x_cg = scan_x_cg[i]; -+ const unsigned int y_cg = scan_y_cg[i]; -+ int16_t * const blk_coeffs = coeffs + -+ ((x_cg + (y_cg << log2_trafo_size)) << 2); -+ // This calculation is 'wrong' for log2_traffo_size == 2 -+ // but that doesn't matter as in this case x_cg & y_cg -+ // are always 0 so result is correct (0) anyway -+ const uint8_t * const blk_scale = scale_matrix + -+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size))); -+ -+ // * The following code block doesn't deal with these flags: -+ // (nor did the one it replaces) -+ // -+ // cabac_bypass_alignment_enabled_flag -+ // This should be easy but I can't find a test case -+ // extended_precision_processing_flag -+ // This can extend the required precision past 16bits -+ // so is probably tricky - also no example found yet -+ -+#if USE_N_END_1 -+ if (nb_significant_coeff_flag == 1) { -+ // There is a small gain to be had from special casing the single -+ // transform coefficient case. The reduction in complexity -+ // makes up for the code duplicatioon. 
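
(Editor's illustration, not part of the deleted patch: the coeff_abs_level_remaining_decode() calls in the block below consume a Golomb-Rice code from the CABAC bypass stream. This is a minimal sketch of that binarization with a hypothetical read_bit() callback standing in for the bypass bit source; HEVC's Exp-Golomb escape for long prefixes is deliberately omitted.)

    /* Decode the unary quotient and k-bit remainder of a Rice code.
     * read_bit/opaque are assumed stand-ins for a bypass bit source. */
    static unsigned rice_decode(unsigned k,
                                unsigned (*read_bit)(void *opaque),
                                void *opaque)
    {
        unsigned quotient = 0, remainder = 0, i;

        while (read_bit(opaque))          /* unary prefix: run of 1-bits */
            quotient++;
        for (i = 0; i < k; i++)           /* fixed k-bit binary suffix */
            remainder = (remainder << 1) | read_bit(opaque);
        return (quotient << k) + remainder;
    }
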
-+ -+ int trans_coeff_level = 1; -+ int coeff_sign_flag; -+ int coded_val = 0; -+ -+ // initialize first elem of coeff_bas_level_greater1_flag -+ prev_subset_coded = 0; -+ -+ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) { -+ trans_coeff_level = 2; -+ prev_subset_coded = 1; -+ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2); -+ } -+ -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&lc->cc); -+ -+ if (coded_val) -+ { -+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { -+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0); -+ } else { -+ uint8_t * const stat_coeff = -+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); -+ const unsigned int c_rice_param = *stat_coeff >> 2; -+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param); -+ -+ trans_coeff_level = 3 + last_coeff_abs_level_remaining; -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ } -+ } -+ -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; -+ const int res = trans_scale_sat( -+ (trans_coeff_level ^ k) - k, // Apply sign -+ scale, -+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m, -+ shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) -+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); -+ else -+#endif -+ blk_coeffs[xy_off->coeff] = res; -+ } -+ } -+ else -+#endif -+ { -+ int sign_hidden = may_hide_sign; -+ int levels[16]; // Should be able to get away with int16_t but that fails some tests -+ uint32_t coeff_sign_flags; -+ uint32_t coded_vals = 0; -+ // Sum(abs(level[])) -+ // In fact we only need the bottom bit and in some future -+ // version that may be all we calculate -+ unsigned int sum_abs; -+ -+ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels, -+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); -+ -+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) -+ sign_hidden = 0; -+ -+ // -- Start bypass block -+ -+ bypass_start(&lc->cc); -+ -+ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden); -+ -+ if (coded_vals != 0) -+ { -+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; -+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : -+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); -+ int c_rice_param = !rice_adaptation_enabled ? 
0 : *stat_coeff >> 2; -+ int * level = levels - 1; -+ -+ do { -+ { -+ const unsigned int z = hevc_clz32(coded_vals) + 1; -+ level += z; -+ coded_vals <<= z; -+ } -+ -+ { -+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param); -+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; -+ -+ sum_abs += last_coeff_abs_level_remaining + 1; -+ *level = trans_coeff_level; -+ -+ if (stat_coeff != NULL) -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ stat_coeff = NULL; -+ -+ if (trans_coeff_level > (3 << c_rice_param) && -+ (c_rice_param < 4 || rice_adaptation_enabled)) -+ ++c_rice_param; -+ } -+ } while (coded_vals != 0); -+ } -+ -+ // sign_hidden = 0 or 1 so we can combine the tests -+ if ((sign_hidden & sum_abs) != 0) { -+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; -+ } -+ -+ bypass_finish(&lc->cc); -+ -+ // -- Finish bypass block -+ -+ // Scale loop -+ { -+ int m = nb_significant_coeff_flag - 1; -+ -+ // Deal with DC component (if any) first -+ if (i == 0 && significant_coeff_flag_idx[m] == 0) -+ { -+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; -+ const int res = trans_scale_sat( -+ (levels[m] ^ k) - k, scale, dc_scale, shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) -+ { -+ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); -+ } -+ else -+#endif -+ { -+ blk_coeffs[0] = res; -+ } -+ --m; -+ } -+ -+#if !USE_N_END_1 -+ // If N_END_1 set then m was at least 1 initially -+ if (m >= 0) -+#endif -+ { -+ do { -+ const xy_off_t * const xy_off = scan_xy_off + -+ significant_coeff_flag_idx[m]; -+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; -+ const int res = trans_scale_sat( -+ (levels[m] ^ k) - k, -+ scale, -+ blk_scale[xy_off->scale], -+ shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) { -+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); -+ } else -+#endif -+ blk_coeffs[xy_off->coeff] = res; -+ } while (--m >= 0); -+ } -+ } -+ -+ } -+ } -+ } while ((i = next_subset(lc, i, c_idx_nz, -+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 && -+ !cabac_overflow(&lc->cc)); -+ -+ if (lc->cu.cu_transquant_bypass_flag) { -+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -+ (pred_mode_intra == 10 || pred_mode_intra == 26))) { -+ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag; -+ -+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); -+ } -+ } else { -+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass -+ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && -+ log2_trafo_size == 2 && -+ lc->cu.pred_mode == MODE_INTRA; -+ if (rot) { -+ for (i = 0; i < 8; i++) -+ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); -+ } -+ -+ s->hevcdsp.dequant(coeffs, log2_trafo_size); -+ -+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -+ lc->cu.pred_mode == MODE_INTRA && -+ (pred_mode_intra == 10 || pred_mode_intra == 26))) { -+ int mode = explicit_rdpcm_flag ? 
explicit_rdpcm_dir_flag : (pred_mode_intra == 26); -+ -+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); -+ } -+ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { -+ s->hevcdsp.transform_4x4_luma(coeffs); -+ } -+ else if (!use_vpu) -+ { -+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); -+ if (max_xy == 0) -+ { -+ if (use_dc) -+ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); -+ else -+ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); -+ } -+ else { -+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; -+ if (max_xy < 4) -+ col_limit = FFMIN(4, col_limit); -+ else if (max_xy < 8) -+ col_limit = FFMIN(8, col_limit); -+ else if (max_xy < 12) -+ col_limit = FFMIN(24, col_limit); -+ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); -+ } -+ } -+ } -+ -+#if 0 -+ // Mildly rotted - we support no mode where cross is valid -+ if (lc->tu.cross_pf) { -+ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; -+ const int ccount = 1 << (log2_trafo_size * 2); -+ -+ for (i = 0; i < ccount; i++) { -+ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); -+ } -+ } -+#endif -+ -+ if (!use_dc) { -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) { -+ coeffs32[num_nonzero] = 0; -+ } -+#endif -+ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); -+ } -+} -+ -+#if !USE_BY22 -+// Stores results to lc -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) -+{ -+ int x = abs_mvd_greater0_flag_decode(lc); -+ int y = abs_mvd_greater0_flag_decode(lc); -+ -+ if (x) -+ x += abs_mvd_greater1_flag_decode(lc); -+ if (y) -+ y += abs_mvd_greater1_flag_decode(lc); -+ -+ switch (x) { -+ case 2: x = mvd_decode(lc); break; -+ case 1: x = mvd_sign_flag_decode(lc); break; -+ case 0: x = 0; break; -+ } -+ -+ switch (y) { -+ case 2: y = mvd_decode(lc); break; -+ case 1: y = mvd_sign_flag_decode(lc); break; -+ case 0: y = 0; break; -+ } -+ return MV_XY(x,y); -+} -+#else -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) -+{ -+ int x = abs_mvd_greater0_flag_decode(lc); -+ int y = abs_mvd_greater0_flag_decode(lc); -+ -+ if ((x | y) == 0) -+ return 0; -+ -+ if (x != 0) -+ x += abs_mvd_greater1_flag_decode(lc); -+ if (y != 0) -+ y += abs_mvd_greater1_flag_decode(lc); -+ -+ if ((x | y) == 1) -+ { -+ // Not worth starting BY22 -+ if (x != 0) -+ x = mvd_sign_flag_decode(lc); -+ if (y != 0) -+ y = mvd_sign_flag_decode(lc); -+ } -+ else -+ { -+ CABACContext * const cc = &lc->cc; -+ uint32_t val; -+ uint32_t b; -+ unsigned int n = 0; -+ -+ bypass_start(cc); -+ b = val = get_cabac_by22_peek(cc); -+ -+ if (x == 1) { -+ x = ((int32_t)b >> 31) | 1; -+ n = 1; -+ b <<= 1; -+ } -+ else if (x == 2) { -+ // EG1 so we have (leading one bits + 1) of suffix -+ // This makes prefix & suffix lengths the same -+ const unsigned int k = hevc_clz32(~b) + 1; -+ int s; -+ -+ av_assert2(k <= 15); -+ -+ b <<= k; -+ n = 2 * k + 1; // Includes suffix & sign -+ -+ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked -+ // if we are going to do this without a flush -+ if (k > CABAC_BY22_PEEK_BITS / 2 - 1) -+ { -+ // Need too many bits - flush -+ // n = k -+ get_cabac_by22_flush(cc, k, val); -+ b = val = get_cabac_by22_peek(cc); -+ n = k + 1; -+ } -+ -+ x = (b >> (32 - k)) + (1 << k); -+ b <<= k; -+ s = (int32_t)b >> 31; -+ x = (x ^ s) - s; -+ b <<= 1; -+ -+ // Max abs value of an mv is 2^15 - 1 (i.e. 
a prefix len of 15 bits) -+ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15) -+ { -+ get_cabac_by22_flush(cc, n, val); -+ b = val = get_cabac_by22_peek(cc); -+ n = 0; -+ } -+ } -+ -+ if (y == 1) { -+ y = ((int32_t)b >> 31) | 1; -+ ++n; -+ // don't care about b anymore -+ } -+ else if (y == 2) { -+ const unsigned int k = hevc_clz32(~b) + 1; -+ int s; -+ -+ av_assert2(k <= 15); -+ -+ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked -+ // if we are going to do this without a flush -+ b <<= k; -+ n += 2 * k + 1; -+ -+ if (n > CABAC_BY22_PEEK_BITS) -+ { -+ // Need too many bits - flush -+ get_cabac_by22_flush(cc, n - (k + 1), val); -+ b = val = get_cabac_by22_peek(cc); -+ n = k + 1; -+ } -+ -+ y = (b >> (32 - k)) + (1 << k); -+ s = (int32_t)(b << k) >> 31; -+ y = (y ^ s) - s; -+ // don't care about b anymore -+ } -+ -+ get_cabac_by22_flush(cc, n, val); -+ bypass_finish(cc); -+ } -+ -+ return MV_XY(x, y); -+} -+#endif -diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h -new file mode 100644 -index 0000000000..a6587616ae ---- /dev/null -+++ b/libavcodec/rpi_hevc_cabac_fns.h -@@ -0,0 +1,191 @@ -+#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H -+#define AVCODEC_RPI_HEVC_CABAC_FNS_H -+ -+#include "config.h" -+#include "rpi_hevcdec.h" -+ -+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc); -+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags); -+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size); -+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH); -+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx); -+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx); -+ -+//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); -+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, -+ const int log2_trafo_size, const enum ScanType scan_idx, -+ const int c_idx); -+ -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); -+ -+#define HEVC_BIN_SAO_MERGE_FLAG 0 -+#define HEVC_BIN_SAO_TYPE_IDX 1 -+#define HEVC_BIN_SAO_EO_CLASS 2 -+#define HEVC_BIN_SAO_BAND_POSITION 2 -+#define HEVC_BIN_SAO_OFFSET_ABS 2 -+#define HEVC_BIN_SAO_OFFSET_SIGN 2 -+#define HEVC_BIN_END_OF_SLICE_FLAG 2 -+#define 
HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2 -+#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5 -+#define HEVC_BIN_SKIP_FLAG 6 -+#define HEVC_BIN_CU_QP_DELTA 9 -+#define HEVC_BIN_PRED_MODE 12 -+#define HEVC_BIN_PART_MODE 13 -+#define HEVC_BIN_PCM_FLAG 17 -+#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17 -+#define HEVC_BIN_MPM_IDX 18 -+#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18 -+#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18 -+#define HEVC_BIN_MERGE_FLAG 20 -+#define HEVC_BIN_MERGE_IDX 21 -+#define HEVC_BIN_INTER_PRED_IDC 22 -+#define HEVC_BIN_REF_IDX_L0 27 -+#define HEVC_BIN_REF_IDX_L1 29 -+#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31 -+#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33 -+#define HEVC_BIN_ABS_MVD_MINUS2 35 -+#define HEVC_BIN_MVD_SIGN_FLAG 35 -+#define HEVC_BIN_MVP_LX_FLAG 35 -+#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36 -+#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37 -+#define HEVC_BIN_CBF_LUMA 40 -+#define HEVC_BIN_CBF_CB_CR 42 -+#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46 -+#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48 -+#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88 -+#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88 -+#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92 -+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136 -+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160 -+#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166 -+#define HEVC_BIN_COEFF_SIGN_FLAG 166 -+#define HEVC_BIN_LOG2_RES_SCALE_ABS 166 -+#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174 -+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176 -+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177 -+ -+ -+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state); -+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c); -+ -+static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) { -+ const uint8_t *ptr = c->bytestream; -+ -+ if (c->low & 0x1) -+ ptr--; -+#if CABAC_BITS == 16 -+ if (c->low & 0x1FF) -+ ptr--; -+#endif -+ if ((int) (c->bytestream_end - ptr) < n) -+ return NULL; -+ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0) -+ return NULL; -+ -+ return ptr; -+} -+ -+static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG); -+} -+ -+static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int ct_depth, -+ const unsigned int x0, const unsigned int y0) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG + -+ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + -+ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth)); -+} -+ -+static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, const int x_cb, const int y_cb) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + -+ 
(s->cabac_stash_left[y0 >> 3] & 1) + -+ (s->cabac_stash_up[x0 >> 3] & 1)); -+} -+ -+static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE); -+} -+ -+static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac_terminate(&lc->cc); -+} -+ -+static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE); -+} -+ -+static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG); -+} -+ -+static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG); -+} -+ -+static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth); -+} -+ -+static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth); -+} -+ -+static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size); -+} -+ -+static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx); -+} -+ -+ -+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c -new file mode 100644 -index 0000000000..341bb77d9d ---- /dev/null -+++ b/libavcodec/rpi_hevc_data.c -@@ -0,0 +1,75 @@ -+/* -+ * HEVC shared tables -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
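
(Editor's illustration, not part of the deleted patch: the inline decoders above all index one flat cabac_state[] array via the HEVC_BIN_* offsets, and some elements add a context increment derived from already-decoded neighbours. A compilable toy of that indexing scheme, using skip_flag as the example; the names here are hypothetical.)

    #include <stdint.h>
    #include <stdio.h>

    #define BIN_SKIP_FLAG 6  /* mirrors HEVC_BIN_SKIP_FLAG above */

    /* cu_skip_flag owns three adjacent contexts; the increment counts
     * how many of the left/up neighbour CUs were themselves skipped. */
    static unsigned skip_flag_ctx(int left_skipped, int up_skipped)
    {
        return BIN_SKIP_FLAG + (left_skipped != 0) + (up_skipped != 0);
    }

    int main(void)
    {
        printf("%u %u %u\n", skip_flag_ctx(0, 0),  /* 6 */
                             skip_flag_ctx(1, 0),  /* 7 */
                             skip_flag_ctx(1, 1)); /* 8 */
        return 0;
    }
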
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include <stdint.h>
-+
-+#include "rpi_hevc_data.h"
-+
-+const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = {
-+    0, 0, 1, 0,
-+    1, 2, 0, 1,
-+    2, 3, 1, 2,
-+    3, 2, 3, 3,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = {
-+    0, 1, 0, 2,
-+    1, 0, 3, 2,
-+    1, 0, 3, 2,
-+    1, 3, 2, 3,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = {
-+    0, 0, 1, 0,
-+    1, 2, 0, 1,
-+    2, 3, 0, 1,
-+    2, 3, 4, 0,
-+    1, 2, 3, 4,
-+    5, 0, 1, 2,
-+    3, 4, 5, 6,
-+    0, 1, 2, 3,
-+    4, 5, 6, 7,
-+    1, 2, 3, 4,
-+    5, 6, 7, 2,
-+    3, 4, 5, 6,
-+    7, 3, 4, 5,
-+    6, 7, 4, 5,
-+    6, 7, 5, 6,
-+    7, 6, 7, 7,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = {
-+    0, 1, 0, 2,
-+    1, 0, 3, 2,
-+    1, 0, 4, 3,
-+    2, 1, 0, 5,
-+    4, 3, 2, 1,
-+    0, 6, 5, 4,
-+    3, 2, 1, 0,
-+    7, 6, 5, 4,
-+    3, 2, 1, 0,
-+    7, 6, 5, 4,
-+    3, 2, 1, 7,
-+    6, 5, 4, 3,
-+    2, 7, 6, 5,
-+    4, 3, 7, 6,
-+    5, 4, 7, 6,
-+    5, 7, 6, 7,
-+};
-diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h
-new file mode 100644
-index 0000000000..0aee673d8b
---- /dev/null
-+++ b/libavcodec/rpi_hevc_data.h
-@@ -0,0 +1,31 @@
-+/*
-+ * HEVC shared data tables
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_DATA_H
-+#define AVCODEC_RPI_HEVC_DATA_H
-+
-+#include <stdint.h>
-+
-+extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
-+extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
-+extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
-+extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
-+
-+#endif /* AVCODEC_RPI_HEVC_DATA_H */
-diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c
-new file mode 100644
-index 0000000000..dd5f65b5c4
---- /dev/null
-+++ b/libavcodec/rpi_hevc_filter.c
-@@ -0,0 +1,1206 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 Seppo Tomperi
-+ * Copyright (C) 2013 Wassim Hamidouche
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
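
(Editor's illustration, not part of the deleted patch: the ff_hevc_rpi_diag_scan* tables above enumerate the spec's up-right diagonal scan. This standalone generator reproduces the same (x, y) order for any block size, so the hand-written tables can be cross-checked; all names are local to the sketch.)

    #include <stdio.h>

    static void gen_diag_scan(int size)       /* size = 4 or 8 */
    {
        int i = 0, x = 0, y = 0;
        while (i < size * size) {
            printf("(%d,%d)%c", x, y, ++i % size ? ' ' : '\n');
            if (y == 0 || x == size - 1) {    /* diagonal finished */
                int d = x + y + 1;            /* move to next diagonal */
                x = d < size ? 0 : d - size + 1;
                y = d - x;
            } else {                          /* step up-right */
                x++;
                y--;
            }
        }
    }

    int main(void)
    {
        /* First entries match the 4x4 tables above:
         * (0,0) (0,1) (1,0) (0,2) (1,1) (2,0) ... */
        gen_diag_scan(4);
        return 0;
    }
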
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+//#define DISABLE_SAO -+//#define DISABLE_DEBLOCK -+//#define DISABLE_STRENGTHS -+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) -+//#define DISABLE_DEBLOCK_NONREF -+ -+#include "libavutil/common.h" -+#include "libavutil/internal.h" -+ -+#include "rpi_hevcdec.h" -+ -+#include "bit_depth_template.c" -+ -+#include "rpi_qpu.h" -+#include "rpi_zc.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#define LUMA 0 -+#define CB 1 -+#define CR 2 -+ -+// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2 -+// so -12,75 overall -+static const uint8_t tctablex[] = { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18 -+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37 -+ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53 -+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..75 -+}; -+#define tctable (tctablex + 12 + 6*8) -+ -+static const uint8_t betatablex[] = { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18 -+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37 -+ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51 -+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..73 -+}; -+#define betatable (betatablex + 12 + 6*8) -+ -+static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y, -+ const int c_idx, const int tc_offset) -+{ -+ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2]; -+} -+ -+static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int xBase, const unsigned int yBase) -+{ -+ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; -+ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size; -+ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask; -+ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask; -+ const unsigned int min_cb_width = s->ps.sps->min_cb_width; -+ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; -+ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; -+ const int qPy_pred = lc->qPy_pred; -+ -+ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred : -+ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) + -+ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred : -+ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1; -+} -+ -+// * Only called from bitstream decode in foreground -+// so should be safe -+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase) -+{ -+ const int qp_y = get_qPy_pred(s, lc, xBase, yBase); -+ -+ if (lc->tu.cu_qp_delta != 0) { -+ // ?? 
I suspect that the -bd_offset here leads to us adding it elsewhere -+ int off = s->ps.sps->qp_bd_offset; -+ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off, -+ 52 + off) - off; -+ } else -+ lc->qp_y = qp_y; -+} -+ -+static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx) -+{ -+ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; -+} -+ -+// "DSP" these? -+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) -+{ -+ switch (pixel_shift) -+ { -+ case 2: -+ *(uint32_t *)dst = *(uint32_t *)src; -+ break; -+ case 1: -+ *(uint16_t *)dst = *(uint16_t *)src; -+ break; -+ default: -+ *dst = *src; -+ break; -+ } -+} -+ -+static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src, -+ ptrdiff_t stride_src, int x, int y, int width, int height, -+ int c_idx, int x_ctb, int y_ctb) -+{ -+ const unsigned int sh = pixel_shift(s, c_idx); -+ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx); -+ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx); -+ -+ /* copy horizontal edges */ -+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), -+ src, width << sh); -+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), -+ src + stride_src * (height - 1), width << sh); -+ -+ /* copy vertical edges */ -+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); -+ -+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); -+} -+ -+// N.B. Src & dst are swapped as this is a restore! -+// x0 & y0 are in luma coords -+// Width & height are in Y/C pels as appropriate -+// * Clear scope for optimsation here but not used enough to be worth it -+static void restore_tqb_pixels(const HEVCRpiContext * const s, -+ uint8_t *src1, const uint8_t *dst1, -+ const ptrdiff_t stride_src, const ptrdiff_t stride_dst, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int width, const int height, -+ const int c_idx) -+{ -+ if (s->ps.pps->transquant_bypass_enable_flag || -+ s->ps.sps->pcm.loop_filter_disable_flag) -+ { -+ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width; -+ int blks_y = height >> (c_idx == 0 ? 3 : 2); -+ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand -+ const unsigned int bheight = (c_idx == 0) ? 8 : 4; -+ const unsigned int sh = ((x0 >> 3) & 7); -+ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 
3 : 2))) - 1; -+ -+ do { -+ unsigned int m = (*pcm >> sh) & mask; -+ uint8_t * bd = src1; -+ const uint8_t * bs = dst1; -+ while (m != 0) { -+ if ((m & 1) != 0) { -+ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); -+ } -+ m >>= 1; -+ bs += bwidth; -+ bd += bwidth; -+ } -+ src1 += stride_src * bheight; -+ dst1 += stride_dst * bheight; -+ pcm += s->ps.sps->pcm_width; -+ } while (--blks_y > 0); -+ } -+} -+ -+#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) -+ -+static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y) -+{ -+#if SAO_FILTER_N == 5 -+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; -+#elif SAO_FILTER_N == 6 -+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; -+#else -+#error Confused by size of sao fn array -+#endif -+ int c_idx; -+ int edges[4]; // 0 left 1 top 2 right 3 bottom -+ int x_ctb = x >> s->ps.sps->log2_ctb_size; -+ int y_ctb = y >> s->ps.sps->log2_ctb_size; -+ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb; -+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs]; -+ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb); -+ // flags indicating unfilterable edges -+ uint8_t vert_edge[] = { 0, 0 }; -+ uint8_t horiz_edge[] = { 0, 0 }; -+ uint8_t diag_edge[] = { 0, 0, 0, 0 }; -+ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb); -+ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag && -+ !s->ps.pps->loop_filter_across_tiles_enabled_flag; -+ uint8_t restore = no_tile_filter || !lfase; -+ uint8_t left_tile_edge = 0; -+ uint8_t right_tile_edge = 0; -+ uint8_t up_tile_edge = 0; -+ uint8_t bottom_tile_edge = 0; -+ const int sliced = 1; -+ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 
3 : 1); -+ -+ edges[0] = x_ctb == 0; -+ edges[1] = y_ctb == 0; -+ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; -+ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; -+ -+#ifdef DISABLE_SAO -+ return; -+#endif -+ -+ if (restore) { -+ if (!edges[0]) { -+ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -+ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge; -+ } -+ if (!edges[2]) { -+ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]]; -+ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge; -+ } -+ if (!edges[1]) { -+ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]; -+ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge; -+ } -+ if (!edges[3]) { -+ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]]; -+ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge; -+ } -+ if (!edges[0] && !edges[1]) { -+ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge; -+ } -+ if (!edges[1] && !edges[2]) { -+ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge; -+ } -+ if (!edges[2] && !edges[3]) { -+ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge; -+ } -+ if (!edges[0] && !edges[3]) { -+ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge; -+ } -+ } -+ -+ for (c_idx = 0; c_idx < plane_count; c_idx++) { -+ const unsigned int vshift = ctx_vshift(s, c_idx); -+ const unsigned int hshift = ctx_hshift(s, c_idx); -+ const int x0 = x >> hshift; -+ const int y0 = y >> vshift; -+ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx); -+ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift; -+ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift; -+ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0); -+ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0); -+ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; -+ ptrdiff_t stride_dst; -+ uint8_t *dst; -+ -+ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); -+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; -+ uint8_t * const src = !sliced ? -+ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0, y0); -+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : -+ !sliced ? src - (1 << sh) : -+ c_idx == 0 ? 
-+ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); -+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : -+ !sliced ? src + (width << sh) : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); -+ -+ if (sliced && c_idx > 1) { -+ break; -+ } -+ -+// if (c_idx == 1) -+// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr); -+ -+ switch (sao->type_idx[c_idx]) { -+ case SAO_BAND: -+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, -+ x_ctb, y_ctb); -+ if (s->ps.pps->transquant_bypass_enable_flag || -+ s->ps.sps->pcm.loop_filter_disable_flag) -+ { -+ // Can't use the edge buffer here as it may be in use by the foreground -+ DECLARE_ALIGNED(64, uint8_t, dstbuf) -+ [2*MAX_PB_SIZE*MAX_PB_SIZE]; -+ dst = dstbuf; -+ stride_dst = 2*MAX_PB_SIZE; -+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); -+ if (sliced && c_idx != 0) -+ { -+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, -+ sao->offset_val[1], sao->band_position[1], -+ sao->offset_val[2], sao->band_position[2], -+ width, height); -+ } -+ else -+ { -+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, -+ sao->offset_val[c_idx], sao->band_position[c_idx], -+ width, height); -+ } -+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, -+ x, y, width, height, c_idx); -+ } else { -+ if (sliced && c_idx != 0) -+ { -+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, -+ sao->offset_val[1], sao->band_position[1], -+ sao->offset_val[2], sao->band_position[2], -+ width, height); -+ } -+ else -+ { -+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, -+ sao->offset_val[c_idx], sao->band_position[c_idx], -+ width, height); -+ } -+ } -+ sao->type_idx[c_idx] = SAO_APPLIED; -+ break; -+ case SAO_EDGE: -+ { -+ const int w = s->ps.sps->width >> hshift; -+ const int h = s->ps.sps->height >> vshift; -+ int top_edge = edges[1]; -+ int bottom_edge = edges[3]; -+ // Can't use the edge buffer here as it may be in use by the foreground -+ DECLARE_ALIGNED(64, uint8_t, dstbuf) -+ [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64]; -+ -+ stride_dst = RPI_HEVC_SAO_BUF_STRIDE; -+ dst = dstbuf + stride_dst + 32; -+ -+ if (!top_edge) { -+ uint8_t *dst1; -+ int src_idx; -+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); -+ -+ dst1 = dst - stride_dst; -+ -+ if (src_l != NULL) { -+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); -+ } -+ -+ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); -+ -+ if (src_r != NULL) { -+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); -+ } -+ } -+ if (!bottom_edge) { -+ uint8_t * const dst1 = dst + height * stride_dst; -+ int src_idx; -+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); -+ const unsigned int hoff = height * stride_src; -+ -+ if (src_l != NULL) { -+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 - (1 << sh), src_idx ? 
src_spb - (1 << sh) : src_l + hoff, sh); -+ } -+ -+ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); -+ -+ if (src_r != NULL) { -+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); -+ } -+ } -+ if (src_l != NULL) { -+ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { -+ ff_hevc_rpi_copy_vert(dst - (1 << sh), -+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), -+ sh, height, stride_dst, 1 << sh); -+ } else { -+ ff_hevc_rpi_copy_vert(dst - (1 << sh), -+ src_l, -+ sh, height, stride_dst, stride_src); -+ } -+ } -+ if (src_r != NULL) { -+ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { -+ ff_hevc_rpi_copy_vert(dst + (width << sh), -+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), -+ sh, height, stride_dst, 1 << sh); -+ } else { -+ ff_hevc_rpi_copy_vert(dst + (width << sh), -+ src_r, -+ sh, height, stride_dst, stride_src); -+ } -+ } -+ -+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); -+ -+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, -+ x_ctb, y_ctb); -+ if (sliced && c_idx != 0) -+ { -+ // Class always the same for both U & V (which is just as well :-)) -+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, -+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], -+ width, height); -+ s->hevcdsp.sao_edge_restore_c[restore](src, dst, -+ stride_src, stride_dst, -+ sao, -+ edges, width, -+ height, c_idx, -+ vert_edge, -+ horiz_edge, -+ diag_edge); -+ } -+ else -+ { -+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], -+ sao->eo_class[c_idx], width, height); -+ s->hevcdsp.sao_edge_restore[restore](src, dst, -+ stride_src, stride_dst, -+ sao, -+ edges, width, -+ height, c_idx, -+ vert_edge, -+ horiz_edge, -+ diag_edge); -+ } -+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, -+ x, y, width, height, c_idx); -+ sao->type_idx[c_idx] = SAO_APPLIED; -+ break; -+ } -+ } -+ } -+ -+#if RPI_ZC_SAND_8_IN_10_BUF -+ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && -+ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) -+ { -+ const unsigned int stride1 = frame_stride1(s->frame, 1); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); -+ const unsigned int xoff = (x >> 8) * stride2 * stride1; -+ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); -+ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; -+ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; -+ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; -+ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; -+ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); -+ const unsigned int hy = !edges[3] ? 
ctb_size : s->ps.sps->height - y; -+ -+// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); -+ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); -+ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); -+ } -+#endif -+} -+ -+// When bits are delivered to deblock we want them -+//#define TL 1 -+//#define TR 2 -+//#define BL 4 -+//#define BR 8 -+ -+// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br -+// so we need to rearrange before passing on -+ -+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; -+ return (pcm[0] | -+ (pcm[1] << 8) | -+ (pcm[s->ps.sps->pcm_width] << 16) | -+ (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7); -+} -+ -+static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; -+ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); -+} -+ -+// We cast away const here as we want this to work for both get and set -+static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) -+{ -+ return (uint32_t *)(bs + -+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 -+#warning Unexpected masks -+ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes -+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & -+ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + -+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 -+#error Stride1 < return size -+#endif -+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + -+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); -+} -+ -+static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) -+{ -+ return (uint8_t *)(bs + -+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & -+ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + -+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + -+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); -+} -+ -+ -+// Get block strength -+// Given how we call we will always get within the 32bit boundries -+static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2, -+ unsigned int xl, unsigned int xr, const unsigned int y) -+{ -+ if (xr <= xl) { -+ return 0; -+ } -+ else -+ { -+#if HAVE_ARMV6T2_INLINE -+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 -+#error This case not yet handled in bs_get32 -+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 -+#error Stride1 < return size -+#endif -+ uint32_t tmp; -+ __asm__ ( -+ "lsr %[tmp], %[xl], %[xl_shift] \n\t" -+ "rsb %[xr], %[xl], %[xr] \n\t" -+ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t" -+ "add %[xr], %[xr], #7 \n\t" -+ "lsr %[bs], %[y], %[y_shift1] \n\t" -+ "bic %[xr], %[xr], #7 \n\t" -+ "ubfx %[xl], %[xl], #1, #5 \n\t" -+ "lsr %[xr], %[xr], #1 \n\t" -+ "cmp %[xr], #32 \n\t" -+ "mvn %[tmp], #0 \n\t" -+ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" -+ "lsl %[tmp], %[tmp], %[xr] \n\t" -+ "lsr %[xl], %[bs], %[xl] \n\t" -+ "it ne \n\t" -+ "bicne %[bs], %[xl], %[tmp] \n\t" -+ : // Outputs -+ [bs]"+r"(bs), -+ [stride2]"+r"(stride2), -+ [xl]"+r"(xl), -+ [xr]"+r"(xr), -+ 
[tmp]"=&r"(tmp) -+ : // Inputs -+ [y]"r"(y), -+ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT), -+ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR), -+ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+ : // Clobbers -+ "cc" -+ ); -+ return (uint32_t) bs; -+#else -+ const uint32_t a = *bs_ptr32(bs, stride2, xl, y); -+ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; -+ -+ return n == 32 ? a : -+ (a >> ((xl >> 1) & 31)) & ~(~0U << n); -+#endif -+ } -+} -+ -+static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) -+{ -+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); -+ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); -+} -+ -+static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) -+{ -+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); -+ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); -+} -+ -+ -+static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ const unsigned int cb_r = FFMIN(bounds.x + bounds.w, s->ps.sps->width) - (end_x ? 0 : 1); -+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; -+ const DBParams * cb_dbp = s->deblock + ctb_n; -+ const unsigned int b_b = FFMIN(bounds.y + bounds.h, s->ps.sps->height) - (end_y ? 0 : 8); -+ -+ unsigned int cb_x; -+ -+ // Do in CTB-shaped blocks -+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp) -+ { -+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); -+ const unsigned int bv_l = FFMAX(cb_x, 8); -+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9; -+ const unsigned int bh_l = bv_l - 8; -+ unsigned int y; -+ -+ // Main body -+ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) -+ { -+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); -+ -+ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp; -+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ -+ if (vbs != 0) -+ { -+ const uint8_t * const tcv = tctable + dbp->tc_offset; -+ const uint8_t * const betav = betatable + dbp->beta_offset; -+ unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+ unsigned int x; -+ -+ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) -+ { -+ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) -+ { -+ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betav[qp], -+ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | -+ (((vbs & 0xc) == 0 ? 
0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), -+ pcmfa & 3, -+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); -+ } -+ } -+ } -+ -+ if (y != 0) -+ { -+ uint32_t hbs; -+ -+ // H left - mostly separated out so we only need a uint32_t hbs -+ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) -+ { -+ const unsigned int x = bh_l; -+ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const DBParams * const dbph = dbp - 1; -+ const uint8_t * const tc = tctable + dbph->tc_offset + qp; -+ -+ av_assert2(cb_x - bh_l == 8); -+ -+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betatable[qp + dbph->beta_offset], -+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | -+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), -+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); -+ } -+ -+ // H -+ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop -+ { -+ unsigned int x; -+ unsigned int pcmfa = pcm4(s, cb_x, y - 1); -+ -+ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) -+ { -+ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) -+ { -+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + dbp->tc_offset + qp; -+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betatable[qp + dbp->beta_offset], -+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | -+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), -+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); -+ } -+ } -+ } -+ } -+ -+ } -+ } -+} -+ -+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1; -+} -+ -+static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ const unsigned int cb_r = FFMIN(bounds.x + bounds.w, s->ps.sps->width) - (end_x ? 0 : 8); -+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; -+ const DBParams * dbp = s->deblock + ctb_n; -+ const unsigned int b_b = FFMIN(bounds.y + bounds.h, s->ps.sps->height) - (end_y ? 
0 : 8); -+ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; -+ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; -+ -+ unsigned int cb_x; -+ -+ av_assert1((bounds.x & (ctb_size - 1)) == 0); -+ av_assert1((bounds.y & (ctb_size - 1)) == 0); -+ av_assert1(bounds.h <= ctb_size); -+ -+ // Do in CTB-shaped blocks -+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) { -+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); -+ const unsigned int bv_l = FFMAX(cb_x, 16); -+ unsigned int y; -+ -+ // V above -+ if (bounds.y != 0) { -+ // Deblock V up 8 -+ // CTB above current -+ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm -+ const unsigned int y = bounds.y - 8; -+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; -+ -+ if (vbs != 0) -+ { -+ unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; -+ unsigned int x; -+ -+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) -+ { -+ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) -+ { -+ const int qp0 = q2h(s, x, y); -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ pcmfa & 3); -+ } -+ } -+ } -+ } -+ -+ for (y = bounds.y; y < b_b; y += 16) -+ { -+ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | -+ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); -+ -+ // V -+ if (vbs != 0) -+ { -+ unsigned int x; -+ unsigned int pcmfa = -+ (y + 16 > b_b ? -+ pcm2(s, bv_l - 1, y) | 0xffff0000 : -+ pcm4(s, bv_l - 1, y)); -+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; -+ -+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) -+ { -+ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) -+ { -+ const int qp0 = q2h(s, x, y); -+ const int qp1 = q2h(s, x, y + 8); -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); -+ } -+ } -+ } -+ -+ // H -+ if (y != 0) -+ { -+ uint32_t hbs; -+ const unsigned int bh_l = bv_l - 16; -+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; -+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ -+ // H left - mostly separated out so we only need a uint32_t hbs -+ // Stub is width 8 to the left of bounds, but width 16 internally -+ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) -+ { -+ unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ -+ // Chop off bits we don't want... 
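-+ // (when bh_l < bounds.x the left 8 pels of this 16-wide stub lie
-+ // outside the current bounds: flag them as PCM (TL|BL) and clear
-+ // their strength bits so the filter call below only touches the
-+ // in-bounds half)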
-+ if (bh_l < bounds.x) {
-+ pcmfa |= 0x10001; // TL|BL pre rearrangement
-+ hbs &= ~3; // Make BS 0
-+ }
-+
-+ // Double check we still want this
-+ if (hbs != 0 && (~pcmfa & 0x30003) != 0)
-+ {
-+ const unsigned int x = bh_l;
-+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
-+ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;
-+
-+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+ frame_stride1(s->frame, 1),
-+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
-+ }
-+ }
-+
-+ // H main
-+ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
-+ {
-+ unsigned int x;
-+ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it
-+
-+ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
-+ {
-+ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
-+ {
-+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
-+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
-+
-+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+ frame_stride1(s->frame, 1),
-+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
-+ }
-+ }
-+ }
-+ }
-+ }
-+ }
-+}
-+
-+static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
-+{
-+ return x & ~(~0U << log2_n);
-+}
-+
-+static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
-+{
-+ av_assert2((y & 7) == 0);
-+
-+ // This doesn't have the same simultaneous update issues that bsf_stash
-+ // does (other threads will have a different y) so we can do it the easy way
-+ if ((bsf &= mask) != 0)
-+ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31);
-+}
-+
-+
-+static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
-+{
-+ // We arrange this in a slightly odd fashion but it lines up with
-+ // how we are going to use it in the actual deblock code & it is easier
-+ // to do the contortions here than there
-+ //
-+ // Arrange (LE) {x0y0, x0y4, x8y0, x8y4}, {x16y0, x16y4, x24y0, x24y4},...
-+
-+ av_assert2((x & 7) == 0);
-+
-+ if ((bsf &= mask) != 0)
-+ {
-+ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
-+ const unsigned int sh = ((x & 8) | (y & 4)) >> 1;
-+
-+ if (mask <= 0xf)
-+ {
-+ *p |= (bsf << sh);
-+ }
-+ else
-+ {
-+ do {
-+ *p |= (bsf & 0xf) << sh;
-+ p += HEVC_RPI_BS_STRIDE1_BYTES;
-+ } while ((bsf >>= 4) != 0);
-+ }
-+ }
-+}
-+
-+static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
-+ const unsigned int rep, const unsigned int dup,
-+ const unsigned int mvf_stride0,
-+ const unsigned int mvf_stride1,
-+ const RefPicList * const rpl_p, const RefPicList * const rpl_q,
-+ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
-+{
-+ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
-+ mvf_p, mvf_q,
-+ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list,
-+ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1);
-+}
-+
-+
-+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
-+ const HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int log2_trafo_size,
-+ const int is_coded_block)
-+{
-+ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0);
-+ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
-+ const RefPicList * const rpl = s->refPicList;
-+ // Rep count for bsf_mv when running with min_pu chunks
-+ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
-+ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
-+ const unsigned int trafo_size = (1U << log2_trafo_size);
-+ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1;
-+ const uint32_t bsf_cbf = (bsf_mask & 0x55555555);
-+
-+ // Do we cover a pred split line?
-+ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
-+ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
-+
-+ uint32_t bsf_h;
-+ uint32_t bsf_v;
-+
-+#ifdef DISABLE_STRENGTHS
-+ return;
-+#endif
-+
-+ // We are always on a size boundary
-+ av_assert2((x0 & (trafo_size - 1)) == 0);
-+ av_assert2((y0 & (trafo_size - 1)) == 0);
-+ // log2_trafo_size not really a transform size; we may have to deal
-+ // with size 2^6 blocks
-+ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
-+
-+ // Retrieve and update coded (b0), intra (b1) bs flags
-+ //
-+ // Store on min width (rather than uint32_t) to avoid possible issues
-+ // with another thread on another core running wpp using the same
-+ // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
-+ //
-+ // In bsf BS=2 is represented by 3 as it is much easier to test & set
-+ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
-+ // 3 will work the same
-+ {
-+ // Given where we are called from is_cbf_luma & is_intra will be constant over the block
-+ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ?
bsf_cbf : 0;
-+ uint8_t *const p = s->bsf_stash_up + (x0 >> 4);
-+ uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
-+
-+ switch (log2_trafo_size)
-+ {
-+ case 2:
-+ case 3:
-+ {
-+ const unsigned int sh_h = (x0 >> 1) & 7;
-+ const unsigned int sh_v = (y0 >> 1) & 7;
-+ bsf_h = *p;
-+ bsf_v = *q;
-+ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h);
-+ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
-+ bsf_h >>= sh_h;
-+ bsf_v >>= sh_v;
-+ break;
-+ }
-+ case 4:
-+ bsf_h = *p;
-+ bsf_v = *q;
-+ *p = bsf0;
-+ *q = bsf0;
-+ break;
-+ case 5:
-+ bsf_h = *(uint16_t *)p;
-+ bsf_v = *(uint16_t *)q;
-+ *(uint16_t *)p = bsf0;
-+ *(uint16_t *)q = bsf0;
-+ break;
-+ case 6:
-+ default:
-+ bsf_h = *(uint32_t *)p;
-+ bsf_v = *(uint32_t *)q;
-+ *(uint32_t *)p = bsf0;
-+ *(uint32_t *)q = bsf0;
-+ break;
-+ }
-+
-+ bsf_h |= bsf0;
-+ bsf_v |= bsf0;
-+ }
-+
-+ // Do Horizontal
-+ if ((y0 & 7) == 0)
-+ {
-+ // Boundary upper
-+ if (y0 != 0 &&
-+ (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
-+ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
-+ {
-+ // Look at MVs (BS=1) if we don't already have a full set of bs bits
-+ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
-+ {
-+ // If we aren't on the top boundary we must be in the middle
-+ // and in that case we know where mvf can change
-+ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
-+ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
-+ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
-+ rpl;
-+
-+ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+ trafo_size >> (log2_min_pu_size + log2_rep),
-+ trafo_size >> (log2_min_pu_size + log2_rep),
-+ rpl, rpl_top,
-+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
-+ }
-+
-+ // Finally put the results into bs
-+ hbs_set(s, x0, y0, bsf_mask, bsf_h);
-+ }
-+
-+ // Max of 1 pu internal split - ignore if not on 8pel boundary
-+ if (has_y_split && !off_boundary(lc->cu.y_split, 3))
-+ {
-+ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
-+ // If we have the x split as well then it must be in the middle
-+ const unsigned int log2_rep = has_x_split ? 1 : 0;
-+
-+ hbs_set(s, x0, lc->cu.y_split, bsf_mask,
-+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+ trafo_size >> (log2_min_pu_size + log2_rep),
-+ trafo_size >> (log2_min_pu_size + log2_rep),
-+ rpl, rpl,
-+ mvf, mvf - MVF_STASH_WIDTH_PU));
-+ }
-+ }
-+
-+ // And again for vertical - same logic as horizontal just in the other direction
-+ if ((x0 & 7) == 0)
-+ {
-+ // Boundary left
-+ if (x0 != 0 &&
-+ (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
-+ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
-+ {
-+ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
-+ {
-+ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
-+ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
-+ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] : -+ rpl; -+ -+ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ rpl, rpl_left, -+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0)); -+ } -+ -+ vbs_set(s, x0, y0, bsf_mask, bsf_v); -+ } -+ -+ if (has_x_split && !off_boundary(lc->cu.x_split, 3)) -+ { -+ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0); -+ const unsigned int log2_rep = has_y_split ? 1 : 0; -+ -+ vbs_set(s, lc->cu.x_split, y0, bsf_mask, -+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ rpl, rpl, -+ mvf, mvf - 1)); -+ } -+ } -+} -+ -+#undef LUMA -+#undef CB -+#undef CR -+ -+static inline unsigned int ussub(const unsigned int a, const unsigned int b) -+{ -+ return a < b ? 0 : a - b; -+} -+ -+static inline int cache_boundry(const AVFrame * const frame, const unsigned int x) -+{ -+ return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0; -+} -+ -+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot) -+{ -+ const int ctb_size = (1 << s->ps.sps->log2_ctb_size); -+ int x, y; -+ -+ const unsigned int br = FFMIN(bounds.x + bounds.w, s->ps.sps->width); -+ const unsigned int bb = FFMIN(bounds.y + bounds.h, s->ps.sps->height); -+ -+ const int x_end = (br >= s->ps.sps->width); -+ const int y_end = (bb >= s->ps.sps->height); -+ -+ // Deblock may not touch the edges of the bound as they are still needed -+ // for Intra pred -+ // -+ // Deblock is disabled with a per-slice flag -+ // Given that bounds may cover multiple slices & we dblock outside bounds -+ // anyway we can't avoid deblock using that flag - about the only thing we -+ // could do is have a "no deblock seen yet" flag but it doesn't really -+ // seem worth the effort -+ -+ deblock_y_blk(s, bounds, x_end, y_end); -+ deblock_uv_blk(s, bounds, x_end, y_end); -+ -+ // SAO needs -+ // (a) CTB alignment -+ // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel -+ { -+ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1)); -+ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1)); -+ const unsigned int yt = ussub(bounds.y, yo); -+ const unsigned int yb = y_end ? bb : ussub(bb, yo); -+ const unsigned int xl = ussub(bounds.x, xo); -+ const unsigned int xr = x_end ? br : ussub(br, xo); -+ -+ if (s->ps.sps->sao_enabled) -+ { -+ for (y = yt; y < yb; y += ctb_size) { -+ for (x = xl; x < xr; x += ctb_size) { -+ sao_filter_CTB(s, x, y); -+ } -+ } -+ } -+ -+ // Cache invalidate -+ y = 0; -+ if (xr != 0 && yb != 0) -+ { -+ const unsigned int llen = -+ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame)); -+ const unsigned int mask = ~(llen - 1); -+ const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask; -+ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask; -+ const unsigned int it = ussub(yt, 1); -+ const unsigned int ib = y_end ? 
bb : yb - 1;
-+
-+ if (il < ir) {
-+ rpi_cache_buf_t cbuf;
-+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
-+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
-+ il, it, ir - il, ib - it,
-+ ctx_vshift(s, 1), 1, 1);
-+
-+ // If we have to commit the right hand tile boundary due to
-+ // cache boundary considerations then at EoTile we must commit
-+ // that boundary to bottom of tile (bounds)
-+ if (ib != bb && ir == br && eot) {
-+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
-+ br - 1, ib, 1, bb - ib,
-+ ctx_vshift(s, 1), 1, 1);
-+ }
-+
-+ rpi_cache_flush_finish(rfe);
-+
-+ if (x_end)
-+ y = y_end ? INT_MAX : ib;
-+
-+// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
-+ }
-+ }
-+ }
-+
-+ return y;
-+}
-+
-diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h
-new file mode 100644
-index 0000000000..6b36f5e737
---- /dev/null
-+++ b/libavcodec/rpi_hevc_mv.h
-@@ -0,0 +1,71 @@
-+#ifndef AVCODEC_RPI_HEVC_MV_H
-+#define AVCODEC_RPI_HEVC_MV_H
-+
-+#include "config.h"
-+
-+typedef int32_t MvXY;
-+
-+typedef struct HEVCRpiMvField {
-+ MvXY xy[2];
-+ int8_t ref_idx[2];
-+ int8_t pred_flag;
-+ int8_t dummy; // To 12 bytes
-+} HEVCRpiMvField;
-+
-+
-+#define MV_X(xy) (((xy) << 16) >> 16)
-+#define MV_Y(xy) ((xy) >> 16)
-+#define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16))
-+
-+#if ARCH_ARM
-+#include "arm/rpi_hevc_mv_arm.h"
-+#endif
-+
-+#ifndef mvxy_add
-+static inline MvXY mvxy_add(const MvXY a, const MvXY b)
-+{
-+ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b));
-+}
-+#endif
-+
-+
-+#ifndef mv_scale_xy
-+static inline MvXY mv_scale_xy(const MvXY src, int td, int tb)
-+{
-+ int tx, scale_factor;
-+
-+ td = td == 0 ? 1 : av_clip_int8(td);
-+ tb = av_clip_int8(tb);
-+ tx = (0x4000 + (abs(td) >> 1)) / td;
-+ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
-+ return MV_XY(
-+ av_clip_int16((scale_factor * MV_X(src) + 127 +
-+ (scale_factor * MV_X(src) < 0)) >> 8),
-+ av_clip_int16((scale_factor * MV_Y(src) + 127 +
-+ (scale_factor * MV_Y(src) < 0)) >> 8));
-+}
-+#endif
-+
-+// 8.3.1 states that the bitstream may not contain poc diffs that do not
-+// fit in 16 bits, so given that we don't care about the high bits we only
-+// store the low 16 + LT & Inter flags
-+
-+#define COL_POC_INTRA 0
-+#define COL_POC_INTER (1 << 16)
-+#define COL_POC_LT (1 << 17)
-+#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y)))
-+#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
-+#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
-+
-+typedef struct ColMv_s {
-+ int32_t poc;
-+ int32_t xy;
-+} ColMv;
-+
-+typedef struct ColMvField_s {
-+ ColMv L[2];
-+} ColMvField;
-+
-+
-+
-+#endif // AVCODEC_RPI_HEVC_MV_H
-diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
-new file mode 100644
-index 0000000000..221755fb6e
---- /dev/null
-+++ b/libavcodec/rpi_hevc_mvs.c
-@@ -0,0 +1,486 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 Anand Meher Kotra
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+
-+static av_always_inline int
-+is_eq_mer(const unsigned int plevel,
-+ const unsigned int xN, const unsigned int yN,
-+ const unsigned int xP, const unsigned int yP)
-+{
-+ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0;
-+}
-+
-+// Check if the MVs and ref_idx are the same between A and B
-+static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
-+{
-+ return a->pred_flag == b->pred_flag &&
-+ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
-+ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
-+}
-+
-+/*
-+ * 8.5.3.1.7 temporal luma motion vector prediction
-+ */
-+static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
-+ const HEVCRpiLocalContext * const lc, const int x0, const int y0,
-+ const int nPbW, const int nPbH, const int refIdxLx,
-+ MvXY * const mvLXCol, const int X)
-+{
-+ int x, y;
-+ const ColMv * cmv = NULL;
-+
-+ HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
-+ const RefPicList * const refPicList = s->refPicList + X;
-+ const int cur_lt = refPicList->isLongTerm[refIdxLx];
-+
-+ *mvLXCol = 0;
-+ // Unlikely but we might have a col_ref IDR frame!
-+ if (col_ref->col_mvf == NULL)
-+ return 0;
-+
-+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH);
-+
-+ //bottom right collocated motion vector
-+ x = x0 + nPbW;
-+ y = y0 + nPbH;
-+
-+ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
-+ y < s->ps.sps->height &&
-+ x < s->ps.sps->width)
-+ {
-+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
-+ (y >> 4) * s->col_mvf_stride;
-+
-+ if (col->L[0].poc != COL_POC_INTRA &&
-+ (col->L[1].poc == COL_POC_INTRA ||
-+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
-+ {
-+ cmv = col->L + 0;
-+ }
-+ else if (col->L[1].poc != COL_POC_INTRA)
-+ {
-+ cmv = col->L + 1;
-+ }
-+ }
-+
-+ // derive center collocated motion vector
-+ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt)
-+ {
-+ cmv = NULL;
-+ x = x0 + (nPbW >> 1);
-+ y = y0 + (nPbH >> 1);
-+
-+ {
-+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
-+ (y >> 4) * s->col_mvf_stride;
-+
-+ if (col->L[0].poc != COL_POC_INTRA &&
-+ (col->L[1].poc == COL_POC_INTRA ||
-+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
-+ {
-+ cmv = col->L + 0;
-+ }
-+ else if (col->L[1].poc != COL_POC_INTRA)
-+ {
-+ cmv = col->L + 1;
-+ }
-+ }
-+ }
-+
-+ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
-+ return 0;
-+
-+ {
-+ const int col_poc = col_ref->poc;
-+ const int ref_poc = refPicList->list[refIdxLx];
-+
-+ *mvLXCol = (cur_lt ||
-+ cmv->poc == col_poc ||
-+ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ?
-+ cmv->xy : -+ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc); -+ } -+ -+ return cmv != NULL; -+} -+ -+static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) -+{ -+ return b != NULL && compare_mv_ref_idx(a, b); -+} -+ -+ -+ -+/* -+ * 8.5.3.1.2 Derivation process for spatial merging candidates -+ */ -+static inline const HEVCRpiMvField * -+derive_spatial_merge_candidates( -+ const HEVCRpiContext * const s, -+ const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ const unsigned int part_idx, -+ const unsigned int merge_idx, -+ HEVCRpiMvField * const mvf_t) -+{ -+ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); -+ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); -+ -+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); -+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); -+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); -+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; -+ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level; -+ const unsigned int part_mode = lc->cu.part_mode; -+ -+ const HEVCRpiMvField * perm[4]; -+ unsigned int nb_merge_cand = 0; -+ -+ // singleMCLFlag => part_idx == 0 so no need to test for it -+ if ((avail & AVAIL_L) == 0 || -+ (part_idx == 1 && -+ ((parts_a1 >> part_mode) & 1) != 0 || -+ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || -+ mvf_a1->pred_flag == PF_INTRA) -+ { -+ mvf_a1 = NULL; -+ } -+ else -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_a1; -+ perm[nb_merge_cand++] = mvf_a1; -+ } -+ -+ if ((avail & AVAIL_U) == 0 || -+ (part_idx == 1 && -+ ((parts_b1 >> part_mode) & 1) != 0 || -+ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || -+ mvf_b1->pred_flag == PF_INTRA) -+ { -+ mvf_b1 = NULL; -+ } -+ else if (!mvf_eq(mvf_b1, mvf_a1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b1; -+ perm[nb_merge_cand++] = mvf_b1; -+ } -+ -+ // above right spatial merge candidate -+ // Never need mvf_b0 again so don't bother zeroing if navail -+ if ((avail & AVAIL_UR) != 0 && -+ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) && -+ mvf_b0->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_b0, mvf_b1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b0; -+ perm[nb_merge_cand++] = mvf_b0; -+ } -+ -+ // left bottom spatial merge candidate -+ // Never need mvf_a0 again so don't bother zeroing if navail -+ if ((avail & AVAIL_DL) != 0 && -+ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) && -+ mvf_a0->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_a0, mvf_a1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_a0; -+ perm[nb_merge_cand++] = mvf_a0; -+ } -+ -+ // above left spatial merge candidate -+ if (nb_merge_cand != 4 && -+ (avail & AVAIL_UL) != 0 && -+ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) -+ { -+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL -+ -+ if (mvf_b2->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_b2, mvf_a1) && -+ !mvf_eq(mvf_b2, mvf_b1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b2; -+ perm[nb_merge_cand++] = mvf_b2; -+ } -+ } -+ -+ // temporal motion vector candidate -+ if (s->sh.slice_temporal_mvp_enabled_flag) -+ { -+ static const HEVCRpiMvField mvf_z = {{0}}; -+ -+ *mvf_t = mvf_z; -+ -+ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, -+ 0, mvf_t->xy + 0, 0)) -+ 
mvf_t->pred_flag = PF_L0;
-+
-+ if (s->sh.slice_type == HEVC_SLICE_B &&
-+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
-+ 0, mvf_t->xy + 1, 1))
-+ mvf_t->pred_flag |= PF_L1;
-+
-+ if (mvf_t->pred_flag != 0)
-+ {
-+ if (merge_idx == nb_merge_cand)
-+ return mvf_t;
-+ perm[nb_merge_cand++] = mvf_t;
-+ }
-+ }
-+
-+ // combined bi-predictive merge candidates (applies for B slices)
-+ if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
-+ {
-+ unsigned int comb_idx = 0;
-+ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1);
-+ const RefPicList * const refPicList = s->refPicList;
-+
-+ for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
-+ {
-+ static const uint8_t l0_l1_cand_idx[12][2] = {
-+ { 0, 1, },
-+ { 1, 0, },
-+ { 0, 2, },
-+ { 2, 0, },
-+ { 1, 2, },
-+ { 2, 1, },
-+ { 0, 3, },
-+ { 3, 0, },
-+ { 1, 3, },
-+ { 3, 1, },
-+ { 2, 3, },
-+ { 3, 2, },
-+ };
-+
-+ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
-+ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
-+ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
-+ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
-+
-+ if ((mvf_c0->pred_flag & PF_L0) != 0 &&
-+ (mvf_c1->pred_flag & PF_L1) != 0 &&
-+ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
-+ mvf_c0->xy[0] != mvf_c1->xy[1]))
-+ {
-+ if (merge_idx == nb_merge_cand++)
-+ {
-+ // Need to be a bit careful as we will construct mvf_t and we
-+ // may already be using that as one of our candidates
-+ // so build & copy rather than build in place
-+ const HEVCRpiMvField mvf_m = {
-+ .xy = {
-+ mvf_c0->xy[0],
-+ mvf_c1->xy[1]},
-+ .ref_idx = {
-+ mvf_c0->ref_idx[0],
-+ mvf_c1->ref_idx[1]},
-+ .pred_flag = PF_BI
-+ };
-+ *mvf_t = mvf_m;
-+ return mvf_t;
-+ }
-+ }
-+ }
-+ }
-+
-+ // "append" Zero motion vector candidates
-+ {
-+ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
-+ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
-+ const unsigned int zero_idx = merge_idx - nb_merge_cand;
-+
-+ const HEVCRpiMvField mvf_m = {
-+ .xy = {0, 0},
-+ .ref_idx = {
-+ zero_idx < nb_refs ? zero_idx : 0,
-+ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
-+ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
-+ };
-+
-+ *mvf_t = mvf_m;
-+ return mvf_t;
-+ }
-+}
-+
-+
-+// 8.5.3.1.1 Derivation process of luma Mvs for merge mode
-+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
-+ int nPbH, int log2_cb_size, int part_idx,
-+ int merge_idx, HEVCRpiMvField * const mv)
-+{
-+ const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ?
-+ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8, -+ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8), -+ 0, merge_idx, mv) : -+ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, -+ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), -+ part_idx, merge_idx, mv); -+ -+ if (mvf_m != mv) -+ *mv = *mvf_m; -+ -+ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) -+ mv->pred_flag = PF_L0; -+} -+ -+ -+static av_always_inline const MvXY * -+mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf) -+{ -+ if (mvf != NULL) -+ { -+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) -+ return mvf->xy + pfi0; -+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) -+ return mvf->xy + pfi1; -+ } -+ return NULL; -+} -+ -+static av_always_inline const MvXY * -+mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, -+ const int islt0, const int poc0, const int poc_cur, -+ MvXY * const mv_t, const HEVCRpiMvField * const mvf) -+{ -+ if (mvf != NULL) -+ { -+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0) -+ { -+ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]]; -+ if (islt0 || poc1 == poc0) { -+ return mvf->xy + pfi0; -+ } -+ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0); -+ return mv_t; -+ } -+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0) -+ { -+ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]]; -+ if (islt0 || poc1 == poc0) { -+ return mvf->xy + pfi1; -+ } -+ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0); -+ return mv_t; -+ } -+ } -+ return NULL; -+} -+ -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ HEVCRpiMvField * const mv, -+ const unsigned int mvp_lx_flag, const unsigned int LX) -+{ -+ const unsigned int pfi0 = LX; -+ const unsigned int pfi1 = LX == 0 ? 
1 : 0; -+ const RefPicList * const rpl = s->refPicList; -+ const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; -+ const int poc_cur = s->poc; -+ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]]; -+ -+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); -+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); -+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL -+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); -+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; -+ const MvXY * mva = NULL; -+ const MvXY * mvb; -+ MvXY * const mv_rv = mv->xy + LX; -+ MvXY mvt_a, mvt_b; -+ -+ *mv_rv = 0; -+ -+ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA) -+ mvf_a0 = NULL; -+ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0) -+ goto use_mva; -+ -+ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA) -+ mvf_a1 = NULL; -+ -+ if (mva == NULL && -+ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL && -+ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL) -+ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); -+ -+ if (mvp_lx_flag == 0 && mva != NULL) -+ goto use_mva; -+ -+ if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA) -+ mvf_b0 = NULL; -+ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA) -+ mvf_b1 = NULL; -+ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA) -+ mvf_b2 = NULL; -+ -+ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL && -+ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL) -+ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2); -+ -+ if (mvf_a0 == NULL && mvf_a1 == NULL) { -+ mva = mvb; -+ if (mvp_lx_flag == 0 && mva != NULL) -+ goto use_mva; -+ -+ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL && -+ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL) -+ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2); -+ } -+ -+ if (mva == NULL) { -+ mva = mvb; -+ mvb = NULL; -+ } -+ -+ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B -+ mvb = NULL; -+ -+ if (mvp_lx_flag == 0 && mva != NULL) { -+ goto use_mva; -+ } -+ else if (mvp_lx_flag != 0 && mvb != NULL) { -+ *mv_rv = *mvb; -+ } -+ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { -+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, -+ nPbH, mv->ref_idx[LX], -+ mv_rv, LX); -+ } -+ return; -+ -+use_mva: -+ *mv_rv = *mva; -+ return; -+} -+ -diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c -new file mode 100644 -index 0000000000..36affa9afa ---- /dev/null -+++ b/libavcodec/rpi_hevc_parse.c -@@ -0,0 +1,142 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "bytestream.h" -+#include "h2645_parse.h" -+#include "hevc.h" -+#include "rpi_hevc_parse.h" -+ -+static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps, -+ HEVCSEIContext *sei, int is_nalff, int nal_length_size, -+ int err_recognition, int apply_defdispwin, void *logctx) -+{ -+ int i; -+ int ret = 0; -+ H2645Packet pkt = { 0 }; -+ -+ ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff, nal_length_size, AV_CODEC_ID_HEVC, 1, 0); -+ if (ret < 0) { -+ goto done; -+ } -+ -+ for (i = 0; i < pkt.nb_nals; i++) { -+ H2645NAL *nal = &pkt.nals[i]; -+ -+ /* ignore everything except parameter sets and VCL NALUs */ -+ switch (nal->type) { -+ case HEVC_NAL_VPS: -+ ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_SPS: -+ ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_PPS: -+ ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_SEI_PREFIX: -+ case HEVC_NAL_SEI_SUFFIX: -+ ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type); -+ if (ret < 0) -+ goto done; -+ break; -+ default: -+ av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type); -+ break; -+ } -+ } -+ -+done: -+ ff_h2645_packet_uninit(&pkt); -+ if (err_recognition & AV_EF_EXPLODE) -+ return ret; -+ -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, -+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, -+ int err_recognition, int apply_defdispwin, void *logctx) -+{ -+ int ret = 0; -+ GetByteContext gb; -+ -+ bytestream2_init(&gb, data, size); -+ -+ if (size > 3 && (data[0] || data[1] || data[2] > 1)) { -+ /* It seems the extradata is encoded as hvcC format. -+ * Temporarily, we support configurationVersion==0 until 14496-15 3rd -+ * is finalized. When finalized, configurationVersion will be 1 and we -+ * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */ -+ int i, j, num_arrays, nal_len_size; -+ -+ *is_nalff = 1; -+ -+ bytestream2_skip(&gb, 21); -+ nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; -+ num_arrays = bytestream2_get_byte(&gb); -+ -+ /* nal units in the hvcC always have length coded with 2 bytes, -+ * so put a fake nal_length_size = 2 while parsing them */ -+ *nal_length_size = 2; -+ -+ /* Decode nal units from hvcC. 
*/
-+ for (i = 0; i < num_arrays; i++) {
-+ int type = bytestream2_get_byte(&gb) & 0x3f;
-+ int cnt = bytestream2_get_be16(&gb);
-+
-+ for (j = 0; j < cnt; j++) {
-+ // +2 for the nal size field
-+ int nalsize = bytestream2_peek_be16(&gb) + 2;
-+ if (bytestream2_get_bytes_left(&gb) < nalsize) {
-+ av_log(logctx, AV_LOG_ERROR,
-+ "Invalid NAL unit size in extradata.\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
-+ *nal_length_size, err_recognition, apply_defdispwin,
-+ logctx);
-+ if (ret < 0) {
-+ av_log(logctx, AV_LOG_ERROR,
-+ "Decoding nal unit %d %d from hvcC failed\n",
-+ type, i);
-+ return ret;
-+ }
-+ bytestream2_skip(&gb, nalsize);
-+ }
-+ }
-+
-+ /* Now store the right NAL length size, which will be used to parse
-+ * all other NALs */
-+ *nal_length_size = nal_len_size;
-+ } else {
-+ *is_nalff = 0;
-+ ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
-+ err_recognition, apply_defdispwin, logctx);
-+ if (ret < 0)
-+ return ret;
-+ }
-+
-+ return ret;
-+}
-diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h
-new file mode 100644
-index 0000000000..4b4d032a16
---- /dev/null
-+++ b/libavcodec/rpi_hevc_parse.h
-@@ -0,0 +1,36 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * H.265 parser code
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_PARSE_H
-+#define AVCODEC_RPI_HEVC_PARSE_H
-+
-+#include <stdint.h>
-+
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+
-+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
-+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
-+ int err_recognition, int apply_defdispwin, void *logctx);
-+
-+#endif /* AVCODEC_RPI_HEVC_PARSE_H */
-diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
-new file mode 100644
-index 0000000000..891e3a900c
---- /dev/null
-+++ b/libavcodec/rpi_hevc_ps.c
-@@ -0,0 +1,1936 @@
-+/*
-+ * HEVC Parameter Set decoding
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Mickael Raulet
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2013 Vittorio Giovara
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/imgutils.h" -+#include "golomb.h" -+#include "rpi_hevc_data.h" -+#include "rpi_hevc_ps.h" -+#include "rpi_hevcdec.h" -+ -+static const uint8_t default_scaling_list_intra[] = { -+ 16, 16, 16, 16, 17, 18, 21, 24, -+ 16, 16, 16, 16, 17, 19, 22, 25, -+ 16, 16, 17, 18, 20, 22, 25, 29, -+ 16, 16, 18, 21, 24, 27, 31, 36, -+ 17, 17, 20, 24, 30, 35, 41, 47, -+ 18, 19, 22, 27, 35, 44, 54, 65, -+ 21, 22, 25, 31, 41, 54, 70, 88, -+ 24, 25, 29, 36, 47, 65, 88, 115 -+}; -+ -+static const uint8_t default_scaling_list_inter[] = { -+ 16, 16, 16, 16, 17, 18, 20, 24, -+ 16, 16, 16, 17, 18, 20, 24, 25, -+ 16, 16, 17, 18, 20, 24, 25, 28, -+ 16, 17, 18, 20, 24, 25, 28, 33, -+ 17, 18, 20, 24, 25, 28, 33, 41, -+ 18, 20, 24, 25, 28, 33, 41, 54, -+ 20, 24, 25, 28, 33, 41, 54, 71, -+ 24, 25, 28, 33, 41, 54, 71, 91 -+}; -+ -+static const AVRational vui_sar[] = { -+ { 0, 1 }, -+ { 1, 1 }, -+ { 12, 11 }, -+ { 10, 11 }, -+ { 16, 11 }, -+ { 40, 33 }, -+ { 24, 11 }, -+ { 20, 11 }, -+ { 32, 11 }, -+ { 80, 33 }, -+ { 18, 11 }, -+ { 15, 11 }, -+ { 64, 33 }, -+ { 160, 99 }, -+ { 4, 3 }, -+ { 3, 2 }, -+ { 2, 1 }, -+}; -+ -+ -+// pps_cb_qp_offset: -12,+12 -+// slice_cb_qp_offset: -12,+12 also -+// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive." -+// cr_qp_offset_list[n]: -12,+12 -+// So worst case total offset: -24,+24 -+ -+#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6) -+#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n)) -+#define M(B,n) C(B,(-n)) -+ -+// Sizeof the QP_START_BLOCK -+#define QP_OFFSET_0 (8*6 + 12*2) -+#define QP_START(B) \ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+\ -+ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\ -+ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\ -+ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\ -+ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\ -+ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\ -+ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\ -+ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\ -+ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1) -+#define QP_END(B) \ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51) -+ -+#define T1(B)\ -+{\ -+ QP_START(B),\ -+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ -+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ -+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ -+ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\ -+ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\ -+ C(B,44), C(B,45),\ -+ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\ -+ QP_END(B)\ -+} -+#define T0(B)\ -+{\ -+ QP_START(B),\ -+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ -+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), 
C(B,18), C(B,19),\ -+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ -+ C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\ -+ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\ -+ C(B,50), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ QP_END(B)\ -+} -+ -+#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2) -+ -+static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)}; -+static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)}; -+ -+#undef T -+#undef C -+#undef QP_END -+ -+#define C(B,n) ((n)<0?0:(n)>51?51:(n)) -+// We do need a lot of -ve padding to cope with high bit depths that give -ve qps -+#define QP_DBLK_OFFSET_0 QP_OFFSET_0 -+#define QP_END(B)\ -+ 51, 51, 51, 51, 51, 51 -+ -+// These don't need all the padding we have here (12 top/bottom would be enough) -+static const uint8_t qp_c_dblk_0[] = T0(0); -+static const uint8_t qp_c_dblk_1[] = T1(0); -+ -+#undef T -+#undef M -+#undef C -+#undef QP_END -+#undef QP_START -+ -+ -+static void remove_pps(HEVCRpiParamSets * const s, const int id) -+{ -+ if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data) -+ s->pps = NULL; -+ av_buffer_unref(&s->pps_list[id]); -+} -+ -+static void remove_sps(HEVCRpiParamSets * const s, const int id) -+{ -+ int i; -+ if (s->sps_list[id]) { -+ if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data) -+ s->sps = NULL; -+ -+ /* drop all PPS that depend on this SPS */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) -+ if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id) -+ remove_pps(s, i); -+ -+ av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); -+ } -+ av_buffer_unref(&s->sps_list[id]); -+} -+ -+static void remove_vps(HEVCRpiParamSets * const s, const int id) -+{ -+ int i; -+ if (s->vps_list[id]) { -+ if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data) -+ s->vps = NULL; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++) -+ if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id) -+ remove_sps(s, i); -+ } -+ av_buffer_unref(&s->vps_list[id]); -+} -+ -+int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx, -+ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header) -+{ -+ uint8_t rps_predict = 0; -+ int delta_poc; -+ int k0 = 0; -+ int k1 = 0; -+ int k = 0; -+ int i; -+ -+ if (rps != sps->st_rps && sps->nb_st_rps) -+ rps_predict = get_bits1(gb); -+ -+ if (rps_predict) { -+ const ShortTermRPS *rps_ridx; -+ int delta_rps; -+ unsigned abs_delta_rps; -+ uint8_t use_delta_flag = 0; -+ uint8_t delta_rps_sign; -+ -+ if (is_slice_header) { -+ unsigned int delta_idx = get_ue_golomb_long(gb) + 1; -+ if (delta_idx > sps->nb_st_rps) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_idx in slice header RPS: %d > %d.\n", -+ delta_idx, sps->nb_st_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx]; -+ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs; -+ } else -+ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; -+ -+ delta_rps_sign = get_bits1(gb); -+ abs_delta_rps = get_ue_golomb_long(gb) + 1; -+ if (abs_delta_rps < 1 || abs_delta_rps > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of abs_delta_rps: %d\n", -+ abs_delta_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ 
delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps; -+ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { -+ int used = rps->used[k] = get_bits1(gb); -+ -+ if (!used) -+ use_delta_flag = get_bits1(gb); -+ -+ if (used || use_delta_flag) { -+ if (i < rps_ridx->num_delta_pocs) -+ delta_poc = delta_rps + rps_ridx->delta_poc[i]; -+ else -+ delta_poc = delta_rps; -+ rps->delta_poc[k] = delta_poc; -+ if (delta_poc < 0) -+ k0++; -+ else -+ k1++; -+ k++; -+ } -+ } -+ -+ if (k >= FF_ARRAY_ELEMS(rps->used)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid num_delta_pocs: %d\n", k); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ rps->num_delta_pocs = k; -+ rps->num_negative_pics = k0; -+ // sort in increasing order (smallest first) -+ if (rps->num_delta_pocs != 0) { -+ int used, tmp; -+ for (i = 1; i < rps->num_delta_pocs; i++) { -+ delta_poc = rps->delta_poc[i]; -+ used = rps->used[i]; -+ for (k = i - 1; k >= 0; k--) { -+ tmp = rps->delta_poc[k]; -+ if (delta_poc < tmp) { -+ rps->delta_poc[k + 1] = tmp; -+ rps->used[k + 1] = rps->used[k]; -+ rps->delta_poc[k] = delta_poc; -+ rps->used[k] = used; -+ } -+ } -+ } -+ } -+ if ((rps->num_negative_pics >> 1) != 0) { -+ int used; -+ k = rps->num_negative_pics - 1; -+ // flip the negative values to largest first -+ for (i = 0; i < rps->num_negative_pics >> 1; i++) { -+ delta_poc = rps->delta_poc[i]; -+ used = rps->used[i]; -+ rps->delta_poc[i] = rps->delta_poc[k]; -+ rps->used[i] = rps->used[k]; -+ rps->delta_poc[k] = delta_poc; -+ rps->used[k] = used; -+ k--; -+ } -+ } -+ } else { -+ unsigned int prev, nb_positive_pics; -+ rps->num_negative_pics = get_ue_golomb_long(gb); -+ nb_positive_pics = get_ue_golomb_long(gb); -+ -+ if (rps->num_negative_pics >= HEVC_MAX_REFS || -+ nb_positive_pics >= HEVC_MAX_REFS) { -+ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics; -+ if (rps->num_delta_pocs) { -+ prev = 0; -+ for (i = 0; i < rps->num_negative_pics; i++) { -+ delta_poc = get_ue_golomb_long(gb) + 1; -+ if (delta_poc < 1 || delta_poc > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_poc: %d\n", -+ delta_poc); -+ return AVERROR_INVALIDDATA; -+ } -+ prev -= delta_poc; -+ rps->delta_poc[i] = prev; -+ rps->used[i] = get_bits1(gb); -+ } -+ prev = 0; -+ for (i = 0; i < nb_positive_pics; i++) { -+ delta_poc = get_ue_golomb_long(gb) + 1; -+ if (delta_poc < 1 || delta_poc > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_poc: %d\n", -+ delta_poc); -+ return AVERROR_INVALIDDATA; -+ } -+ prev += delta_poc; -+ rps->delta_poc[rps->num_negative_pics + i] = prev; -+ rps->used[rps->num_negative_pics + i] = get_bits1(gb); -+ } -+ } -+ } -+ return 0; -+} -+ -+ -+static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx, -+ PTLCommon * const ptl) -+{ -+ int i; -+ -+ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) -+ return -1; -+ -+ ptl->profile_space = get_bits(gb, 2); -+ ptl->tier_flag = get_bits1(gb); -+ ptl->profile_idc = get_bits(gb, 5); -+ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN) -+ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10) -+ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE) -+ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT) -+ av_log(avctx, 
AV_LOG_DEBUG, "Range Extension profile bitstream\n"); -+ else -+ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc); -+ -+ for (i = 0; i < 32; i++) { -+ ptl->profile_compatibility_flag[i] = get_bits1(gb); -+ -+ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) -+ ptl->profile_idc = i; -+ } -+ ptl->progressive_source_flag = get_bits1(gb); -+ ptl->interlaced_source_flag = get_bits1(gb); -+ ptl->non_packed_constraint_flag = get_bits1(gb); -+ ptl->frame_only_constraint_flag = get_bits1(gb); -+ -+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15] -+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31] -+ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43] -+ -+ return 0; -+} -+ -+static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx, -+ PTL * const ptl, const int max_num_sub_layers) -+{ -+ int i; -+ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 || -+ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { -+ av_log(avctx, AV_LOG_ERROR, "PTL information too short\n"); -+ return -1; -+ } -+ -+ ptl->general_ptl.level_idc = get_bits(gb, 8); -+ -+ for (i = 0; i < max_num_sub_layers - 1; i++) { -+ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb); -+ ptl->sub_layer_level_present_flag[i] = get_bits1(gb); -+ } -+ -+ if (max_num_sub_layers - 1> 0) -+ for (i = max_num_sub_layers - 1; i < 8; i++) -+ skip_bits(gb, 2); // reserved_zero_2bits[i] -+ for (i = 0; i < max_num_sub_layers - 1; i++) { -+ if (ptl->sub_layer_profile_present_flag[i] && -+ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "PTL information for sublayer %i too short\n", i); -+ return -1; -+ } -+ if (ptl->sub_layer_level_present_flag[i]) { -+ if (get_bits_left(gb) < 8) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Not enough data for sublayer %i level_idc\n", i); -+ return -1; -+ } else -+ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8); -+ } -+ } -+ -+ return 0; -+} -+ -+static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb, -+ const int subpic_params_present) -+{ -+ int i; -+ -+ for (i = 0; i < nb_cpb; i++) { -+ get_ue_golomb_long(gb); // bit_rate_value_minus1 -+ get_ue_golomb_long(gb); // cpb_size_value_minus1 -+ -+ if (subpic_params_present) { -+ get_ue_golomb_long(gb); // cpb_size_du_value_minus1 -+ get_ue_golomb_long(gb); // bit_rate_du_value_minus1 -+ } -+ skip_bits1(gb); // cbr_flag -+ } -+} -+ -+static int decode_hrd(GetBitContext * const gb, const int common_inf_present, -+ const int max_sublayers) -+{ -+ int nal_params_present = 0, vcl_params_present = 0; -+ int subpic_params_present = 0; -+ int i; -+ -+ if (common_inf_present) { -+ nal_params_present = get_bits1(gb); -+ vcl_params_present = get_bits1(gb); -+ -+ if (nal_params_present || vcl_params_present) { -+ subpic_params_present = get_bits1(gb); -+ -+ if (subpic_params_present) { -+ skip_bits(gb, 8); // tick_divisor_minus2 -+ skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1 -+ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag -+ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1 -+ } -+ -+ skip_bits(gb, 4); // bit_rate_scale -+ skip_bits(gb, 4); // cpb_size_scale -+ -+ if (subpic_params_present) -+ skip_bits(gb, 4); // cpb_size_du_scale -+ -+ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1 -+ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1 -+ skip_bits(gb, 5); // dpb_output_delay_length_minus1 -+ } -+ } -+ -+ for (i = 0; i < 
max_sublayers; i++) { -+ int low_delay = 0; -+ unsigned int nb_cpb = 1; -+ int fixed_rate = get_bits1(gb); -+ -+ if (!fixed_rate) -+ fixed_rate = get_bits1(gb); -+ -+ if (fixed_rate) -+ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1 -+ else -+ low_delay = get_bits1(gb); -+ -+ if (!low_delay) { -+ nb_cpb = get_ue_golomb_long(gb) + 1; -+ if (nb_cpb < 1 || nb_cpb > 32) { -+ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ if (nal_params_present) -+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); -+ if (vcl_params_present) -+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); -+ } -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiParamSets * const ps) -+{ -+ int i,j; -+ int vps_id = 0; -+ ptrdiff_t nal_size; -+ HEVCRpiVPS *vps; -+ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps)); -+ -+ if (!vps_buf) -+ return AVERROR(ENOMEM); -+ vps = (HEVCRpiVPS*)vps_buf->data; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(vps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(vps->data)); -+ vps->data_size = sizeof(vps->data); -+ } else { -+ vps->data_size = nal_size; -+ } -+ memcpy(vps->data, gb->buffer, vps->data_size); -+ -+ vps_id = get_bits(gb, 4); -+ if (vps_id >= HEVC_MAX_VPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id); -+ goto err; -+ } -+ -+ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits -+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n"); -+ goto err; -+ } -+ -+ vps->vps_max_layers = get_bits(gb, 6) + 1; -+ vps->vps_max_sub_layers = get_bits(gb, 3) + 1; -+ vps->vps_temporal_id_nesting_flag = get_bits1(gb); -+ -+ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits -+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n"); -+ goto err; -+ } -+ -+ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) { -+ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n", -+ vps->vps_max_sub_layers); -+ goto err; -+ } -+ -+ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0) -+ goto err; -+ -+ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb); -+ -+ i = vps->vps_sub_layer_ordering_info_present_flag ? 
0 : vps->vps_max_sub_layers - 1; -+ for (; i < vps->vps_max_sub_layers; i++) { -+ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1; -+ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb); -+ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1; -+ -+ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) { -+ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n", -+ vps->vps_max_dec_pic_buffering[i] - 1); -+ goto err; -+ } -+ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { -+ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n", -+ vps->vps_num_reorder_pics[i]); -+ if (avctx->err_recognition & AV_EF_EXPLODE) -+ goto err; -+ } -+ } -+ -+ vps->vps_max_layer_id = get_bits(gb, 6); -+ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1; -+ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 || -+ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { -+ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n"); -+ goto err; -+ } -+ -+ for (i = 1; i < vps->vps_num_layer_sets; i++) -+ for (j = 0; j <= vps->vps_max_layer_id; j++) -+ skip_bits(gb, 1); // layer_id_included_flag[i][j] -+ -+ vps->vps_timing_info_present_flag = get_bits1(gb); -+ if (vps->vps_timing_info_present_flag) { -+ vps->vps_num_units_in_tick = get_bits_long(gb, 32); -+ vps->vps_time_scale = get_bits_long(gb, 32); -+ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb); -+ if (vps->vps_poc_proportional_to_timing_flag) -+ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1; -+ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb); -+ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) { -+ av_log(avctx, AV_LOG_ERROR, -+ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters); -+ goto err; -+ } -+ for (i = 0; i < vps->vps_num_hrd_parameters; i++) { -+ int common_inf_present = 1; -+ -+ get_ue_golomb_long(gb); // hrd_layer_set_idx -+ if (i) -+ common_inf_present = get_bits1(gb); -+ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); -+ } -+ } -+ get_bits1(gb); /* vps_extension_flag */ -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread VPS by %d bits\n", -get_bits_left(gb)); -+ if (ps->vps_list[vps_id]) -+ goto err; -+ } -+ -+ if (ps->vps_list[vps_id] && -+ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) { -+ av_buffer_unref(&vps_buf); -+ } else { -+ remove_vps(ps, vps_id); -+ ps->vps_list[vps_id] = vps_buf; -+ } -+ -+ return 0; -+ -+err: -+ av_buffer_unref(&vps_buf); -+ return AVERROR_INVALIDDATA; -+} -+ -+static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx, -+ const int apply_defdispwin, HEVCRpiSPS * const sps) -+{ -+ VUI backup_vui, * const vui = &sps->vui; -+ GetBitContext backup; -+ int sar_present, alt = 0; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n"); -+ -+ sar_present = get_bits1(gb); -+ if (sar_present) { -+ uint8_t sar_idx = get_bits(gb, 8); -+ if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) -+ vui->sar = vui_sar[sar_idx]; -+ else if (sar_idx == 255) { -+ vui->sar.num = get_bits(gb, 16); -+ vui->sar.den = get_bits(gb, 16); -+ } else -+ av_log(avctx, AV_LOG_WARNING, -+ "Unknown SAR index: %u.\n", sar_idx); -+ } -+ -+ vui->overscan_info_present_flag = get_bits1(gb); -+ if (vui->overscan_info_present_flag) -+ vui->overscan_appropriate_flag = get_bits1(gb); -+ -+ vui->video_signal_type_present_flag = 
get_bits1(gb); -+ if (vui->video_signal_type_present_flag) { -+ vui->video_format = get_bits(gb, 3); -+ vui->video_full_range_flag = get_bits1(gb); -+ vui->colour_description_present_flag = get_bits1(gb); -+ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) -+ sps->pix_fmt = AV_PIX_FMT_YUVJ420P; -+ if (vui->colour_description_present_flag) { -+ vui->colour_primaries = get_bits(gb, 8); -+ vui->transfer_characteristic = get_bits(gb, 8); -+ vui->matrix_coeffs = get_bits(gb, 8); -+ -+ // Set invalid values to "unspecified" -+ if (!av_color_primaries_name(vui->colour_primaries)) -+ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; -+ if (!av_color_transfer_name(vui->transfer_characteristic)) -+ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED; -+ if (!av_color_space_name(vui->matrix_coeffs)) -+ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; -+ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { -+ switch (sps->pix_fmt) { -+ case AV_PIX_FMT_YUV444P: -+ sps->pix_fmt = AV_PIX_FMT_GBRP; -+ break; -+ case AV_PIX_FMT_YUV444P10: -+ sps->pix_fmt = AV_PIX_FMT_GBRP10; -+ break; -+ case AV_PIX_FMT_YUV444P12: -+ sps->pix_fmt = AV_PIX_FMT_GBRP12; -+ break; -+ } -+ } -+ } -+ } -+ -+ vui->chroma_loc_info_present_flag = get_bits1(gb); -+ if (vui->chroma_loc_info_present_flag) { -+ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb); -+ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb); -+ } -+ -+ vui->neutra_chroma_indication_flag = get_bits1(gb); -+ vui->field_seq_flag = get_bits1(gb); -+ vui->frame_field_info_present_flag = get_bits1(gb); -+ -+ // Backup context in case an alternate header is detected -+ memcpy(&backup, gb, sizeof(backup)); -+ memcpy(&backup_vui, vui, sizeof(backup_vui)); -+ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { -+ vui->default_display_window_flag = 0; -+ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n"); -+ } else -+ vui->default_display_window_flag = get_bits1(gb); -+ -+ if (vui->default_display_window_flag) { -+ int vert_mult = 1 + (sps->chroma_format_idc < 2); -+ int horiz_mult = 1 + (sps->chroma_format_idc < 3); -+ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; -+ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; -+ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult; -+ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; -+ -+ if (apply_defdispwin && -+ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "discarding vui default display window, " -+ "original values are l:%u r:%u t:%u b:%u\n", -+ vui->def_disp_win.left_offset, -+ vui->def_disp_win.right_offset, -+ vui->def_disp_win.top_offset, -+ vui->def_disp_win.bottom_offset); -+ -+ vui->def_disp_win.left_offset = -+ vui->def_disp_win.right_offset = -+ vui->def_disp_win.top_offset = -+ vui->def_disp_win.bottom_offset = 0; -+ } -+ } -+ -+timing_info: -+ vui->vui_timing_info_present_flag = get_bits1(gb); -+ -+ if (vui->vui_timing_info_present_flag) { -+ if( get_bits_left(gb) < 66 && !alt) { -+ // The alternate syntax seem to have timing info located -+ // at where def_disp_win is normally located -+ av_log(avctx, AV_LOG_WARNING, -+ "Strange VUI timing information, retrying...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+ vui->vui_num_units_in_tick = get_bits_long(gb, 32); -+ vui->vui_time_scale = get_bits_long(gb, 32); -+ if (alt) { -+ av_log(avctx, AV_LOG_INFO, 
"Retry got %"PRIu32"/%"PRIu32" fps\n", -+ vui->vui_time_scale, vui->vui_num_units_in_tick); -+ } -+ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb); -+ if (vui->vui_poc_proportional_to_timing_flag) -+ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb); -+ vui->vui_hrd_parameters_present_flag = get_bits1(gb); -+ if (vui->vui_hrd_parameters_present_flag) -+ decode_hrd(gb, 1, sps->max_sub_layers); -+ } -+ -+ vui->bitstream_restriction_flag = get_bits1(gb); -+ if (vui->bitstream_restriction_flag) { -+ if (get_bits_left(gb) < 8 && !alt) { -+ av_log(avctx, AV_LOG_WARNING, -+ "Strange VUI bitstream restriction information, retrying" -+ " from timing information...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+ vui->tiles_fixed_structure_flag = get_bits1(gb); -+ vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb); -+ vui->restricted_ref_pic_lists_flag = get_bits1(gb); -+ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb); -+ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb); -+ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb); -+ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb); -+ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb); -+ } -+ -+ if (get_bits_left(gb) < 1 && !alt) { -+ // XXX: Alternate syntax when sps_range_extension_flag != 0? -+ av_log(avctx, AV_LOG_WARNING, -+ "Overread in VUI, retrying from timing information...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+} -+ -+static void set_default_scaling_list_data(ScalingList * const sl) -+{ -+ int matrixId; -+ -+ for (matrixId = 0; matrixId < 6; matrixId++) { -+ // 4x4 default is 16 -+ memset(sl->sl[0][matrixId], 16, 16); -+ sl->sl_dc[0][matrixId] = 16; // default for 16x16 -+ sl->sl_dc[1][matrixId] = 16; // default for 32x32 -+ } -+ -+ memcpy(sl->sl[1][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[1][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[1][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[1][3], default_scaling_list_inter, 64); -+ memcpy(sl->sl[1][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[1][5], default_scaling_list_inter, 64); -+ -+ memcpy(sl->sl[2][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[2][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[2][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[2][3], default_scaling_list_inter, 64); -+ memcpy(sl->sl[2][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[2][5], default_scaling_list_inter, 64); -+ -+ memcpy(sl->sl[3][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[3][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[3][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[3][3], default_scaling_list_inter, 64); -+ memcpy(sl->sl[3][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[3][5], default_scaling_list_inter, 64); -+} -+ -+static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl, -+ const HEVCRpiSPS * const sps) -+{ -+ uint8_t scaling_list_pred_mode_flag; -+ int32_t scaling_list_dc_coef[2][6]; -+ int size_id, matrix_id, pos; -+ int i; -+ -+ for (size_id = 0; size_id < 4; size_id++) -+ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 
3 : 1)) { -+ scaling_list_pred_mode_flag = get_bits1(gb); -+ if (!scaling_list_pred_mode_flag) { -+ unsigned int delta = get_ue_golomb_long(gb); -+ /* Only need to handle non-zero delta. Zero means default, -+ * which should already be in the arrays. */ -+ if (delta) { -+ // Copy from previous array. -+ delta *= (size_id == 3) ? 3 : 1; -+ if (matrix_id < delta) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid delta in scaling list data: %d.\n", delta); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ memcpy(sl->sl[size_id][matrix_id], -+ sl->sl[size_id][matrix_id - delta], -+ size_id > 0 ? 64 : 16); -+ if (size_id > 1) -+ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta]; -+ } -+ } else { -+ int next_coef, coef_num; -+ int32_t scaling_list_delta_coef; -+ -+ next_coef = 8; -+ coef_num = FFMIN(64, 1 << (4 + (size_id << 1))); -+ if (size_id > 1) { -+ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8; -+ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id]; -+ sl->sl_dc[size_id - 2][matrix_id] = next_coef; -+ } -+ for (i = 0; i < coef_num; i++) { -+ if (size_id == 0) -+ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] + -+ ff_hevc_rpi_diag_scan4x4_x[i]; -+ else -+ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] + -+ ff_hevc_rpi_diag_scan8x8_x[i]; -+ -+ scaling_list_delta_coef = get_se_golomb(gb); -+ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256; -+ sl->sl[size_id][matrix_id][pos] = next_coef; -+ } -+ } -+ } -+ -+ if (sps->chroma_format_idc == 3) { -+ for (i = 0; i < 64; i++) { -+ sl->sl[3][1][i] = sl->sl[2][1][i]; -+ sl->sl[3][2][i] = sl->sl[2][2][i]; -+ sl->sl[3][4][i] = sl->sl[2][4][i]; -+ sl->sl[3][5][i] = sl->sl[2][5][i]; -+ } -+ sl->sl_dc[1][1] = sl->sl_dc[0][1]; -+ sl->sl_dc[1][2] = sl->sl_dc[0][2]; -+ sl->sl_dc[1][4] = sl->sl_dc[0][4]; -+ sl->sl_dc[1][5] = sl->sl_dc[0][5]; -+ } -+ -+ -+ return 0; -+} -+ -+static int map_pixel_format(HEVCRpiSPS * const sps) -+{ -+ const int cfmt = sps->chroma_format_idc; -+ -+ sps->pix_fmt = AV_PIX_FMT_NONE; -+ switch (sps->bit_depth) { -+ case 8: -+ if (cfmt == 1) -+ sps->pix_fmt = AV_PIX_FMT_SAND128; -+ break; -+ case 10: -+ if (cfmt == 1) -+ sps->pix_fmt = AV_PIX_FMT_SAND64_10; -+ break; -+ default: -+ break; -+ } -+ -+ sps->hshift[0] = sps->vshift[0] = 0; -+ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4 -+ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2 -+ -+ sps->pixel_shift = sps->bit_depth > 8 ? 
1 : 0; -+ -+ return 0; -+} -+ -+static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id, -+ const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx) -+{ -+ HEVCRpiWindow *ow; -+ int ret = 0; -+ int log2_diff_max_min_transform_block_size; -+ int bit_depth_chroma, start, vui_present, sublayer_ordering_info; -+ int i; -+ -+ // Coded parameters -+ -+ sps->vps_id = get_bits(gb, 4); -+ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (vps_list && !vps_list[sps->vps_id]) { -+ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n", -+ sps->vps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->max_sub_layers = get_bits(gb, 3) + 1; -+ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) { -+ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n", -+ sps->max_sub_layers); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->temporal_id_nesting_flag = get_bits(gb, 1); -+ -+ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0) -+ return ret; -+ -+ *sps_id = get_ue_golomb_long(gb); -+ if (*sps_id >= HEVC_MAX_SPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->chroma_format_idc = get_ue_golomb_long(gb); -+ if (sps->chroma_format_idc > 3U) { -+ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->chroma_format_idc == 3) -+ sps->separate_colour_plane_flag = get_bits1(gb); -+ -+ if (sps->separate_colour_plane_flag) -+ sps->chroma_format_idc = 0; -+ -+ sps->width = get_ue_golomb_long(gb); -+ sps->height = get_ue_golomb_long(gb); -+ if ((ret = av_image_check_size(sps->width, -+ sps->height, 0, avctx)) < 0) -+ return ret; -+ -+ if (get_bits1(gb)) { // pic_conformance_flag -+ int vert_mult = 1 + (sps->chroma_format_idc < 2); -+ int horiz_mult = 1 + (sps->chroma_format_idc < 3); -+ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; -+ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; -+ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult; -+ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; -+ -+ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "discarding sps conformance window, " -+ "original values are l:%u r:%u t:%u b:%u\n", -+ sps->pic_conf_win.left_offset, -+ sps->pic_conf_win.right_offset, -+ sps->pic_conf_win.top_offset, -+ sps->pic_conf_win.bottom_offset); -+ -+ sps->pic_conf_win.left_offset = -+ sps->pic_conf_win.right_offset = -+ sps->pic_conf_win.top_offset = -+ sps->pic_conf_win.bottom_offset = 0; -+ } -+ sps->output_window = sps->pic_conf_win; -+ } -+ -+ sps->bit_depth = get_ue_golomb_long(gb) + 8; -+ bit_depth_chroma = get_ue_golomb_long(gb) + 8; -+ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Luma bit depth (%d) is different from chroma bit depth (%d), " -+ "this is unsupported.\n", -+ sps->bit_depth, bit_depth_chroma); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ ret = map_pixel_format(sps); -+ if (ret < 0) -+ return ret; -+ -+ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4; -+ if (sps->log2_max_poc_lsb > 16) { -+ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n", -+ sps->log2_max_poc_lsb - 4); -+ return 
AVERROR_INVALIDDATA; -+ } -+ -+ sublayer_ordering_info = get_bits1(gb); -+ start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1; -+ for (i = start; i < sps->max_sub_layers; i++) { -+ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1; -+ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb); -+ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1; -+ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) { -+ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n", -+ sps->temporal_layer[i].max_dec_pic_buffering - 1U); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) { -+ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n", -+ sps->temporal_layer[i].num_reorder_pics); -+ if (avctx->err_recognition & AV_EF_EXPLODE || -+ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) { -+ return AVERROR_INVALIDDATA; -+ } -+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1; -+ } -+ } -+ -+ if (!sublayer_ordering_info) { -+ for (i = 0; i < start; i++) { -+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering; -+ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics; -+ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase; -+ } -+ } -+ -+ sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3; -+ sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb); -+ sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2; -+ log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb); -+ sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size + -+ sps->log2_min_tb_size; -+ -+ if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->log2_diff_max_min_coding_block_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ { -+ const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size; -+ // Not a bitstream limitation, but all profiles -+ if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // Inferred parameters -+ sps->log2_ctb_size = CtbLog2SizeY; -+// sps->log2_min_pu_size = sps->log2_min_cb_size - 1; -+ } -+ -+ sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb); -+ 
sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb); -+ -+ sps->scaling_list_enable_flag = get_bits1(gb); -+ if (sps->scaling_list_enable_flag) { -+ set_default_scaling_list_data(&sps->scaling_list); -+ -+ if (get_bits1(gb)) { -+ ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps); -+ if (ret < 0) -+ return ret; -+ } -+ } -+ -+ sps->amp_enabled_flag = get_bits1(gb); -+ sps->sao_enabled = get_bits1(gb); -+ -+ // Set pcm defaults (0) so we don't have to test _enabled when we -+ // want to use them -+ memset(&sps->pcm, 0, sizeof(sps->pcm)); -+ -+ if (get_bits1(gb)) // pcm_enabled_flag -+ { -+ const unsigned int limit_max_pcm = FFMIN(5, -+ sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size); -+ sps->pcm.bit_depth = get_bits(gb, 4) + 1; -+ sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1; -+ sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3; -+ sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size + -+ get_ue_golomb_long(gb); -+ if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) { -+ av_log(avctx, AV_LOG_ERROR, -+ "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n", -+ sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size || -+ sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) { -+ av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)", -+ sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->pcm.loop_filter_disable_flag = get_bits1(gb); -+ } -+ -+ // Could be based on min_pcm_cb_size but much easier logic if we just stick -+ // with 8 (and costs us little) -+ sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up -+ sps->pcm_height = (sps->height + 7) >> 3; -+ -+ sps->nb_st_rps = get_ue_golomb_long(gb); -+ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) { -+ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n", -+ sps->nb_st_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < sps->nb_st_rps; i++) { -+ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i], -+ sps, 0)) < 0) -+ return ret; -+ } -+ -+ sps->long_term_ref_pics_present_flag = get_bits1(gb); -+ if (sps->long_term_ref_pics_present_flag) { -+ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); -+ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { -+ av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n", -+ sps->num_long_term_ref_pics_sps); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) { -+ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb); -+ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb); -+ } -+ } -+ -+ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb); -+ sps->sps_strong_intra_smoothing_enable_flag = get_bits1(gb); -+ sps->vui.sar = (AVRational){0, 1}; -+ vui_present = get_bits1(gb); -+ if (vui_present) -+ decode_vui(gb, avctx, apply_defdispwin, sps); -+ -+ if (get_bits1(gb)) { // sps_extension_flag -+ int sps_extension_flag[1]; -+ for (i = 0; i < 1; i++) -+ sps_extension_flag[i] = get_bits1(gb); -+ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); -+ if (sps_extension_flag[0]) { -+ int extended_precision_processing_flag; -+ int cabac_bypass_alignment_enabled_flag; -+ -+ sps->transform_skip_rotation_enabled_flag = get_bits1(gb); -+ sps->transform_skip_context_enabled_flag = 
get_bits1(gb); -+ sps->implicit_rdpcm_enabled_flag = get_bits1(gb); -+ -+ sps->explicit_rdpcm_enabled_flag = get_bits1(gb); -+ -+ extended_precision_processing_flag = get_bits1(gb); -+ if (extended_precision_processing_flag) -+ av_log(avctx, AV_LOG_WARNING, -+ "extended_precision_processing_flag not yet implemented\n"); -+ -+ sps->intra_smoothing_disabled_flag = get_bits1(gb); -+ sps->high_precision_offsets_enabled_flag = get_bits1(gb); -+ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); -+ -+ cabac_bypass_alignment_enabled_flag = get_bits1(gb); -+ if (cabac_bypass_alignment_enabled_flag) -+ av_log(avctx, AV_LOG_WARNING, -+ "cabac_bypass_alignment_enabled_flag not yet implemented\n"); -+ } -+ } -+ if (apply_defdispwin) { -+ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset; -+ sps->output_window.right_offset += sps->vui.def_disp_win.right_offset; -+ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset; -+ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset; -+ } -+ -+ ow = &sps->output_window; -+ if (ow->left_offset >= INT_MAX - ow->right_offset || -+ ow->top_offset >= INT_MAX - ow->bottom_offset || -+ ow->left_offset + ow->right_offset >= sps->width || -+ ow->top_offset + ow->bottom_offset >= sps->height) { -+ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n", -+ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset); -+ if (avctx->err_recognition & AV_EF_EXPLODE) { -+ return AVERROR_INVALIDDATA; -+ } -+ av_log(avctx, AV_LOG_WARNING, -+ "Displaying the whole video surface.\n"); -+ memset(ow, 0, sizeof(*ow)); -+ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win)); -+ } -+ -+ // Inferred parameters -+ -+ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; -+ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; -+ sps->ctb_size = sps->ctb_width * sps->ctb_height; -+ -+ sps->min_cb_width = sps->width >> sps->log2_min_cb_size; -+ sps->min_cb_height = sps->height >> sps->log2_min_cb_size; -+ sps->min_tb_width = sps->width >> sps->log2_min_tb_size; -+ sps->min_tb_height = sps->height >> sps->log2_min_tb_size; -+ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE; -+ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE; -+ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; -+ -+ sps->qp_bd_offset = 6 * (sps->bit_depth - 8); -+ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? 
sps->bit_depth - 1 : 7)); -+ -+ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || -+ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) { -+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n", -+ sps->max_transform_hierarchy_depth_inter); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) { -+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n", -+ sps->max_transform_hierarchy_depth_intra); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "max transform block size out of range: %d\n", -+ sps->log2_max_trafo_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread SPS by %d bits\n", -get_bits_left(gb)); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps, int apply_defdispwin) -+{ -+ HEVCRpiSPS *sps; -+ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps)); -+ unsigned int sps_id; -+ int ret; -+ ptrdiff_t nal_size; -+ -+ if (!sps_buf) -+ return AVERROR(ENOMEM); -+ sps = (HEVCRpiSPS*)sps_buf->data; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(sps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(sps->data)); -+ sps->data_size = sizeof(sps->data); -+ } else { -+ sps->data_size = nal_size; -+ } -+ memcpy(sps->data, gb->buffer, sps->data_size); -+ -+ ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id, -+ apply_defdispwin, -+ ps->vps_list, avctx); -+ if (ret < 0) { -+ av_buffer_unref(&sps_buf); -+ return ret; -+ } -+ -+ if (avctx->debug & FF_DEBUG_BITSTREAM) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "Parsed SPS: id %d; coded wxh: %dx%d; " -+ "cropped wxh: %dx%d; pix_fmt: %s.\n", -+ sps_id, sps->width, sps->height, -+ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset), -+ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset), -+ av_get_pix_fmt_name(sps->pix_fmt)); -+ } -+ -+ /* check if this is a repeat of an already parsed SPS, then keep the -+ * original one. 
-+ * otherwise drop all PPSes that depend on it */ -+ if (ps->sps_list[sps_id] && -+ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) { -+ av_buffer_unref(&sps_buf); -+ } else { -+ remove_sps(ps, sps_id); -+ ps->sps_list[sps_id] = sps_buf; -+ } -+ -+ return 0; -+} -+ -+static void hevc_pps_free(void *opaque, uint8_t *data) -+{ -+ HEVCRpiPPS *pps = (HEVCRpiPPS*)data; -+ -+ av_freep(&pps->column_width); -+ av_freep(&pps->row_height); -+ av_freep(&pps->col_bd); -+ av_freep(&pps->row_bd); -+ av_freep(&pps->col_idxX); -+ av_freep(&pps->ctb_addr_rs_to_ts); -+ av_freep(&pps->ctb_addr_ts_to_rs); -+ av_freep(&pps->tile_pos_ts); -+ av_freep(&pps->tile_size); -+ av_freep(&pps->tile_id); -+ av_freep(&pps->ctb_ts_flags); -+ -+ av_freep(&pps); -+} -+ -+static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets) -+{ -+ do -+ { -+ const int offset = get_se_golomb_long(gb); -+ if (offset < -12 || offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset); -+ return AVERROR_INVALIDDATA; -+ } -+ *offsets++ = offset; -+ } while (n_minus_1-- != 0); -+ return 0; -+} -+ -+static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) -+{ -+ if (pps->transform_skip_enabled_flag) { -+ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2; -+ } -+ pps->cross_component_prediction_enabled_flag = get_bits1(gb); -+ if (pps->cross_component_prediction_enabled_flag && -+ (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag)) -+ { -+ av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb); -+ if (pps->chroma_qp_offset_list_enabled_flag) { -+ int err; -+ -+ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb); -+ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb); -+ if (pps->chroma_qp_offset_list_len_minus1 > 5) { -+ av_log(avctx, AV_LOG_ERROR, -+ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n"); -+ -+ if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 || -+ (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0) -+ return err; -+ } -+ -+ { -+ const unsigned int max_offset = sps->bit_depth > 10 ? 
sps->bit_depth - 10 : 0; -+ -+ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb); -+ if (pps->log2_sao_offset_scale_luma > max_offset) { -+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid"); -+ return AVERROR_INVALIDDATA; -+ } -+ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb); -+ if (pps->log2_sao_offset_scale_chroma > max_offset) { -+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ return(0); -+} -+ -+static inline int setup_pps(AVCodecContext * const avctx, -+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) -+{ -+ int pic_area_in_ctbs; -+ int i, j, x, y, ctb_addr_rs, tile_id; -+ -+ // Inferred parameters -+ -+ // qp_y -> qp_u/qp_v tables -+ // The tables have at least -24,+24 overrun after adding offset here -+ // which should allow for clipless offseting -+ -+ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code -+ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0; -+ -+ if (sps->chroma_format_idc == 1) { -+ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; -+ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; -+ } -+ else -+ { -+ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; -+ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; -+ } -+ -+ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd)); -+ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd)); -+ pps->col_idxX = av_malloc_array(sps->ctb_width, sizeof(*pps->col_idxX)); -+ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX) -+ return AVERROR(ENOMEM); -+ -+ if (pps->uniform_spacing_flag) { -+ if (!pps->column_width) { -+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); -+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); -+ } -+ if (!pps->column_width || !pps->row_height) -+ return AVERROR(ENOMEM); -+ -+ for (i = 0; i < pps->num_tile_columns; i++) { -+ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns - -+ (i * sps->ctb_width) / pps->num_tile_columns; -+ } -+ -+ for (i = 0; i < pps->num_tile_rows; i++) { -+ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows - -+ (i * sps->ctb_height) / pps->num_tile_rows; -+ } -+ } -+ -+ { -+ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift); -+ pps->col_bd[0] = 0; -+ pps->tile_wpp_inter_disable = 0; -+ for (i = 0; i < pps->num_tile_columns; i++) -+ { -+ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i]; -+ -+ // Avoid trying tile parallel if the columns don't fall on cache boundries -+ // (this causes too much pain syncing flushes with the QPU) -+ // Ignore the final (RHS of pic) tile boundry -+ if ((pps->col_bd[i] & td_mask) != 0) { -+ pps->tile_wpp_inter_disable = 1; -+ } -+ } -+ -+ // If we can start the next row before finishing the first line of -+ // this one then we must wait at the end of the tile -+ // * if this happens a lot then there are better but more complicated -+ // conditions that we could 
apply -+ if (pps->tile_wpp_inter_disable) { -+ for (i = 0; i < pps->num_tile_rows; i++) -+ { -+ if (pps->row_height[i] <= RPI_MAX_JOBS) { -+ pps->tile_wpp_inter_disable = 2; -+ break; -+ } -+ } -+ } -+ } -+ -+ pps->row_bd[0] = 0; -+ for (i = 0; i < pps->num_tile_rows; i++) -+ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; -+ -+ for (i = 0, j = 0; i < sps->ctb_width; i++) { -+ if (i >= pps->col_bd[j + 1]) -+ j++; -+ pps->col_idxX[i] = j; -+ } -+ -+ /** -+ * 6.5 -+ */ -+ pic_area_in_ctbs = sps->ctb_size; -+ -+ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); -+ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); -+ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); -+ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); -+ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); -+ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); -+ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || -+ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { -+ return AVERROR(ENOMEM); -+ } -+ -+ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags)); -+ -+ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { -+ int tb_x = ctb_addr_rs % sps->ctb_width; -+ int tb_y = ctb_addr_rs / sps->ctb_width; -+ int tile_x = 0; -+ int tile_y = 0; -+ int val = 0; -+ -+ for (i = 0; i < pps->num_tile_columns; i++) { -+ if (tb_x < pps->col_bd[i + 1]) { -+ tile_x = i; -+ break; -+ } -+ } -+ -+ for (i = 0; i < pps->num_tile_rows; i++) { -+ if (tb_y < pps->row_bd[i + 1]) { -+ tile_y = i; -+ break; -+ } -+ } -+ -+ for (i = 0; i < tile_x; i++) -+ val += pps->row_height[tile_y] * pps->column_width[i]; -+ for (i = 0; i < tile_y; i++) -+ val += sps->ctb_width * pps->row_height[i]; -+ -+ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] + -+ tb_x - pps->col_bd[tile_x]; -+ -+ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val; -+ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; -+ } -+ -+ { -+ uint8_t * pflags = pps->ctb_ts_flags; -+ uint16_t * ptid = pps->tile_id; -+ -+ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) -+ { -+ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) -+ { -+ const unsigned int tile_w = pps->column_width[i]; -+ -+ pflags[0] |= CTB_TS_FLAGS_CIREQ; -+ -+ for (x = 0; x != tile_w; ++x) { -+ pflags[x] |= CTB_TS_FLAGS_TOT; -+ } -+ -+ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++) -+ { -+ pflags[0] |= CTB_TS_FLAGS_SOTL; -+ -+ if (pps->entropy_coding_sync_enabled_flag) -+ { -+ if (pps->column_width[i] != 1) -+ pflags[1] |= CTB_TS_FLAGS_CSAVE; -+ else -+ pflags[0] |= CTB_TS_FLAGS_CIREQ; -+ -+ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0) -+ pflags[0] |= CTB_TS_FLAGS_CLOAD; -+ } -+ -+ for (x = 0; x != tile_w; ++x) -+ *ptid++ = tile_id; -+ -+ pflags += tile_w; -+ pflags[-1] |= CTB_TS_FLAGS_EOTL; -+ if (i + 1 == pps->num_tile_columns) -+ pflags[-1] |= CTB_TS_FLAGS_EOL; -+ } -+ -+ pflags[-1] |= CTB_TS_FLAGS_EOT; -+ } -+ } -+ } -+ -+ { -+ unsigned int ts = 0; -+ for (j = 0; j < pps->num_tile_rows; j++) -+ for (i = 0; i < pps->num_tile_columns; i++) -+ { -+ const unsigned int size = pps->column_width[i] * pps->row_height[j]; -+ pps->tile_size[j * pps->num_tile_columns + i] = size; -+ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts; -+ ts += size; -+ } -+ } -+ -+ return 0; -+} -+ -+int 
ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiParamSets * const ps) -+{ -+ const HEVCRpiSPS *sps = NULL; -+ int i, ret = 0; -+ unsigned int pps_id = 0; -+ ptrdiff_t nal_size; -+ unsigned log2_parallel_merge_level_minus2; -+ -+ AVBufferRef *pps_buf; -+ HEVCRpiPPS *pps = av_mallocz(sizeof(*pps)); -+ -+ if (!pps) -+ return AVERROR(ENOMEM); -+ -+ pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), -+ hevc_pps_free, NULL, 0); -+ if (!pps_buf) { -+ av_freep(&pps); -+ return AVERROR(ENOMEM); -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(pps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(pps->data)); -+ pps->data_size = sizeof(pps->data); -+ } else { -+ pps->data_size = nal_size; -+ } -+ memcpy(pps->data, gb->buffer, pps->data_size); -+ -+ // Default values -+ pps->loop_filter_across_tiles_enabled_flag = 1; -+ pps->num_tile_columns = 1; -+ pps->num_tile_rows = 1; -+ pps->uniform_spacing_flag = 1; -+ pps->disable_dbf = 0; -+ pps->beta_offset = 0; -+ pps->tc_offset = 0; -+ pps->log2_max_transform_skip_block_size = 2; -+ -+ // Coded parameters -+ pps_id = get_ue_golomb_long(gb); -+ if (pps_id >= HEVC_MAX_PPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->sps_id = get_ue_golomb_long(gb); -+ if (pps->sps_id >= HEVC_MAX_SPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (!ps->sps_list[pps->sps_id]) { -+ av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data; -+ -+ pps->dependent_slice_segments_enabled_flag = get_bits1(gb); -+ pps->output_flag_present_flag = get_bits1(gb); -+ pps->num_extra_slice_header_bits = get_bits(gb, 3); -+ -+ pps->sign_data_hiding_flag = get_bits1(gb); -+ -+ pps->cabac_init_present_flag = get_bits1(gb); -+ -+ pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1; -+ if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) { -+ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; -+ if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) { -+ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->pic_init_qp_minus26 = get_se_golomb(gb); -+ if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "init_qp_minus26 %d is outside the valid range " -+ "[%d, %d].\n", -+ pps->pic_init_qp_minus26, -+ -(26 + sps->qp_bd_offset), 25); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->constrained_intra_pred_flag = get_bits1(gb); -+ pps->transform_skip_enabled_flag = get_bits1(gb); -+ -+ pps->cu_qp_delta_enabled_flag = get_bits1(gb); -+ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size; -+ if (pps->cu_qp_delta_enabled_flag) -+ { -+ const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb); -+ -+ if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) { -+ av_log(avctx, AV_LOG_ERROR, 
"diff_cu_qp_delta_depth %d is invalid\n", -+ diff_cu_qp_delta_depth); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth; -+ } -+ -+ pps->cb_qp_offset = get_se_golomb(gb); -+ if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n", -+ pps->cb_qp_offset); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->cr_qp_offset = get_se_golomb(gb); -+ if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n", -+ pps->cr_qp_offset); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb); -+ -+ pps->weighted_pred_flag = get_bits1(gb); -+ pps->weighted_bipred_flag = get_bits1(gb); -+ -+ pps->transquant_bypass_enable_flag = get_bits1(gb); -+ pps->tiles_enabled_flag = get_bits1(gb); -+ pps->entropy_coding_sync_enabled_flag = get_bits1(gb); -+ -+ if (pps->tiles_enabled_flag) { -+ pps->num_tile_columns = get_ue_golomb_long(gb) + 1; -+ pps->num_tile_rows = get_ue_golomb_long(gb) + 1; -+ if (pps->num_tile_columns <= 0 || -+ pps->num_tile_columns >= sps->width) { -+ av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n", -+ pps->num_tile_columns - 1); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (pps->num_tile_rows <= 0 || -+ pps->num_tile_rows >= sps->height) { -+ av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n", -+ pps->num_tile_rows - 1); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); -+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); -+ if (!pps->column_width || !pps->row_height) { -+ ret = AVERROR(ENOMEM); -+ goto err; -+ } -+ -+ pps->uniform_spacing_flag = get_bits1(gb); -+ if (!pps->uniform_spacing_flag) { -+ uint64_t sum = 0; -+ for (i = 0; i < pps->num_tile_columns - 1; i++) { -+ pps->column_width[i] = get_ue_golomb_long(gb) + 1; -+ sum += pps->column_width[i]; -+ } -+ if (sum >= sps->ctb_width) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum; -+ -+ sum = 0; -+ for (i = 0; i < pps->num_tile_rows - 1; i++) { -+ pps->row_height[i] = get_ue_golomb_long(gb) + 1; -+ sum += pps->row_height[i]; -+ } -+ if (sum >= sps->ctb_height) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum; -+ } -+ pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb); -+ } -+ -+ pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb); -+ -+ pps->deblocking_filter_control_present_flag = get_bits1(gb); -+ if (pps->deblocking_filter_control_present_flag) { -+ pps->deblocking_filter_override_enabled_flag = get_bits1(gb); -+ pps->disable_dbf = get_bits1(gb); -+ if (!pps->disable_dbf) { -+ int beta_offset_div2 = get_se_golomb(gb); -+ int tc_offset_div2 = get_se_golomb(gb) ; -+ if (beta_offset_div2 < -6 || beta_offset_div2 > 6) { -+ av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n", -+ beta_offset_div2); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (tc_offset_div2 < -6 || tc_offset_div2 > 6) { -+ av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n", -+ tc_offset_div2); -+ ret = 
AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->beta_offset = 2 * beta_offset_div2; -+ pps->tc_offset = 2 * tc_offset_div2; -+ } -+ } -+ -+ pps->scaling_list_data_present_flag = get_bits1(gb); -+ if (pps->scaling_list_data_present_flag) { -+ set_default_scaling_list_data(&pps->scaling_list); -+ ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps); -+ if (ret < 0) -+ goto err; -+ } -+ pps->lists_modification_present_flag = get_bits1(gb); -+ log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb); -+ if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) { -+ av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n", -+ log2_parallel_merge_level_minus2); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2; -+ -+ pps->slice_header_extension_present_flag = get_bits1(gb); -+ -+ if (get_bits1(gb)) { // pps_extension_present_flag -+ int pps_range_extensions_flag = get_bits1(gb); -+ /* int pps_extension_7bits = */ get_bits(gb, 7); -+ if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) { -+ if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0) -+ goto err; -+ } -+ } -+ -+ ret = setup_pps(avctx, pps, sps); -+ if (ret < 0) -+ goto err; -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread PPS by %d bits\n", -get_bits_left(gb)); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ remove_pps(ps, pps_id); -+ ps->pps_list[pps_id] = pps_buf; -+ -+ return 0; -+ -+err: -+ av_buffer_unref(&pps_buf); -+ return ret; -+} -+ -+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type) -+{ -+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; -+ int prev_poc_lsb = pocTid0 % max_poc_lsb; -+ int prev_poc_msb = pocTid0 - prev_poc_lsb; -+ int poc_msb; -+ -+ if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2) -+ poc_msb = prev_poc_msb + max_poc_lsb; -+ else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2) -+ poc_msb = prev_poc_msb - max_poc_lsb; -+ else -+ poc_msb = prev_poc_msb; -+ -+ // For BLA picture types, POCmsb is set to 0. -+ if (nal_unit_type == HEVC_NAL_BLA_W_LP || -+ nal_unit_type == HEVC_NAL_BLA_W_RADL || -+ nal_unit_type == HEVC_NAL_BLA_N_LP) -+ poc_msb = 0; -+ -+ return poc_msb + poc_lsb; -+} -diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h -new file mode 100644 -index 0000000000..712464a075 ---- /dev/null -+++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,447 @@ -+/* -+ * HEVC parameter set parsing -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_PS_H -+#define AVCODEC_RPI_HEVC_PS_H -+ -+#include -+ -+#include "libavutil/buffer.h" -+#include "libavutil/pixfmt.h" -+#include "libavutil/rational.h" -+ -+#include "avcodec.h" -+#include "get_bits.h" -+#include "hevc.h" -+ -+typedef struct ShortTermRPS { -+ unsigned int num_negative_pics; -+ int num_delta_pocs; -+ int rps_idx_num_delta_pocs; -+ int32_t delta_poc[32]; -+ uint8_t used[32]; -+} ShortTermRPS; -+ -+typedef struct LongTermRPS { -+ int poc[32]; -+ uint8_t used[32]; -+ uint8_t nb_refs; -+} LongTermRPS; -+ -+typedef struct RpiSliceHeader { -+ unsigned int pps_id; -+ -+ ///< address (in raster order) of the first block in the current slice segment -+ unsigned int slice_segment_addr; -+ ///< address (in raster order) of the first block in the current slice -+ unsigned int slice_addr; -+ -+ enum HEVCSliceType slice_type; -+ -+ int pic_order_cnt_lsb; -+ -+ uint8_t first_slice_in_pic_flag; -+ uint8_t dependent_slice_segment_flag; -+ uint8_t pic_output_flag; -+ uint8_t colour_plane_id; -+ -+ ///< RPS coded in the slice header itself is stored here -+ int short_term_ref_pic_set_sps_flag; -+ int short_term_ref_pic_set_size; -+ ShortTermRPS slice_rps; -+ const ShortTermRPS *short_term_rps; -+ int long_term_ref_pic_set_size; -+ LongTermRPS long_term_rps; -+ unsigned int list_entry_lx[2][32]; -+ -+ uint8_t rpl_modification_flag[2]; -+ uint8_t no_output_of_prior_pics_flag; -+ uint8_t slice_temporal_mvp_enabled_flag; -+ -+ unsigned int nb_refs[2]; -+ -+ uint8_t slice_sample_adaptive_offset_flag[3]; -+ uint8_t mvd_l1_zero_flag; -+ -+ uint8_t cabac_init_flag; -+ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag -+ uint8_t slice_loop_filter_across_slices_enabled_flag; -+ uint8_t collocated_list; -+ -+ uint8_t no_dblk_boundary_flags; -+ -+ unsigned int collocated_ref_idx; -+ -+ int slice_qp_delta; -+ int slice_cb_qp_offset; // -12, +12 -+ int slice_cr_qp_offset; // -12, +12 -+ -+ uint8_t cu_chroma_qp_offset_enabled_flag; -+ -+ int beta_offset; ///< beta_offset_div2 * 2 -+ int tc_offset; ///< tc_offset_div2 * 2 -+ -+ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand -+ -+ unsigned *entry_point_offset; -+ int * offset; -+ int * size; -+ int num_entry_point_offsets; -+ int offsets_allocated; -+ -+ uint8_t offload_wpp; -+ uint8_t offload_tiles; -+ -+ int8_t slice_qp; -+ -+ uint8_t luma_log2_weight_denom; -+ uint8_t chroma_log2_weight_denom; -+ -+ int16_t luma_weight_l0[16]; // -128, +255 -+ int16_t luma_offset_l0[16]; -+ int16_t chroma_weight_l0[16][2]; -+ int16_t chroma_offset_l0[16][2]; -+ -+ int16_t luma_weight_l1[16]; -+ int16_t luma_offset_l1[16]; -+ int16_t chroma_weight_l1[16][2]; -+ int16_t chroma_offset_l1[16][2]; -+ -+} RpiSliceHeader; -+ -+typedef struct HEVCRpiWindow { -+ uint16_t left_offset; -+ uint16_t right_offset; -+ uint16_t top_offset; -+ uint16_t bottom_offset; -+} HEVCRpiWindow; -+ -+typedef struct VUI { -+ AVRational sar; -+ -+ int overscan_info_present_flag; -+ int overscan_appropriate_flag; -+ -+ int video_signal_type_present_flag; -+ int video_format; -+ int video_full_range_flag; -+ int colour_description_present_flag; -+ uint8_t colour_primaries; -+ uint8_t transfer_characteristic; -+ uint8_t matrix_coeffs; -+ -+ int chroma_loc_info_present_flag; -+ int 
chroma_sample_loc_type_top_field; -+ int chroma_sample_loc_type_bottom_field; -+ int neutra_chroma_indication_flag; -+ -+ int field_seq_flag; -+ int frame_field_info_present_flag; -+ -+ int default_display_window_flag; -+ HEVCRpiWindow def_disp_win; -+ -+ int vui_timing_info_present_flag; -+ uint32_t vui_num_units_in_tick; -+ uint32_t vui_time_scale; -+ int vui_poc_proportional_to_timing_flag; -+ int vui_num_ticks_poc_diff_one_minus1; -+ int vui_hrd_parameters_present_flag; -+ -+ int bitstream_restriction_flag; -+ int tiles_fixed_structure_flag; -+ int motion_vectors_over_pic_boundaries_flag; -+ int restricted_ref_pic_lists_flag; -+ int min_spatial_segmentation_idc; -+ int max_bytes_per_pic_denom; -+ int max_bits_per_min_cu_denom; -+ int log2_max_mv_length_horizontal; -+ int log2_max_mv_length_vertical; -+} VUI; -+ -+typedef struct PTLCommon { -+ uint8_t profile_space; -+ uint8_t tier_flag; -+ uint8_t profile_idc; -+ uint8_t profile_compatibility_flag[32]; -+ uint8_t level_idc; -+ uint8_t progressive_source_flag; -+ uint8_t interlaced_source_flag; -+ uint8_t non_packed_constraint_flag; -+ uint8_t frame_only_constraint_flag; -+} PTLCommon; -+ -+typedef struct PTL { -+ PTLCommon general_ptl; -+ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS]; -+ -+ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS]; -+ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS]; -+} PTL; -+ -+typedef struct HEVCRpiVPS { -+ uint8_t vps_temporal_id_nesting_flag; -+ int vps_max_layers; -+ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1 -+ -+ PTL ptl; -+ int vps_sub_layer_ordering_info_present_flag; -+ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS]; -+ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS]; -+ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS]; -+ int vps_max_layer_id; -+ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1 -+ uint8_t vps_timing_info_present_flag; -+ uint32_t vps_num_units_in_tick; -+ uint32_t vps_time_scale; -+ uint8_t vps_poc_proportional_to_timing_flag; -+ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1 -+ int vps_num_hrd_parameters; -+ -+ uint8_t data[4096]; -+ int data_size; -+} HEVCRpiVPS; -+ -+typedef struct ScalingList { -+ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs, -+ * and size ID 3 only has 2 arrays, not 6. 
*/ -+ uint8_t sl[4][6][64]; -+ uint8_t sl_dc[2][6]; -+} ScalingList; -+ -+typedef struct HEVCRpiSPS { -+ unsigned vps_id; -+ uint8_t chroma_format_idc; -+ uint8_t separate_colour_plane_flag; -+ -+ HEVCRpiWindow output_window; -+ -+ HEVCRpiWindow pic_conf_win; -+ -+ uint16_t wp_offset_half_range; // WpOffsetHalfRange -+ -+ uint8_t bit_depth; -+ -+// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth -+ uint8_t pixel_shift; -+ enum AVPixelFormat pix_fmt; -+ -+ unsigned int log2_max_poc_lsb; -+ -+ int max_sub_layers; -+ struct { -+ int max_dec_pic_buffering; -+ int num_reorder_pics; -+ int max_latency_increase; -+ } temporal_layer[HEVC_MAX_SUB_LAYERS]; -+ uint8_t temporal_id_nesting_flag; -+ -+ uint8_t scaling_list_enable_flag; -+ ScalingList scaling_list; -+ -+ unsigned int nb_st_rps; -+ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS]; -+ -+ uint8_t amp_enabled_flag; -+ uint8_t sao_enabled; -+ -+ uint8_t long_term_ref_pics_present_flag; -+ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS]; -+ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS]; -+ uint8_t num_long_term_ref_pics_sps; -+ -+ struct { -+ uint8_t bit_depth; -+ uint8_t bit_depth_chroma; -+ uint8_t log2_min_pcm_cb_size; -+ uint8_t log2_max_pcm_cb_size; -+ uint8_t loop_filter_disable_flag; -+ } pcm; -+ uint8_t sps_temporal_mvp_enabled_flag; -+ uint8_t sps_strong_intra_smoothing_enable_flag; -+ -+ unsigned int log2_min_cb_size; // 3..6 -+ unsigned int log2_diff_max_min_coding_block_size; -+ unsigned int log2_min_tb_size; // 2..5 -+ unsigned int log2_max_trafo_size; -+ unsigned int log2_ctb_size; // 4..6 -+// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1) -+#define LOG2_MIN_PU_SIZE 2 -+#define LOG2_MIN_CU_SIZE 3 -+ -+ int max_transform_hierarchy_depth_inter; -+ int max_transform_hierarchy_depth_intra; -+ -+ int transform_skip_rotation_enabled_flag; -+ int transform_skip_context_enabled_flag; -+ int implicit_rdpcm_enabled_flag; -+ int explicit_rdpcm_enabled_flag; -+ int intra_smoothing_disabled_flag; -+ int high_precision_offsets_enabled_flag; -+ int persistent_rice_adaptation_enabled_flag; -+ -+ ///< coded frame dimension in various units -+ int width; -+ int height; -+ int ctb_width; -+ int ctb_height; -+ int ctb_size; // Pic size in CTBs not size of a CTB -+ int min_cb_width; -+ int min_cb_height; -+ int min_tb_width; -+ int min_tb_height; -+ int min_pu_width; -+ int min_pu_height; -+ int pcm_width; -+ int pcm_height; -+ int tb_mask; -+ -+ int hshift[3]; -+ int vshift[3]; -+ -+ int qp_bd_offset; -+ -+ uint8_t data[4096]; -+ int data_size; -+ -+ VUI vui; -+ PTL ptl; -+} HEVCRpiSPS; -+ -+#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line -+#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line -+#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line -+#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile -+#define CTB_TS_FLAGS_CSAVE (1U << 4) -+#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request -+#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile -+#define CTB_TS_FLAGS_CLOAD (1U << 7) -+ -+typedef struct HEVCRpiPPS { -+ unsigned int sps_id; ///< seq_parameter_set_id -+ -+ uint8_t sign_data_hiding_flag; -+ -+ uint8_t cabac_init_present_flag; -+ -+ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1 -+ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1 -+ int pic_init_qp_minus26; -+ -+ uint8_t constrained_intra_pred_flag; -+ uint8_t transform_skip_enabled_flag; -+ 
-+    uint8_t cu_qp_delta_enabled_flag;
-+    uint8_t log2_min_cu_qp_delta_size;
-+    int cb_qp_offset; // -12..12
-+    int cr_qp_offset; // -12..12
-+    const uint8_t * qp_dblk_x[3];
-+    const int8_t * qp_bd_x[3];
-+
-+    uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
-+    uint8_t weighted_pred_flag;
-+    uint8_t weighted_bipred_flag;
-+    uint8_t output_flag_present_flag;
-+    uint8_t transquant_bypass_enable_flag;
-+
-+    uint8_t dependent_slice_segments_enabled_flag;
-+    uint8_t tiles_enabled_flag;
-+    uint8_t entropy_coding_sync_enabled_flag;
-+
-+    uint8_t tile_wpp_inter_disable;
-+    int num_tile_columns; ///< num_tile_columns_minus1 + 1
-+    int num_tile_rows;    ///< num_tile_rows_minus1 + 1
-+    uint8_t uniform_spacing_flag;
-+    uint8_t loop_filter_across_tiles_enabled_flag;
-+
-+    uint8_t seq_loop_filter_across_slices_enabled_flag;
-+
-+    uint8_t deblocking_filter_control_present_flag;
-+    uint8_t deblocking_filter_override_enabled_flag;
-+    uint8_t disable_dbf;
-+    int beta_offset; ///< beta_offset_div2 * 2
-+    int tc_offset;   ///< tc_offset_div2 * 2
-+
-+    uint8_t scaling_list_data_present_flag;
-+    ScalingList scaling_list;
-+
-+    uint8_t lists_modification_present_flag;
-+    int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
-+    int num_extra_slice_header_bits;
-+    uint8_t slice_header_extension_present_flag;
-+    uint8_t log2_max_transform_skip_block_size;
-+    uint8_t cross_component_prediction_enabled_flag;
-+    uint8_t chroma_qp_offset_list_enabled_flag;
-+    uint8_t diff_cu_chroma_qp_offset_depth;
-+    uint8_t chroma_qp_offset_list_len_minus1;
-+    int8_t cb_qp_offset_list[6];
-+    int8_t cr_qp_offset_list[6];
-+    uint8_t log2_sao_offset_scale_luma;
-+    uint8_t log2_sao_offset_scale_chroma;
-+
-+    // Inferred parameters
-+    uint16_t *column_width;  ///< ColumnWidth
-+    uint16_t *row_height;    ///< RowHeight
-+    uint16_t *col_bd;        ///< ColBd
-+    uint16_t *row_bd;        ///< RowBd
-+    uint16_t *col_idxX;
-+
-+    // We can limit these to uint16_t given our other size limits
-+    uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
-+    uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
-+    uint16_t *tile_id;           ///< TileId
-+    uint16_t *tile_pos_ts;       ///< TilePosRS
-+    uint16_t *tile_size;         ///< TileSize
-+    uint8_t * ctb_ts_flags;
-+
-+    uint8_t data[4096];
-+    int data_size;
-+} HEVCRpiPPS;
-+
-+typedef struct HEVCRpiParamSets {
-+    /* currently active parameter sets */
-+    const HEVCRpiVPS *vps;
-+    const HEVCRpiSPS *sps;
-+    const HEVCRpiPPS *pps;
-+
-+    AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
-+    AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
-+    AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
-+} HEVCRpiParamSets;
-+
-+int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
-+                               HEVCRpiParamSets *ps);
-+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
-+                               HEVCRpiParamSets *ps, int apply_defdispwin);
-+int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
-+                               HEVCRpiParamSets *ps);
-+
-+int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
-+                                      ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);
-+
-+int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
-+                               uint8_t *buf, int buf_size);
-+
-+/**
-+ * Compute POC of the current frame and return it.
-+ */
-+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
-+
-+#endif /* AVCODEC_RPI_HEVC_PS_H */
-diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c
-new file mode 100644
-index 0000000000..8cc5796cf0
---- /dev/null
-+++ b/libavcodec/rpi_hevc_refs.c
-@@ -0,0 +1,485 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "internal.h"
-+#include "thread.h"
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+
-+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
-+{
-+    /* frame->frame can be NULL if context init failed */
-+    if (!frame->frame || !frame->frame->buf[0])
-+        return;
-+
-+    frame->flags &= ~flags;
-+    if (!frame->flags) {
-+        ff_thread_release_buffer(s->avctx, &frame->tf);
-+
-+        av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL
-+        frame->col_mvf = NULL;
-+
-+        frame->collocated_ref = NULL;
-+    }
-+}
-+
-+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
-+{
-+    int i;
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
-+        ff_hevc_rpi_unref_frame(s, &s->DPB[i],
-+                                HEVC_FRAME_FLAG_SHORT_REF |
-+                                HEVC_FRAME_FLAG_LONG_REF);
-+}
-+
-+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s)
-+{
-+    int i;
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
-+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
-+}
-+
-+static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
-+{
-+    int i, ret;
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame * const frame = &s->DPB[i];
-+        if (frame->frame->buf[0])
-+            continue;
-+
-+        ret = ff_thread_get_buffer(s->avctx, &frame->tf,
-+                                   AV_GET_BUFFER_FLAG_REF);
-+        if (ret < 0)
-+            return NULL;
-+
-+        frame->col_mvf = NULL;
-+        frame->col_mvf_buf = NULL;
-+        if (s->used_for_ref && !s->is_irap)
-+        {
-+            frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
-+            if (!frame->col_mvf_buf)
-+                goto fail;
-+            frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data;
-+        }
-+
-+        frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
-+        frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
-+
-+        return frame;
-+
-+fail:
-+        ff_hevc_rpi_unref_frame(s, frame, ~0);
-+        return NULL;
-+    }
-+    av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
-+    return NULL;
-+}
-+
-+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
-+{
-+    HEVCRpiFrame *ref;
-+    int i;
-+
-+    /* check that this POC doesn't already exist */
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *frame = &s->DPB[i];
-+
-+        if (frame->frame->buf[0] && frame->sequence == s->seq_decode &&
-+            frame->poc == poc) {
-+            av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n",
-+                   poc);
-+            return AVERROR_INVALIDDATA;
-+        }
-+    }
-+
-+    ref = alloc_frame(s);
-+    if (!ref)
-+        return AVERROR(ENOMEM);
-+
-+    *frame = ref->frame;
-+    s->ref = ref;
-+
-+    if (s->sh.pic_output_flag)
-+        ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF;
-+    else
-+        ref->flags = HEVC_FRAME_FLAG_SHORT_REF;
-+
-+    ref->poc = poc;
-+    ref->sequence = s->seq_decode;
-+    ref->frame->crop_left = s->ps.sps->output_window.left_offset;
-+    ref->frame->crop_right = s->ps.sps->output_window.right_offset;
-+    ref->frame->crop_top = s->ps.sps->output_window.top_offset;
-+    ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset;
-+
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush)
-+{
-+    do {
-+        int nb_output = 0;
-+        int min_poc = INT_MAX;
-+        int i, min_idx, ret;
-+
-+        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
-+            for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+                HEVCRpiFrame *frame = &s->DPB[i];
-+                if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
-+                    frame->sequence == s->seq_output) {
-+                    ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
-+                }
-+            }
-+        }
-+
-+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCRpiFrame *frame = &s->DPB[i];
-+            if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
-+                frame->sequence == s->seq_output) {
-+                nb_output++;
-+                if (frame->poc < min_poc || nb_output == 1) {
-+                    min_poc = frame->poc;
-+                    min_idx = i;
-+                }
-+            }
-+        }
-+
-+        /* wait for more frames before output */
-+        if (!flush && s->seq_output == s->seq_decode && s->ps.sps &&
-+            nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics)
-+            return 0;
-+
-+        if (nb_output) {
-+            HEVCRpiFrame *frame = &s->DPB[min_idx];
-+            if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1)
-+                return 0;
-+
-+            ret = av_frame_ref(out, frame->frame);
-+            if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
-+                ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
-+            else
-+                ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
-+            if (ret < 0)
-+                return ret;
-+            av_log(s->avctx, AV_LOG_DEBUG,
-+                   "Output frame with POC %d.\n", frame->poc);
-+            return 1;
-+        }
-+
-+        if (s->seq_output != s->seq_decode)
-+            s->seq_output = (s->seq_output + 1) & 0xff;
-+        else
-+            break;
-+    } while (1);
-+
-+    return 0;
-+}
-+
-+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s)
-+{
-+    int dpb = 0;
-+    int min_poc = INT_MAX;
-+    int i;
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *frame = &s->DPB[i];
-+        if ((frame->flags) &&
-+            frame->sequence == s->seq_output &&
-+            frame->poc != s->poc) {
-+            dpb++;
-+        }
-+    }
-+
-+    if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
-+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCRpiFrame *frame = &s->DPB[i];
-+            if ((frame->flags) &&
-+                frame->sequence == s->seq_output &&
-+                frame->poc != s->poc) {
-+                if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) {
-+                    min_poc = frame->poc;
-+                }
-+            }
-+        }
-+
-+        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCRpiFrame *frame = &s->DPB[i];
-+            if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
-+                frame->sequence == s->seq_output &&
-+                frame->poc <= min_poc) {
-+                frame->flags |= HEVC_FRAME_FLAG_BUMPING;
-+            }
-+        }
-+
-+        dpb--;
-+    }
-+}
-+
-+static int init_slice_rpl(HEVCRpiContext *s)
-+{
-+    if (s->slice_idx >= s->rpl_tab_size)
-+        return AVERROR_INVALIDDATA;
-+
-+    s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0;
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
-+{
-+    RpiSliceHeader *sh = &s->sh;
-+
-+    uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1;
-+    uint8_t list_idx;
-+    int i, j, ret;
-+
-+    ret = init_slice_rpl(s);
-+    if (ret < 0)
-+        return ret;
-+
-+    if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
-+          s->rps[LT_CURR].nb_refs)) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    for (list_idx = 0; list_idx < nb_list; list_idx++) {
-+        RefPicList rpl_tmp = { { 0 } };
-+        RefPicList *rpl = &s->refPicList[list_idx];
-+
-+        /* The order of the elements is
-+         * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
-+         * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
-+        int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
-+                              list_idx ? ST_CURR_BEF : ST_CURR_AFT,
-+                              LT_CURR };
-+
-+        /* concatenate the candidate lists for the current frame */
-+        while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) {
-+            for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
-+                RefPicList *rps = &s->rps[cand_lists[i]];
-+                for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) {
-+                    rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j];
-+                    rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j];
-+                    rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2;
-+                    rpl_tmp.nb_refs++;
-+                }
-+            }
-+        }
-+
-+        /* reorder the references if necessary */
-+        if (sh->rpl_modification_flag[list_idx]) {
-+            for (i = 0; i < sh->nb_refs[list_idx]; i++) {
-+                int idx = sh->list_entry_lx[list_idx][i];
-+
-+                if (idx >= rpl_tmp.nb_refs) {
-+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
-+                    return AVERROR_INVALIDDATA;
-+                }
-+
-+                rpl->list[i] = rpl_tmp.list[idx];
-+                rpl->ref[i] = rpl_tmp.ref[idx];
-+                rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
-+                rpl->nb_refs++;
-+            }
-+        } else {
-+            memcpy(rpl, &rpl_tmp, sizeof(*rpl));
-+            rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
-+        }
-+
-+        if (sh->collocated_list == list_idx &&
-+            sh->collocated_ref_idx < rpl->nb_refs)
-+            s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
-+    }
-+
-+    return 0;
-+}
-+
-+static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc)
-+{
-+    int i;
-+    int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1;
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *ref = &s->DPB[i];
-+        if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) {
-+            if ((ref->poc & LtMask) == poc)
-+                return ref;
-+        }
-+    }
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *ref = &s->DPB[i];
-+        if (ref->frame->buf[0] && ref->sequence == s->seq_decode) {
-+            if (ref->poc == poc || (ref->poc & LtMask) == poc)
-+                return ref;
-+        }
-+    }
-+
-+    if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s))
-+        av_log(s->avctx, AV_LOG_ERROR,
-+               "Could not find ref with POC %d\n", poc);
-+    return NULL;
-+}
-+
-+static void mark_ref(HEVCRpiFrame *frame, int flag)
-+{
-+    frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF);
-+    frame->flags |= flag;
-+}
-+
-+static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
-+{
-+    HEVCRpiFrame *frame;
-+    int i, x, y;
-+
-+    frame = alloc_frame(s);
-+    if (!frame)
-+        return NULL;
-+
-+    if (!s->ps.sps->pixel_shift) {
-+        for (i = 0; frame->frame->buf[i]; i++)
-+            memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1),
-+                   frame->frame->buf[i]->size);
-+    } else {
-+        for (i = 0; frame->frame->data[i]; i++)
-+            for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++)
-+                for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) {
-+                    AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x,
-+                            1 << (s->ps.sps->bit_depth - 1));
-+                }
-+    }
-+
-+    frame->poc = poc;
-+    frame->sequence = s->seq_decode;
-+    frame->flags = 0;
-+
-+    ff_hevc_rpi_progress_set_all_done(frame);
-+
-+    return frame;
-+}
-+
-+/* add a reference with the given poc to the list and mark it as used in DPB */
-+static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
-+                             int poc, int ref_flag)
-+{
-+    HEVCRpiFrame *ref = find_ref_idx(s, poc);
-+
-+    if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS)
-+        return AVERROR_INVALIDDATA;
-+
-+    if (!ref) {
-+        ref = generate_missing_ref(s, poc);
-+        if (!ref)
-+            return AVERROR(ENOMEM);
-+    }
-+
-+    list->list[list->nb_refs] = ref->poc;
-+    list->ref[list->nb_refs] = ref;
-+    list->nb_refs++;
-+
-+    mark_ref(ref, ref_flag);
-+    return 0;
-+}
-+
-+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s)
-+{
-+    const ShortTermRPS *short_rps = s->sh.short_term_rps;
-+    const LongTermRPS *long_rps = &s->sh.long_term_rps;
-+    RefPicList *rps = s->rps;
-+    int i, ret = 0;
-+
-+    if (!short_rps) {
-+        rps[0].nb_refs = rps[1].nb_refs = 0;
-+        return 0;
-+    }
-+
-+    /* clear the reference flags on all frames except the current one */
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCRpiFrame *frame = &s->DPB[i];
-+
-+        if (frame == s->ref)
-+            continue;
-+
-+        mark_ref(frame, 0);
-+    }
-+
-+    for (i = 0; i < NB_RPS_TYPE; i++)
-+        rps[i].nb_refs = 0;
-+
-+    /* add the short refs */
-+    for (i = 0; i < short_rps->num_delta_pocs; i++) {
-+        int poc = s->poc + short_rps->delta_poc[i];
-+        int list;
-+
-+        if (!short_rps->used[i])
-+            list = ST_FOLL;
-+        else if (i < short_rps->num_negative_pics)
-+            list = ST_CURR_BEF;
-+        else
-+            list = ST_CURR_AFT;
-+
-+        ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF);
-+        if (ret < 0)
-+            goto fail;
-+    }
-+
-+    /* add the long refs */
-+    for (i = 0; i < long_rps->nb_refs; i++) {
-+        int poc = long_rps->poc[i];
-+        int list = long_rps->used[i] ? LT_CURR : LT_FOLL;
-+
-+        ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF);
-+        if (ret < 0)
-+            goto fail;
-+    }
-+
-+fail:
-+    /* release any frames that are now unused */
-+    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
-+        ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0);
-+
-+    return ret;
-+}
-+
-+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s)
-+{
-+    int ret = 0;
-+    int i;
-+    const ShortTermRPS *rps = s->sh.short_term_rps;
-+    LongTermRPS *long_rps = &s->sh.long_term_rps;
-+
-+    if (rps) {
-+        for (i = 0; i < rps->num_negative_pics; i++)
-+            ret += !!rps->used[i];
-+        for (; i < rps->num_delta_pocs; i++)
-+            ret += !!rps->used[i];
-+    }
-+
-+    if (long_rps) {
-+        for (i = 0; i < long_rps->nb_refs; i++)
-+            ret += !!long_rps->used[i];
-+    }
-+    return ret;
-+}
-diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c
-new file mode 100644
-index 0000000000..cd8149d58e
---- /dev/null
-+++ b/libavcodec/rpi_hevc_sei.c
-@@ -0,0 +1,368 @@
-+/*
-+ * HEVC Supplementary Enhancement Information messages
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2013 Vittorio Giovara
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "golomb.h"
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+
-+static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
-+{
-+    int cIdx, i;
-+    uint8_t hash_type;
-+    //uint16_t picture_crc;
-+    //uint32_t picture_checksum;
-+    hash_type = get_bits(gb, 8);
-+
-+    for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
-+        if (hash_type == 0) {
-+            s->is_md5 = 1;
-+            for (i = 0; i < 16; i++)
-+                s->md5[cIdx][i] = get_bits(gb, 8);
-+        } else if (hash_type == 1) {
-+            // picture_crc = get_bits(gb, 16);
-+            skip_bits(gb, 16);
-+        } else if (hash_type == 2) {
-+            // picture_checksum = get_bits_long(gb, 32);
-+            skip_bits(gb, 32);
-+        }
-+    }
-+    return 0;
-+}
-+
-+static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
-+{
-+    int i;
-+    // Mastering primaries
-+    for (i = 0; i < 3; i++) {
-+        s->display_primaries[i][0] = get_bits(gb, 16);
-+        s->display_primaries[i][1] = get_bits(gb, 16);
-+    }
-+    // White point (x, y)
-+    s->white_point[0] = get_bits(gb, 16);
-+    s->white_point[1] = get_bits(gb, 16);
-+
-+    // Max and min luminance of mastering display
-+    s->max_luminance = get_bits_long(gb, 32);
-+    s->min_luminance = get_bits_long(gb, 32);
-+
-+    // As this SEI message comes before the first frame that references it,
-+    // initialize the flag to 2 and decrement on IRAP access unit so it
-+    // persists for the coded video sequence (e.g., between two IRAPs)
-+    s->present = 2;
-+    return 0;
-+}
-+
-+static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
-+{
-+    // Max and average light levels
-+    s->max_content_light_level = get_bits_long(gb, 16);
-+    s->max_pic_average_light_level = get_bits_long(gb, 16);
-+    // As this SEI message comes before the first frame that references it,
-+    // initialize the flag to 2 and decrement on IRAP access unit so it
-+    // persists for the coded video sequence (e.g., between two IRAPs)
-+    s->present = 2;
-+    return 0;
-+}
-+
-+static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb)
-+{
-+    get_ue_golomb_long(gb); // frame_packing_arrangement_id
-+    s->present = !get_bits1(gb);
-+
-+    if (s->present) {
-+        s->arrangement_type = get_bits(gb, 7);
-+        s->quincunx_subsampling = get_bits1(gb);
-+        s->content_interpretation_type = get_bits(gb, 6);
-+
-+        // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
-+        skip_bits(gb, 3);
-+        s->current_frame_is_frame0_flag = get_bits1(gb);
-+        // frame0_self_contained_flag, frame1_self_contained_flag
-+        skip_bits(gb, 2);
-+
-+        if (!s->quincunx_subsampling && s->arrangement_type != 5)
-+            skip_bits(gb, 16); // frame[01]_grid_position_[xy]
-+        skip_bits(gb, 8);  // frame_packing_arrangement_reserved_byte
-+        skip_bits1(gb);    // frame_packing_arrangement_persistence_flag
-+    }
-+    skip_bits1(gb); // upsampled_aspect_ratio_flag
-+    return 0;
-+}
-+
-+static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb)
-+{
-+    s->present = !get_bits1(gb);
-+
-+    if (s->present) {
-+        s->hflip = get_bits1(gb); // hor_flip
-+        s->vflip = get_bits1(gb); // ver_flip
-+
-+        s->anticlockwise_rotation = get_bits(gb, 16);
-+        skip_bits1(gb); // display_orientation_persistence_flag
-+    }
-+
-+    return 0;
-+}
-+
-+static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps,
-+                                     void *logctx, int size)
-+{
-+    HEVCSEIPictureTiming *h = &s->picture_timing;
-+    HEVCRpiSPS *sps;
-+
-+    if (!ps->sps_list[s->active_seq_parameter_set_id])
-+        return(AVERROR(ENOMEM));
-+    sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data;
-+
-+    if (sps->vui.frame_field_info_present_flag) {
-+        int pic_struct = get_bits(gb, 4);
-+        h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
-+        if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) {
-+            av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
-+            h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
-+        } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) {
-+            av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
-+            h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
-+        }
-+        get_bits(gb, 2); // source_scan_type
-+        get_bits(gb, 1); // duplicate_flag
-+        skip_bits1(gb);
-+        size--;
-+    }
-+    skip_bits_long(gb, 8 * size);
-+
-+    return 0;
-+}
-+
-+static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb,
-+                                                      int size)
-+{
-+    int flag;
-+    int user_data_type_code;
-+    int cc_count;
-+
-+    if (size < 3)
-+        return AVERROR(EINVAL);
-+
-+    user_data_type_code = get_bits(gb, 8);
-+    if (user_data_type_code == 0x3) {
-+        skip_bits(gb, 1); // reserved
-+
-+        flag = get_bits(gb, 1); // process_cc_data_flag
-+        if (flag) {
-+            skip_bits(gb, 1);
-+            cc_count = get_bits(gb, 5);
-+            skip_bits(gb, 8); // reserved
-+            size -= 2;
-+
-+            if (cc_count && size >= cc_count * 3) {
-+                const uint64_t new_size = (s->a53_caption_size + cc_count
-+                                           * UINT64_C(3));
-+                int i, ret;
-+
-+                if (new_size > INT_MAX)
-+                    return AVERROR(EINVAL);
-+
-+                /* Allow merging of the cc data from two fields. */
-+                ret = av_reallocp(&s->a53_caption, new_size);
-+                if (ret < 0)
-+                    return ret;
-+
-+                for (i = 0; i < cc_count; i++) {
-+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+                }
-+                skip_bits(gb, 8); // marker_bits
-+            }
-+        }
-+    } else {
-+        int i;
-+        for (i = 0; i < size - 1; i++)
-+            skip_bits(gb, 8);
-+    }
-+
-+    return 0;
-+}
-+
-+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb,
-+                                                         int size)
-+{
-+    uint32_t country_code;
-+    uint32_t user_identifier;
-+
-+    if (size < 7)
-+        return AVERROR(EINVAL);
-+    size -= 7;
-+
-+    country_code = get_bits(gb, 8);
-+    if (country_code == 0xFF) {
-+        skip_bits(gb, 8);
-+        size--;
-+    }
-+
-+    skip_bits(gb, 8);
-+    skip_bits(gb, 8);
-+
-+    user_identifier = get_bits_long(gb, 32);
-+
-+    switch (user_identifier) {
-+    case MKBETAG('G', 'A', '9', '4'):
-+        return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
-+    default:
-+        skip_bits_long(gb, size * 8);
-+        break;
-+    }
-+    return 0;
-+}
-+
-+static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx)
-+{
-+    int num_sps_ids_minus1;
-+    int i;
-+    unsigned active_seq_parameter_set_id;
-+
-+    get_bits(gb, 4); // active_video_parameter_set_id
-+    get_bits(gb, 1); // self_contained_cvs_flag
-+    get_bits(gb, 1); // num_sps_ids_minus1
-+    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
-+
-+    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
-+        av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    active_seq_parameter_set_id = get_ue_golomb_long(gb);
-+    if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
-+        av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id);
-+        return AVERROR_INVALIDDATA;
-+    }
-+    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
-+
-+    for (i = 1; i <= num_sps_ids_minus1; i++)
-+        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
-+
-+    return 0;
-+}
-+
-+static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
-+{
-+    s->present = 1;
-+    s->preferred_transfer_characteristics = get_bits(gb, 8);
-+    return 0;
-+}
-+
-+static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
-+                                 int type, int size)
-+{
-+    switch (type) {
-+    case 256: // Mismatched value from HM 8.1
-+        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
-+    case HEVC_SEI_TYPE_FRAME_PACKING:
-+        return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
-+    case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
-+        return decode_nal_sei_display_orientation(&s->display_orientation, gb);
-+    case HEVC_SEI_TYPE_PICTURE_TIMING:
-+        return decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
-+    case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
-+        return decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
-+    case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
-+        return decode_nal_sei_content_light_info(&s->content_light, gb);
-+    case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
-+        return decode_nal_sei_active_parameter_sets(s, gb, logctx);
-+    case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
-+        return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
-+    case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
-+        return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
-+    default:
-+        av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
-+        skip_bits_long(gb, 8 * size);
-+        return 0;
-+    }
-+}
-+
-+static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+                                 int type, int size)
-+{
-+    switch (type) {
-+    case HEVC_SEI_TYPE_DECODED_PICTURE_HASH:
-+        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
-+    default:
-+        av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
-+        skip_bits_long(gb, 8 * size);
-+        return 0;
-+    }
-+}
-+
-+static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
-+                                  const HEVCRpiParamSets * const ps, const int nal_unit_type)
-+{
-+    int payload_type = 0;
-+    int payload_size = 0;
-+    int byte = 0xFF;
-+    av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
-+
-+    while (byte == 0xFF) {
-+        if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
-+            return AVERROR_INVALIDDATA;
-+        byte = get_bits(gb, 8);
-+        payload_type += byte;
-+    }
-+    byte = 0xFF;
-+    while (byte == 0xFF) {
-+        if (get_bits_left(gb) < 8 + 8LL*payload_size)
-+            return AVERROR_INVALIDDATA;
-+        byte = get_bits(gb, 8);
-+        payload_size += byte;
-+    }
-+    if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
-+        return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
-+    } else { /* nal_unit_type == NAL_SEI_SUFFIX */
-+        return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
-+    }
-+}
-+
-+static int more_rbsp_data(GetBitContext *gb)
-+{
-+    return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80;
-+}
-+
-+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+                               const HEVCRpiParamSets *ps, int type)
-+{
-+    int ret;
-+
-+    do {
-+        ret = decode_nal_sei_message(gb, logctx, s, ps, type);
-+        if (ret < 0)
-+            return ret;
-+    } while (more_rbsp_data(gb));
-+    return 1;
-+}
-+
-+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s)
-+{
-+    s->a53_caption.a53_caption_size = 0;
-+    av_freep(&s->a53_caption.a53_caption);
-+}
-diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h
-new file mode 100644
-index 0000000000..d4ac348df9
---- /dev/null
-+++ b/libavcodec/rpi_hevc_sei.h
-@@ -0,0 +1,135 @@
-+/*
-+ * HEVC Supplementary Enhancement Information messages
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_SEI_H
-+#define AVCODEC_RPI_HEVC_SEI_H
-+
-+#include <stdint.h>
-+
-+#include "libavutil/md5.h"
-+
-+#include "get_bits.h"
-+
-+/**
-+ * SEI message types
-+ */
-+typedef enum {
-+    HEVC_SEI_TYPE_BUFFERING_PERIOD                     = 0,
-+    HEVC_SEI_TYPE_PICTURE_TIMING                       = 1,
-+    HEVC_SEI_TYPE_PAN_SCAN_RECT                        = 2,
-+    HEVC_SEI_TYPE_FILLER_PAYLOAD                       = 3,
-+    HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35       = 4,
-+    HEVC_SEI_TYPE_USER_DATA_UNREGISTERED               = 5,
-+    HEVC_SEI_TYPE_RECOVERY_POINT                       = 6,
-+    HEVC_SEI_TYPE_SCENE_INFO                           = 9,
-+    HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT                  = 15,
-+    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
-+    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END   = 17,
-+    HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS           = 19,
-+    HEVC_SEI_TYPE_POST_FILTER_HINT                     = 22,
-+    HEVC_SEI_TYPE_TONE_MAPPING_INFO                    = 23,
-+    HEVC_SEI_TYPE_FRAME_PACKING                        = 45,
-+    HEVC_SEI_TYPE_DISPLAY_ORIENTATION                  = 47,
-+    HEVC_SEI_TYPE_SOP_DESCRIPTION                      = 128,
-+    HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS                = 129,
-+    HEVC_SEI_TYPE_DECODING_UNIT_INFO                   = 130,
-+    HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX                = 131,
-+    HEVC_SEI_TYPE_DECODED_PICTURE_HASH                 = 132,
-+    HEVC_SEI_TYPE_SCALABLE_NESTING                     = 133,
-+    HEVC_SEI_TYPE_REGION_REFRESH_INFO                  = 134,
-+    HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO               = 137,
-+    HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO             = 144,
-+    HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
-+} HEVC_SEI_Type;
-+
-+typedef struct HEVCSEIPictureHash {
-+    uint8_t md5[3][16];
-+    uint8_t is_md5;
-+} HEVCSEIPictureHash;
-+
-+typedef struct HEVCSEIFramePacking {
-+    int present;
-+    int arrangement_type;
-+    int content_interpretation_type;
-+    int quincunx_subsampling;
-+    int current_frame_is_frame0_flag;
-+} HEVCSEIFramePacking;
-+
-+typedef struct HEVCSEIDisplayOrientation {
-+    int present;
-+    int anticlockwise_rotation;
-+    int hflip, vflip;
-+} HEVCSEIDisplayOrientation;
-+
-+typedef struct HEVCSEIPictureTiming {
-+    int picture_struct;
-+} HEVCSEIPictureTiming;
-+
-+typedef struct HEVCSEIA53Caption {
-+    int a53_caption_size;
-+    uint8_t *a53_caption;
-+} HEVCSEIA53Caption;
-+
-+typedef struct HEVCSEIMasteringDisplay {
-+    int present;
-+    uint16_t display_primaries[3][2];
-+    uint16_t white_point[2];
-+    uint32_t max_luminance;
-+    uint32_t min_luminance;
-+} HEVCSEIMasteringDisplay;
-+
-+typedef struct HEVCSEIContentLight {
-+    int present;
-+    uint16_t max_content_light_level;
-+    uint16_t max_pic_average_light_level;
-+} HEVCSEIContentLight;
-+
-+typedef struct HEVCSEIAlternativeTransfer {
-+    int present;
-+    int preferred_transfer_characteristics;
-+} HEVCSEIAlternativeTransfer;
-+
-+typedef struct HEVCSEIContext {
-+    HEVCSEIPictureHash picture_hash;
-+    HEVCSEIFramePacking frame_packing;
-+    HEVCSEIDisplayOrientation display_orientation;
-+    HEVCSEIPictureTiming picture_timing;
-+    HEVCSEIA53Caption a53_caption;
-+    HEVCSEIMasteringDisplay mastering_display;
-+    HEVCSEIContentLight content_light;
-+    int active_seq_parameter_set_id;
-+    HEVCSEIAlternativeTransfer alternative_transfer;
-+} HEVCSEIContext;
-+
-+struct HEVCRpiParamSets;
-+
-+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+                               const struct HEVCRpiParamSets *ps, int type);
-+
-+/**
-+ * Reset SEI values that are stored on the Context.
-+ * e.g. Caption data that was extracted during NAL
-+ * parsing.
-+ *
-+ * @param s HEVCRpiContext.
-+ */ -+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s); -+ -+#endif /* AVCODEC_RPI_HEVC_SEI_H */ -diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c -new file mode 100644 -index 0000000000..23b49a99ae ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.c -@@ -0,0 +1,1537 @@ -+#include "rpi_hevc_shader.h" -+ -+#ifdef _MSC_VER -+ #include -+ /* cast through uintptr_t to avoid warnings */ -+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) -+#else -+ #define POINTER_TO_UINT(X) ((unsigned int)(X)) -+#endif -+ -+#ifdef __cplusplus -+extern "C" { /* the types are probably wrong... */ -+#endif -+#ifdef __cplusplus -+} -+#endif -+ -+#ifdef _MSC_VER -+__declspec(align(8)) -+#elif defined(__GNUC__) -+__attribute__((aligned(8))) -+#endif -+unsigned int ff_hevc_rpi_shader[] = { -+// ::mc_setup_c_q0 -+// ::mc_start -+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_c_qn -+/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif -+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift -+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch -+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 -+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num -+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif -+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000118] */ 
0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif -+/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD -+/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y -+// :1 -+/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+// ::mc_filter_c_p -+/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 -+/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* 
[0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 -+/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // 
brr.anyn -, r:1b -+/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c_p_l1 -+/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 -+/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 -+/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x000004e8] */ 
0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next -+/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax -+/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c_b -+/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 -+/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00000628] */ 
0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
-+/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
-+/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
-+/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
-+/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
-+/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
-+/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
-+/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
-+/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
-+/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
-+/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
-+/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
-+/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
-+/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
-+/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
-+/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
-+/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
-+/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
-+/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
-+/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
-+/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
-+/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
-+/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
-+// :1
-+/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
-+/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
-+/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
-+/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
-+/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
-+/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
-+/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
-+/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
-+/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
-+/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
-+/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
-+/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
-+/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
-+/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
-+/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
-+/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
-+/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
-+/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
-+/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
-+/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
-+/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
-+/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
-+/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
-+/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
-+/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
-+/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
-+/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
-+/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_sync_q0
-+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q1
-+/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q2
-+/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q3
-+/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync_q4
-+/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q5
-+/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q6
-+/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q7
-+/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync_q8
-+/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q9
-+/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q10
-+/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q11
-+/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c_qn
-+// ::mc_exit_y_qn
-+/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c_q0
-+// ::mc_exit_y_q0
-+/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
-+/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_setup_y_q0
-+/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_setup_y_qn
-+/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
-+/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
-+/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
-+/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
-+/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
-+/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
-+/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
-+/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
-+/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
-+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
-+/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
-+/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
-+/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
-+/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
-+/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
-+/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
-+/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
-+// :1
-+/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
-+/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
-+/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
-+/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
-+/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
-+/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
-+/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
-+/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
-+/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
-+// :per_block_setup_8
-+/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
-+/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
-+/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
-+/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
-+/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
-+/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
-+/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
-+/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
-+/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
-+/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
-+/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
-+/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
-+/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
-+/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
-+/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
-+/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
-+/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
-+/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
-+/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
-+/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
-+/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
-+/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
-+/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
-+/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
-+/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
-+/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
-+/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
-+/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
-+/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
-+/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
-+/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
-+// ::mc_filter_y_pxx
-+/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
-+/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
-+/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
-+/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
-+/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
-+/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
-+/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
-+/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
-+/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
-+/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
-+/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
-+/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
-+/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
-+/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
-+/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
-+/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
-+/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
-+/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
-+/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_bxx
-+/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
-+/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
-+/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
-+/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
-+/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
-+/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
-+/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
-+/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
-+/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
-+/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
-+/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
-+/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
-+/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
-+/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
-+/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
-+/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
-+/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
-+/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
-+/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
-+/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
-+/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
-+/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
-+/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
-+/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
-+/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_p00
-+/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
-+/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
-+/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
-+/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
-+/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
-+/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
-+/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
-+/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
-+/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
-+/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
-+/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
-+/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
-+/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
-+// :1
-+/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
-+/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
-+/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
-+/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
-+/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
-+/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_b00
-+/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
-+/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
-+/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
-+/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+// :1
-+/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
-+/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
-+/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
-+/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
-+/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
-+/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
-+/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
-+/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_setup_c10_q0
-+/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_setup_c10_qn
-+/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
-+/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
-+/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
-+/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
-+/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
-+/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
-+/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
-+/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
-+/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
-+/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
-+/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
-+/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
-+/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
-+/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
-+/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
-+/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
-+/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
-+/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
-+/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
-+/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1
-+/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
-+/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
-+/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
-+/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
-+/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
-+/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
-+/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
-+// :1
-+/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
-+/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
-+/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
-+/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
-+/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
-+/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
-+// ::mc_filter_c10_p
-+/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
-+/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
-+/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
-+/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
-+/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
-+/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
-+/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
-+/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
-+/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
-+/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
-+/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
-+// :1
-+/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
-+/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
-+/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
-+/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
-+/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
-+/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
-+/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
-+/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
-+/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
-+/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
-+/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
-+/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
-+/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
-+/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c10_p_l1
-+/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
-+/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
-+/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
-+/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
-+/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
-+/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
-+/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
-+/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
-+/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
-+/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
-+/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
-+// :1
-+/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
-+/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
-+/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
-+/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
-+/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
-+/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
-+/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
-+/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
-+/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
-+/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
-+/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
-+/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
-+/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
-+/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c10_b
-+/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
-+/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
-+/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
-+/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
-+/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
-+/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
-+/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
-+/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
-+/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
-+/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
-+/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
-+/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
-+/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
-+/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
-+/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
-+/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
-+/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
-+/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
-+/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
-+/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
-+/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
-+/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
-+/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
-+/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
-+/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
-+// :1
-+/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
-+/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
-+/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
-+/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
-+/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
-+/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
-+/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
-+/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
-+/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
-+/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
-+/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
-+/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
-+/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
-+/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
-+/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
-+/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
-+/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
-+/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
-+/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
-+/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
-+/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
-+/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
-+/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
-+/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
-+/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
-+/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
-+/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_sync10_q0
-+/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q1
-+/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q2
-+/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q3
-+/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync10_q4
-+/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q5
-+/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q6
-+/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q7
-+/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync10_q8
-+/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q9
-+/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q10
-+/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q11
-+/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c10_q0
-+// ::mc_exit_y10_q0
-+/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
-+/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c10_qn
-+// ::mc_exit_y10_qn
-+/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x00002228]
*/ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_setup_y10_q0 -+/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_y10_qn -+/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif -+/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 -+/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 -+/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 -+/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 -+/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift -+/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif -+/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch -+/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add 
rb_base2, ra11, r0 -+/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a -+/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a -+// :1 -+/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 -+/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+// :per_block_setup_10 -+/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif -+/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif -+/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 -+/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif -+/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif -+/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init -+/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002518] */ 
0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul -+/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 -+/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) -+/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add -+/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val -+/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif -+/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif -+/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 -+/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 -+/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d -+/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c -+/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif -+/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 -+/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 -+/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif -+/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 -+/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d -+/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c -+/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d -+/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c -+/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 -+/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif -+/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 -+// ::mc_filter_y10_pxx -+/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 -+/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x00002660] */ 
0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height -+/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; 
mov.ifz rb_base2, rb_base2_next -+/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next -+/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b -+/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_p00 -+/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num -+/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 -+/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif -+/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a -+/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif -+/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif -+/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init -+/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift -+/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 -+/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif -+/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base -+// :1 -+/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, 
ra_y_next ; ldtmu0 -+/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 -+/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b -+/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_bxx -+/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 -+/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 -+/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x00002a10] */ 
0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 -+/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off -+/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 -+/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add -+/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next -+/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 -+/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height -+/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch -+/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* 
[0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 -+/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b -+/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_b00 -+/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1 -+/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 -+/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 -+/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 -+// :1 -+/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 -+/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next -+/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax -+/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 -+/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 -+/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 
-+/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b -+/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_end -+}; -+#ifdef __HIGHC__ -+#pragma Align_to(8, ff_hevc_rpi_shader) -+#endif -diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h -new file mode 100644 -index 0000000000..79651c9b6c ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.h -@@ -0,0 +1,63 @@ -+#ifndef rpi_hevc_shader_H -+#define rpi_hevc_shader_H -+ -+extern unsigned int ff_hevc_rpi_shader[]; -+ -+#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) -+#define mc_start (ff_hevc_rpi_shader + 0) -+#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) -+#define mc_filter_c_p (ff_hevc_rpi_shader + 134) -+#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260) -+#define mc_filter_c_b (ff_hevc_rpi_shader + 386) -+#define mc_sync_q0 (ff_hevc_rpi_shader + 580) -+#define mc_sync_q1 (ff_hevc_rpi_shader + 598) -+#define mc_sync_q2 (ff_hevc_rpi_shader + 610) -+#define mc_sync_q3 (ff_hevc_rpi_shader + 622) -+#define mc_sync_q4 (ff_hevc_rpi_shader + 634) -+#define mc_sync_q5 (ff_hevc_rpi_shader + 652) -+#define mc_sync_q6 (ff_hevc_rpi_shader + 664) -+#define mc_sync_q7 (ff_hevc_rpi_shader + 676) -+#define mc_sync_q8 (ff_hevc_rpi_shader + 688) -+#define mc_sync_q9 (ff_hevc_rpi_shader + 706) -+#define mc_sync_q10 (ff_hevc_rpi_shader + 718) -+#define mc_sync_q11 (ff_hevc_rpi_shader + 730) -+#define mc_exit_c_qn (ff_hevc_rpi_shader + 742) -+#define mc_exit_y_qn (ff_hevc_rpi_shader + 742) -+#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760) -+#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760) -+#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780) -+#define mc_setup_y_qn (ff_hevc_rpi_shader + 782) -+#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014) -+#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140) -+#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272) -+#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358) -+#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432) -+#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434) -+#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562) -+#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684) -+#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806) -+#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996) -+#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014) -+#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026) -+#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038) -+#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050) -+#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068) -+#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080) -+#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092) -+#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104) -+#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122) -+#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134) -+#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146) -+#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158) -+#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158) -+#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178) -+#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178) -+#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196) -+#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198) -+#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440) -+#define mc_filter_y10_p00 
(ff_hevc_rpi_shader + 2566) -+#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654) -+#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786) -+#define mc_end (ff_hevc_rpi_shader + 2860) -+ -+#endif -diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm -new file mode 100644 -index 0000000000..77946a0443 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.qasm -@@ -0,0 +1,1821 @@ -+# Inter pred asm -+# -+# Logic here should be good to 14 bits without modification -+# but only 8 & 10 are currently instantiated & tested -+# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow -+# in _p00 & _b00 -+ -+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress -+# the warning that we are using rotation & ra/rb registers. r0..3 can be -+# rotated through all 16 elems ra regs can only be rotated through their -+# local 4. As it happens this is what is wanted here as we do not want the -+# constants from the other half of the calc. -+ -+# Number limits in P/B calculation -+# -+# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier -+# we offset our intermediates s.t. they always end up +ve before the next -+# multiply (may be -ve whilst summing but that doesn't matter). -+# -+# Range calc for up to 14 bits (Y-B pred): -+# -+# denom: [0, 7] -+# bmax = (1 << bits) - 1 -+# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1] -+# -+# wt_mul: [-128, 255] -+# wt_off = off * 2 + 1: [-bmax, bmax] -+# -+# pel: [0, bmax] -+# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff] -+# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e] -+# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6] -+# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4] -+# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2): -+# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000] -+# -+# This all looks good and is mostly bit depth independant - and as we manage -+# to do unsigned multiplies everywhere (now) this should be good for any bit -+# depth up to 14 (we could probably do 16 - but that requires a few tweaks -+# to the shifts we don't currently have logic for) -+ -+# PREREAD is the number of requests that we have sitting in the TMU request -+# queue. -+# -+# There are 8 slots availible in the TMU request Q for tm0s requests, but -+# only 4 output FIFO entries and overflow is bad (corruption or crash) -+# (If threaded then only 2 out FIFO entries, but we aren't.) -+# In s/w we are effectively limited to the min vertical read which is >= 4 -+# so output FIFO is the limit. 
-+# -+# As the test for read-next is is the main part of the Luma loop (rather than -+# the preload FIFO part) we are limited to min_luma_height - 1 -+# Min_luma_height is 4 so we can only have a preload of 3 -+# Beware that min_chroma_height (and_width) is 2 so we can't do the same trick -+# in chroma without abandoning preload pretty much entirely (which would be bad) -+# -+# Timing tests vs preload of 4 suggests this doesn't hurt us much -+# Could have preread 4 for Chroma but when tested it didn't help -+ -+.set PREREAD, 3 -+ -+# Offset added (effectively) at the exit of the H FIR filter -+# This is enough to force the result +ve -+# Is good if it is a power of 2 as that allows for >> without loss -+# -+# Worst case for a single Y FIR is *-22 so we need an offset of 256*22 -+# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00 -+# Round up to next power of 2 -+ -+.set FIR_OFFSET, 0x4000 -+ -+# Block heights - 8 & 16 are the only numbers we currently support -+ -+.set C_BLK_HEIGHT_8, 16 -+.set C_BLK_HEIGHT_16, 8 -+.set Y_BLK_HEIGHT_8, 16 -+.set Y_BLK_HEIGHT_16, 8 -+ -+# QPU counts - depend on block size -+# If we have a 2-byte format & block_size > 8 then can only afford -+# 8 QPUs -+# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h -+ -+.set N_QPU_8, 12 -+.set N_QPU_16, 12 -+ -+# Value to add to the weight multiplier to convert it into an unsigned value -+# Should be power of two for convienience -+ -+.set LOG2_MUL_ADD, 14 -+.set MUL_ADD, (1 << LOG2_MUL_ADD) -+ -+# Fixed denom (max that it can be set to) -+.set DENOM, 7 -+ -+# register allocation -+# -+ -+# ra0-3 -+# Used as temp and may be loop filter coeffs (split into .8s) -+# or temp in loop. Check usage on an individual basis. -+ -+# ra4-11 -+# V FIFO / temp / free -+ -+# -- free -- ra12 -+ -+# -- free -- ra13 -+ -+# -- free -- ra14 -+ -+# -- free -- ra15 -+ -+# uniform: width:height -+.set ra_width_height, ra16 -+.set ra_width, ra16.16b -+.set ra_height, ra16.16a -+ -+# y:y2 same layout as y_y2_next so we can update both together -+.set ra_y_y2, ra17 -+.set ra_y2, ra17.16a -+.set ra_y, ra17.16b -+ -+# uniform: L1 weight (U on left, V on right) -+# Only used in Y B -+.set ra_wt_off_mul_l1, ra18 -+.set ra_wt_off_l1, ra18.16b -+.set ra_wt_mul_l1, ra18.16a -+ -+# y_next:y2_next same layout as y_y2 so we can update both together -+.set ra_y_y2_next, ra19 -+.set ra_y_next, ra19.16b -+.set ra_y2_next, ra19.16a -+ -+# Setup: consts - subdivide a single register -+.set ra_kff800100, ra20 -+.set ra_k256, ra20.16a -+.set ra_k0, ra20.8a -+.set ra_k1, ra20.8b -+.set ra_k128, ra20.8c -+.set ra_k255, ra20.8d -+ -+# Loop: xshifts -+.set ra_xshift, ra21.16a -+.set ra_xshift_next, ra21.16b -+ -+# Loop var: L0 weight (U on left, V on right) -+# _off_ is not used in loop as we want to modify it before use -+.set ra_wt_off_mul_l0, ra22 -+.set ra_wt_mul_l0, ra22.16a -+.set ra_wt_off_l0, ra22.16b -+ -+# Max pel value (for 8 bit we can get away with sat ops but not 9+) -+# * Could merge with rb_pmask. 
For 10 bit Logically pmask needs 0xff in the -+# 2nd byte but as the source should never be > 3 there 0x3ff should do -+.set ra_blk_height_pmax, ra23 -+.set ra_pmax, ra23.16a -+.set ra_blk_height, ra23.8c -+# --free -- ra23.8d -+ -+# Loop: src frame base (L0) -+.set ra_base, ra24 -+ -+# Misc offsets -+.set ra_fir_off_val_wt_den_p7, ra25 -+.set ra_wt_den_p7, ra25.8a -+# -- free -- ra25.8b -+.set ra_fir_off_val, ra25.16b -+ -+# As it happens these constants are the same -+.if FIR_OFFSET == MUL_ADD -+# Weight multiplier unsigned add -+.set ra_kmul_add, ra_fir_off_val -+.else -+.error "FIR_OFFSET != MUL_ADD: Need new register & init" -+.endif -+ -+# Loop: next src frame base (L0) -+.set ra_base_next, ra26 -+ -+# Loop: height<<23 + width<<16 + vdw_setup_0 -+.set ra_dma0, ra27 -+ -+# Loop: destination address -+.set ra_dest, ra28 -+ -+# Setup: Dup of rb_ef -+# Lo bits are used as Y coeff 0 as that lefts us combine test & coeff mul -+# (top bits are ignored by mul24) -+.set ra_ef, ra29 -+ -+# Use an even numbered register as a link register to avoid corrupting flags -+.set ra_link, ra30 -+ -+# -- free -- ra31 -+ -+.set rb_xshift2, rb0 -+.set rb_xshift2_next, rb1 -+ -+# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2 -+.set rb_elem_x, rb2 -+ -+# El Flags -+# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n -+# Duped into ra_ef as sometimes that is easier to use -+.set rb_ef, rb3 -+ -+# rb4-11 -+# Loop: V filter FIFO or V filter coeff -+ -+# Loop var: offset to add before shift (round + weighting offsets) -+# Exact value varies by loop -+.set rb_wt_off, rb12 -+ -+# -- free -- rb13 -+ -+# -- free -- rb14 -+ -+# Loop: src frame base (L1) -+.set rb_base2, rb15 -+ -+# Line pitch (128 for sand128) -+.set rb_pitch, rb16 -+ -+# Loop count - 2 (set up TMU for next xfer) -+.set rb_i_tmu, rb17 -+ -+# Loop count for min(height, 16) -+# Y will reset & loop again if height > 16 -+.set rb_lcount, rb18 -+ -+# frame_base2_next -+.set rb_base2_next, rb19 -+ -+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give -+# offset to the slice -+.set rb_xpitch, rb20 -+ -+# These 3 consts each save 1 instruction in Y loop setup -+# so whilst they are worthwhile they should be the 1st to die if we need -+# another b reg -+.set rb_y_coeffs_2, rb21 # 0x050b0a00 -+.set rb_y_coeffs_3, rb22 # 0x11283a40 -+.set rb_y_coeffs_5, rb23 # 0x0a0b0500 -+ -+# Setup: 0xff (8-bit) / 0xffff (9+ bit) -+.set rb_pmask, rb24 -+ -+# vdw_setup_1(dst_pitch) -+.set rb_dma1_base, rb25 -+ -+# Setup: pic width - 1 -+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. -+.set rb_max_x, rb26 -+ -+# vdw_setup_0 (depends on QPU number) -+.set rb_dma0_base, rb27 -+ -+# Setup: vw_setup value to reset VPM write pointer -+.set rb_vpm_init, rb28 -+ -+# Loop: vdw_setup_1(dst_pitch-width) = stride -+.set rb_dma1, rb29 -+ -+# Setup: pic_height - 1 -+.set rb_max_y, rb30 -+ -+# Setup: FIR H offset -+.set rb_fir_off_h, rb31 -+ -+ -+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
-+.set i_shift16, -16 -+.set i_shift21, -11 -+.set i_shift23, -9 -+.set i_shift30, -2 -+ -+# Much of the setup code is common between Y & C -+# Macros that express this - obviously these can't be overlapped -+# so are probably unsuitable for loop code -+ -+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma -+ mov r2, qpu_num -+.if v_bit_depth <= 8 -+ # 8 bit version -+ asr r1, r2, 2 -+ shl r1, r1, 6 -+ and r0, r2, 3 -+ or r0, r0, r1 -+ -+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add r_vpm, r0, r1 # VPM 8bit storage -+ -+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+ shl r0, r0, 5 -+ -+.else -+ # 16 bit version -+ # Limited to 8 QPUs if blk height > 8 -+ asr r1, r2, 1 -+.if v_blk_height <= 8 -+ shl r1, r1, 4 -+.else -+ shl r1, r1, 5 -+.endif -+ and r0, r2, 1 -+ or r0, r0, r1 -+ -+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR -+ add r_vpm, r0, r1 -+ -+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into -+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) -+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later -+ shl r0, r0, 6 -+.endif -+ add r_dma, r0, r1 # DMA out -+.endm -+ -+ -+.macro m_setup_q0 -+ srel -, 12 -+.endm -+ -+# Code start label -+::mc_start -+ -+################################################################################ -+# mc_setup_c -+# -+# typedef struct qpu_mc_pred_c_s_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint32_t pic_cw; // C Width (== Y width / 2) -+# uint32_t pic_ch; // C Height (== Y Height / 2) -+# uint32_t stride2; -+# uint32_t stride1; -+# uint32_t wdenom; -+# int16_t y2; -+# int16_t x2; -+# uint32_t base2; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_s_t; -+ -+.macro m_setup_c, v_bit_depth -+ -+# Cannot use mul24 on x as x might be -ve, so must use shift -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_pmask, 0xff -+.set v_blk_height, C_BLK_HEIGHT_8 -+.else -+.set v_x_shift, 2 -+.set v_pmask, 0xffff -+.set v_blk_height, C_BLK_HEIGHT_16 -+.endif -+ -+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y -+ -+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base -+ -+# Read image dimensions -+ sub r0, unif, 1 # pic c width -+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes -+ sub rb_max_y, unif, 1 # pic c height -+ -+# load constants -+ mov ra_kff800100, 0xff800100 -+ mov rb_pmask, v_pmask -+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+ -+# get source pitch -+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2 -+ mov rb_pitch, unif # stride1 -+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly -+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 -+ -+ and r0, 1, elem_num -+ nop ; mul24 r0, r0, 5 -+.if v_bit_depth <= 8 -+ add rb_elem_x, r0, elem_num -+.else -+ add r0, r0, elem_num -+ add rb_elem_x, r0, r0 -+.endif -+ -+# Compute base address for first and second access -+# ra_base ends up with t0s base -+# ra_base2 ends up with t1s base -+ -+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] -+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice -+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y -+ min r0, r0, rb_max_x -+ -+# Get shift -+# Shift will always calculate as 0 for 9+ bit -+# Ideally we can optimize the shift out 
of the code in these cases but for now -+# it is tidier to leave it in -+.if v_bit_depth <= 8 -+ shl ra_xshift_next, r0, 3 -+.else -+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 -+.endif -+ -+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to -+ -+.if v_bit_depth <= 8 -+ and r0, r0, -4 -+.endif -+ sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2 -+ add ra_base, ra_base, r0 -+ -+# Compute part of VPM to use for DMA output -+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? -+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base -+ -+# And again for L1, but only worrying about frame2 stuff -+ -+# Compute base address for first and second access -+# ra_base ends up with t0s base -+# rb_base2 ends up with t1s base -+ -+ shl r0, ra0.16b, v_x_shift -+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset -+ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2 -+ min r0, r0, rb_max_x -+ -+# Get shift (already zero if 9+ bit so ignore) -+.if v_bit_depth <= 8 -+ shl rb_xshift2_next, r0, 3 -+.endif -+ -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs -+ -+.if v_bit_depth <= 8 -+ and r0, r0, -4 -+.endif -+ sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 ; mov r3, PREREAD -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r2, ra_y2 -+ add rb_base2, rb_base2, r0 ; mov r0, ra_y -+ -+# Do preloads -+# r0 = ra_y, r2 = ra_y2, r3 = PREREAD -+ -+:1 -+ sub.setf r3, r3, 1 -+ max r1, r0, 0 -+ min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 -+ -+ max r1, r2, 0 -+ brr.anynz -, r:1b -+ min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, rb_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz 1b -+ -+ mov ra_link, unif # link -+# touch registers to keep simulator happy (and fills in delay slots) -+ mov ra4, 0 ; mov rb4, 0 -+ bra -, ra_link -+ mov ra5, 0 ; mov rb5, 0 -+ mov ra6, 0 ; mov rb6, 0 -+ mov ra7, 0 ; mov rb7, 0 -+# >>> ra_link -+.endm -+ -+::mc_setup_c_q0 -+ m_setup_q0 -+::mc_setup_c_qn -+ m_setup_c 8 -+ -+################################################################################ -+# -+# mc_filter_c_p -+# -+# typedef struct qpu_mc_pred_c_p_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint16_t h; -+# uint16_t w; -+# uint32_t coeffs_x; -+# uint32_t coeffs_y; -+# uint32_t wo_u; -+# uint32_t wo_v; -+# uint32_t dst_addr_c; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_p_t; -+ -+.macro m_filter_c_p, v_tmu, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+.set v_v_shift, 8 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 2 -+.set v_x_mul, 4 -+.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+.if v_tmu == 0 -+.set vrx_xshift, rb_xshift2 # b side more convienient -+.set vrx_xshift_next, ra_xshift_next -+.set vra_y_next, ra_y_next -+.set vrx_base_next, ra_base_next -+.set vra_y, ra_y -+.set vra_base, ra_base -+.set vr_txs, t0s -+.else -+.set vrx_xshift, ra_xshift # a side more convienient -+.set vrx_xshift_next, rb_xshift2_next -+.set vra_y_next, ra_y2_next -+.set vrx_base_next, rb_base2_next -+.set vra_y, ra_y2 -+.set vra_base, rb_base2 -+.set vr_txs, t1s -+.endif -+ -+# denom shift values -+.set 
i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+# per-channel shifts were calculated on the *previous* invocation -+# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y -+ -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base -+ -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 -+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs -+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+ -+.if v_bit_depth <= 8 -+ shl vrx_xshift_next, r0, 3 -+ and r0, r0, -4 -+.endif -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs -+ add vrx_base_next, r3, r0 ; mov r1, ra_height -+ -+# set up VPM write -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight -+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight -+ -+# Misc final setup... -+ -+ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr -+ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2) -+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register -+ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight -+ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4) -+ mov rb11, ra3.8d ; mov ra_link, unif # ; Link -+ -+# r5 = -4 (loop counter) -+# ra_wt_mul_l0 = weight L0 + 128 (now unsigned) -+# rb_wt_off = (offset * 2 + 1) << (wt_den + 5) -+# rb31 = FIR value offset -+ -+# FIFO: rb4, ra5, rb6, ra7 -+# Coeffs in ra3.8a, ra3.8b, rb10, rb11 -+ -+# We want (r0r1) -+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... -+# We fetch (after shift) -+# C0 : C3 : C1 : C4 : C2 : C5 : ... 
-+ -+:1 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+.if v_tmu == 0 -+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 -+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+.else -+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 -+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay] -+.endif -+ -+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+ min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+ -+ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+.if v_tmu == 0 -+ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes -+.else -+ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes -+.endif -+ -+# apply horizontal filter -+# The filter coeffs for the two halves of this are the same (unlike in the -+# Y case) so it doesn't matter which ra0 we get them from -+# Also as the two halves are locked together we don't need to separate the 1st -+# r0 mul or the last r1 mul as they are valid for all QPUs -+ -+ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+ -+# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift) -+# We would like to save the r5->r4 shift but we need a delay slot -+# for both r7 & r6 which we can't find anything to put in if we have -+# already multiplied r4 & r5! 
-+ brr.anyn -, r:1b -+ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post -+ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post -+ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+# >>> .anyn 1b -+ -+ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay] -+ sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+ sub r1, r1, r0 -+ -+ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop -+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop -+ brr.anyn -, r:1b -+ asr r1, r1, i_wt_den_p6 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> .anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_c_p -+ m_filter_c_p 0, 8 -+ -+::mc_filter_c_p_l1 -+ m_filter_c_p 1, 8 -+ -+################################################################################ -+# -+# mc_filter_c_b -+# -+# typedef struct qpu_mc_pred_c_b_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint16_t h; -+# uint16_t w; -+# uint32_t coeffs_x1; -+# uint32_t coeffs_y1; -+# int16_t weight_u1; -+# int16_t weight_v1; -+# int16_t y2; -+# int16_t x2; -+# uint32_t base2; -+# uint32_t coeffs_x2; -+# uint32_t coeffs_y2; -+# uint32_t wo_u2; -+# uint32_t wo_v2; -+# uint32_t dst_addr_c; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_b_t; -+ -+.macro m_filter_c_b, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_v_shift, 8 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 2 -+.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+.set v_x_mul, (1 << v_x_shift) -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+# per-channel shifts were calculated on the *previous* invocation -+ -+# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y -+ -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base -+ -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 -+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs -+ -+.if v_bit_depth <= 8 -+ shl ra_xshift_next, r0, 3 -+.endif -+ -+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) -+ xor r0, r0, r1 ; mul24 r1, 
r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B -+ -+# set up VPM write -+ -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight -+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight -+ -+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 -+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base -+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register -+ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x -+ -+# L1 - uniform layout could possibly be optimized -+ -+ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<>> .anyn 1b -+ -+ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0 -+ sub.setf -, r5, rb_lcount ; mov r0, ra4 -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ add r1, r1, r0 ; mul24 r0, ra7, rb7 -+ -+ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1 -+ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1 -+ sub r2, r2, r0 -+ -+ shr r1, r1, 6 -+ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 -+ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 -+ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add -+ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop -+ -+ brr.anyn -, r:1b -+ asr r1, r1, ra_wt_den_p7 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> .anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_c_b -+ m_filter_c_b 8 -+ -+################################################################################ -+# Exit code used by both Luma & Chroma so place between them to avoid I-cache -+# conflicts -+ -+.macro m_exit_drain -+.if PREREAD == 2 -+# Special case 2 as loop is wasteful -+ nop ; nop ; ldtmu0 -+ nop ; nop ; ldtmu1 -+ nop ; nop ; ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 -+.else -+ mov.setf r3, PREREAD - 1 -+:1 -+ brr.anynz -, r:1b -+ nop ; nop ; ldtmu0 -+ nop ; nop ; ldtmu1 -+ sub.setf r3, r3, 1 -+ # >>> -+ mov -, vw_wait -+.endif -+.endm -+ -+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) -+# All qpus start at the beginning and after that (group - 1) must have finished -+# before (group) can start -+# -+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain -+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - -+# lockup otherwise) -+# -+# There is some, currently ill defined, potential lockup if we have the VDM active -+# whilst doing sem stuff so we wait first. 
?? QPU stall from sem stalls VDM pipe too ?? -+# -+# The code stalled when I had many waiters on a single sem so we have a -+# "ripple" of srels to restart. Unsure why, may have been bug, but this works -+# and we currently have both the memory & sems to support it. -+.macro m_sync_q, n_qpu, n_quads -+# Do not generate code for qpu >= quads * 4 - fns should never be called -+.if n_qpu < n_quads * 4 -+ mov ra_link, unif # Can only branch to an a reg (not r0) -+ mov -, vw_wait # [ra_link delay] -+ -+.set n_sem_sync, n_qpu - (n_qpu % 4) -+.set n_sem_in, n_qpu -+.set n_sem_out, n_qpu + 1 -+ -+.if n_qpu % 4 == 0 -+ -+.set n_sem_quad_in, 12 + n_qpu / 4 -+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) -+ -+ sacq -, n_sem_sync -+ sacq -, n_sem_sync -+ sacq -, n_sem_sync -+ bra -, ra_link -+ sacq -, n_sem_quad_in -+ srel -, n_sem_out -+ srel -, n_sem_quad_out -+ -+.else -+ bra -, ra_link -+ srel -, n_sem_sync -+ sacq -, n_sem_in -+.if n_sem_out % 4 != 0 -+ srel -, n_sem_out -+.else -+ nop -+.endif -+.endif -+.endif -+.endm -+ -+.set v_quads8, N_QPU_8 / 4 -+ -+::mc_sync_q0 -+ m_sync_q 0, v_quads8 -+::mc_sync_q1 -+ m_sync_q 1, v_quads8 -+::mc_sync_q2 -+ m_sync_q 2, v_quads8 -+::mc_sync_q3 -+ m_sync_q 3, v_quads8 -+::mc_sync_q4 -+ m_sync_q 4, v_quads8 -+::mc_sync_q5 -+ m_sync_q 5, v_quads8 -+::mc_sync_q6 -+ m_sync_q 6, v_quads8 -+::mc_sync_q7 -+ m_sync_q 7, v_quads8 -+::mc_sync_q8 -+ m_sync_q 8, v_quads8 -+::mc_sync_q9 -+ m_sync_q 9, v_quads8 -+::mc_sync_q10 -+ m_sync_q 10, v_quads8 -+::mc_sync_q11 -+ m_sync_q 11, v_quads8 -+ -+# mc_exit() -+# Chroma & Luma the same now -+ -+.macro m_exit_qn -+ m_exit_drain -+ nop ; nop ; thrend -+ nop -+ nop -+# >>> thrend <<< -+.endm -+ -+::mc_exit_c_qn -+::mc_exit_y_qn -+ m_exit_qn -+ -+ -+ -+# mc_interrupt_exit12() -+ -+.macro m_exit_q0 -+ m_exit_drain -+ sacq -, 12 -+ nop ; nop ; thrend -+ mov interrupt, 1 -+ nop -+# >>> thrend <<< -+.endm -+ -+::mc_exit_c_q0 -+::mc_exit_y_q0 -+ m_exit_q0 -+ -+# LUMA CODE -+ -+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
-+# For P frames we make the second x,y coordinates offset by +8 -+ -+ -+################################################################################ -+# mc_setup -+# -+# typedef struct qpu_mc_pred_y_s_s { -+# qpu_mc_src_t next_src1; -+# qpu_mc_src_t next_src2; -+# uint16_t pic_h; -+# uint16_t pic_w; -+# uint32_t stride2; -+# uint32_t stride1; -+# uint32_t wdenom; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_s_t; -+ -+.macro m_setup_y, v_bit_depth -+ -+# Cannot use mul24 on x as x might be -ve, so must use shift -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_pmask, 0xff -+.set v_blk_height, Y_BLK_HEIGHT_8 -+.else -+.set v_x_shift, 1 -+.set v_pmask, 0xffff -+.set v_blk_height, Y_BLK_HEIGHT_16 -+.endif -+ -+ -+ # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y -+ mov ra9, unif # ref_y_base -+ mov ra1, unif # x2_y2 -+ -+ -+# load constants -+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base -+ -+ mov ra_kff800100, 0xff800100 -+ mov rb_pmask, v_pmask -+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+ mov rb_y_coeffs_2, 0x050b0a00 -+ mov rb_y_coeffs_3, 0x11283a40 -+ mov rb_y_coeffs_5, 0x0a0b0500 -+ -+# Compute part of VPM to use -+ -+# Read image dimensions -+ mov ra3, unif # width_height -+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2 -+.if v_x_shift == 0 -+ sub rb_max_x, ra3.16b, 1 -+.else -+ sub r0, ra3.16b, 1 -+ shl rb_max_x, r0, v_x_shift -+.endif -+ sub rb_max_y, ra3.16a, 1 -+ mov r3, elem_num ; mov rb_pitch, unif # stride1 -+ -+# get destination pitch -+ mov r1, vdw_setup_1(0) # [rb_pitch delay] -+ or rb_dma1_base, r1, rb_pitch -+ -+# Compute base address for first and second access -+ add r0, ra0.16b, r3 # Load x + elem_num -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ -+# X is byte offset - we can only load words - mask -+ -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base, ra9, r0 -+ -+ # r3 still contains elem_num -+ add r0, ra1.16b, r3 # Load x -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts -+ -+ # r2 still contains mask -+ and r0, r0, -4 -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add rb_base2, ra11, r0 -+ -+# Do preloads -+ nop ; mov r0, ra0.16a # ; r0 = y -+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 -+ -+:1 -+ sub.setf r3, r3, 1 -+ max r1, r0, 0 -+ min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 -+ -+ max r1, r2, 0 -+ brr.anynz -, r:1b -+ min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, rb_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz 1b -+ -+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base -+ -+ mov ra_link, unif # Next fn -+ -+# touch vertical context to keep simulator happy -+ mov ra8, 0 ; mov rb8, 0 # [ra_link delay] -+ bra -, ra_link -+ mov ra9, 0 ; mov rb9, 0 -+ mov ra10, 0 ; mov rb10, 0 -+ mov ra11, 0 ; mov rb11, 0 -+# >>> ra_link -+.endm -+ 
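For orientation, the per-element X setup in the macro above reduces to the following hedged C sketch (8-bit case; the function and variable names are mine, not the shader's):

    /* Clamp x into the picture, turn the byte offset into the bit shift
     * used later by shr, and round the address down to a 32-bit word
     * boundary because the TMU can only fetch whole words. */
    static void setup_x(int x, int elem_num, int max_x,
                        int *word_x, unsigned int *xshift)
    {
        int x0 = x + elem_num;              /* one pel per QPU element */
        if (x0 < 0)     x0 = 0;
        if (x0 > max_x) x0 = max_x;
        *xshift = ((unsigned int)x0 & 3) * 8; /* the shader shifts the whole
                                               * offset (shl ..., 3) and lets
                                               * the ALU use the low 5 bits */
        *word_x = x0 & ~3;                  /* and r0, r0, -4 */
    }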
-+::mc_setup_y_q0 -+ m_setup_q0 -+::mc_setup_y_qn -+ m_setup_y 8 -+ -+################################################################################ -+# -+# Start of per-block setup code -+# P and B blocks share the same setup code to save on Icache space -+ -+# get base addresses and per-channel shifts for *next* invocation -+# per-channel shifts were calculated on the *previous* invocation -+ -+# 1st 3 instructions of per_block-setup in branch delay -+# -+# typedef struct qpu_mc_pred_y_p_s { -+# qpu_mc_src_t next_src1; -+# qpu_mc_src_t next_src2; -+# uint16_t h; -+# uint16_t w; -+# uint32_t mymx21; -+# uint32_t wo1; -+# uint32_t wo2; -+# uint32_t dst_addr; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_p_t; -+# -+ -+.macro m_luma_setup, v_bit_depth -+# Hack - QASM may well have label pasting but I have no idea how... -+.if v_bit_depth == 8 -+ brr ra_link, r:per_block_setup_8 -+.elif v_bit_depth == 10 -+ brr ra_link, r:per_block_setup_10 -+.endif -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 -+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+.endm -+ -+.macro m_per_block_setup, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_x_mul, 1 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x -+ -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 -+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base -+ and r1, r0, r2 ; mov ra_y_next, ra0.16a -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y -+ add ra_base_next, ra_base_next, r0 # [ra1 delay] -+ -+ add r0, ra1.16b, r3 # Load x2 -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base -+ shl rb_xshift2_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height -+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes -+ add rb_base2_next, rb_base2_next, r0 -+ -+# get width,height of block (unif load above), r1 = width * pel_size -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height -+ add rb_lcount, r0, (7-8) -+ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val -+ add r0, r0, r1 # Combine width and height of destination area -+ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val -+ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets -+ -+# get filter coefficients and discard unused B frame values -+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight -+ shl ra8, r0, 3 ; mov rb5, ra_k255 -+ -+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) -+ -+# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val -+# but
I can't see a way of doing that that is cheap enough to be worth it -+ -+# Picked out in a slightly random order to space out uniform loads -+ -+ # 1 -+ mov r1, 0x01040400 # [ra8 delay] -+ ror ra2.8b, r1, ra8.8d -+ ror ra0.8b, r1, ra8.8c -+ # 2 -+ ror ra2.8c, rb_y_coeffs_2, ra8.8d -+ ror ra0.8c, rb_y_coeffs_2, ra8.8c -+ # 0 -+ mov r1,0x00010100 # -ve [ra8 delay] -+ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset -+ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 -+ # 7 -+ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000 -+ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address -+ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 -+ # 3 -+ ror ra2.8d, rb_y_coeffs_3, ra8.8d -+ ror ra0.8d, rb_y_coeffs_3, ra8.8c -+ # 5 -+ ror ra3.8b, rb_y_coeffs_5, ra8.8d -+ ror ra1.8b, rb_y_coeffs_5, ra8.8c -+ # 6 -+ mov r1,0x04040100 -+ ror ra3.8c, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val -+ -+ bra -, ra_link -+ # 4 -+ mov r1,0x3a281100 -+ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val -+ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 -+# >>> branch ra_link -+ -+# r5 = -8 -+# r2 = fir_off_val -+# r3 = 128 -+.endm -+ -+:per_block_setup_8 -+ m_per_block_setup 8 -+ -+ -+ -+################################################################################ -+# -+# mc_filter_y_pxx -+# -+# Setup (& therefore uniform struct) shared with _bxx -+# Struct in m_luma_setup -+# -+# We can have 2 separate P reqs here as long as they mate to generate a -+# rectangular output block (i.e. h0 = h1, w0 = 8) -+# -+# At this point we have already issued PREREAD pairs of texture requests for the current block -+ -+.macro m_filter_y_pxx, v_bit_depth -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+ m_luma_setup v_bit_depth -+ -+ shl r1, ra_wt_off_l0, i_wt_den_p5 -+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul -+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+ -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# This loop is identical to the B loop from here ---> -+:1 -+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+ -+ max r2, ra_y, 0 ; mov r1, 0 -+ min r2, r2, rb_max_y ; mov r3, ra_k1 -+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+ add t0s, ra_base, r2 ; mov rb5, rb6 -+ shr r0, r4, ra_xshift ; mov rb6, rb7 -+ -+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes -+ shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+ add t1s, rb_base2, r2 ; mov ra8, ra9 -+ -+# apply horizontal filter -+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 
13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ -+ brr.anyn -, r:1b -+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+ # >>> .anyn 1b (r5 + r5) -+ -+ # apply vertical filter and write to VPM -+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 -+ -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb8 -+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+ add r1, r1, r0 ; mul24 r0, ra11, rb11 -+# <--- to here -+ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height -+ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next -+ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next -+ -+ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next -+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) -+ -+ brr.anyn -, r:1b -+ asr r1, r1, i_wt_den_p6 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> branch.anyn 1b (r5 - rb_lcount) -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_pxx -+ m_filter_y_pxx 8 -+ -+ -+################################################################################ -+ -+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+# -+# Setup (& therefore uniform struct) shared with _pxx -+# Struct in m_luma_setup -+# -+# l0 calc in els 0-7, L1 in 8-15 -+# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh) -+# -+# At this point we have already issued PREREAD pairs of texture requests for the current block -+ -+.macro m_filter_y_bxx, v_bit_depth -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+ m_luma_setup v_bit_depth -+ -+ shl r1, ra_wt_off_l0, i_wt_den_p6 -+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 -+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+ -+# This loop is identical to the P loop from here ---> -+:1 -+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+ -+ max r2, ra_y, 0 ; mov r1, 0 -+ min r2, r2, rb_max_y ; mov r3, ra_k1 -+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+ add t0s, ra_base, r2 ; mov rb5, rb6 -+ shr r0, r4, 
ra_xshift ; mov rb6, rb7 -+ -+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes -+ shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+ add t1s, rb_base2, r2 ; mov ra8, ra9 -+ -+# apply horizontal filter -+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ -+ brr.anyn -, r:1b -+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+ # >>> .anyn 1b (r5 + r5) -+ -+ # apply vertical filter and write to VPM -+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 -+ -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb8 -+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+ add r1, r1, r0 ; mul24 r0, ra11, rb11 -+# <--- to here -+ sub r1, r1, ra4 -+ sub r1, r1, r0 ; mov r2, rb_wt_off -+ -+ asr r1, r1, 6 -+ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 -+ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next -+ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next -+ add r1, r1, r2 ; mov r0, r1 << 8 -+ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height -+ -+ brr.anyn -, r:1b -+ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) -+# >>> branch.anyn 1b (r5 - rb_lcount) -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed block_height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link (ra_height - remaining height) -+ -+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_bxx -+ m_filter_y_bxx 8 -+ 
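Stripping away the FIFO choreography, the per-pixel arithmetic of the B kernel above amounts to this hedged scalar model (8-bit case; the vector rotate r1 << 8 is what folds the L1 lanes onto the L0 lanes before this combine, and wt_off here stands for all the offset and rounding terms folded in at entry):

    /* Weighted bi-prediction combine as per m_filter_y_bxx (sketch only,
     * not the shader's exact register-level sequence). */
    static unsigned char biweight(int filt_l0, int filt_l1,
                                  int w0, int w1, int wt_off, int shift)
    {
        int v = (filt_l0 * w0 + filt_l1 * w1 + wt_off) >> shift; /* asr ra_wt_den_p7 */
        if (v < 0)   v = 0;     /* max vpm, r1, 0      */
        if (v > 255) v = 255;   /* min r1, r1, ra_pmax */
        return (unsigned char)v;
    }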
-+################################################################################ -+# -+# typedef struct qpu_mc_pred_y_p00_s { -+# qpu_mc_src_t next_src1; -+# uint16_t h; -+# uint16_t w; -+# uint32_t wo1; -+# uint32_t dst_addr; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_p00_t; -+ -+.macro m_filter_y_p00, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_x_mul, 1 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+ mov ra0, unif ; mov r0, elem_num # y_x -+ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0 -+ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ -+ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height -+ min r0, r0, rb_max_x ; mov ra_width_height, unif -+ -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 -+ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr -+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write -+ -+# get width,height of block (unif load above) -+# Compute vdw_setup1(dst_pitch-width) -+ shl r1, ra_width, v_x_shift -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+ add r0, r0, r1 # Combine width and height of destination area -+ shl rb_wt_off, ra_wt_off_l0, DENOM + 7 -+ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link -+ add ra_dma0, r0, rb_dma0_base -+ -+:1 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch -+ -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+ -+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+ shl r1, r1, 8 ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+ -+ brr.anyn -, r:1b -+ asr r1, r1, DENOM + 8 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_p00 -+ m_filter_y_p00 8 -+ 
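Since p00 does no filtering, the whole kernel above boils down to one weight/offset per pixel. Assuming the wo = offset*2 + 1 packing noted in the shader template code later in this patch, the shift sequence (shl 8, add rb_wt_off, asr DENOM+8) is equivalent to this sketch:

    /* m_filter_y_p00 pixel math, 8-bit: ((pel * w) >> denom) + offset,
     * rounded to nearest, then clamped; wo packs offset*2 + 1. */
    static unsigned char p00_pel(int pel, int w, int wo, int denom)
    {
        int v = ((pel * w) << 8) + (wo << (denom + 7));
        v >>= denom + 8;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (unsigned char)v;
    }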
-+################################################################################ -+ -+.macro m_filter_y_b00, v_bit_depth -+# luma setup does a fair bit more than we need calculating filter coeffs -+# that we will never use but it saves I-cache to use it (also simple!) -+ m_luma_setup v_bit_depth -+ -+# Fix up vals that were expecting a filter (somewhat icky) -+ mov r2, 1 -+ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want -+ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero -+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 -+ -+:1 -+ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch -+ -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next -+ -+ max r2, ra_y2, 0 -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8subs masks out all but bottom byte -+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+ -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 -+ -+ shl r1, r1, 8 ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+ -+ brr.anyn -, r:1b -+ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_b00 -+ m_filter_y_b00 8 -+ -+################################################################################ -+################################################################################ -+# 10 BIT -+ -+::mc_setup_c10_q0 -+ m_setup_q0 -+::mc_setup_c10_qn -+ m_setup_c 10 -+ -+::mc_filter_c10_p -+ m_filter_c_p 0, 10 -+ -+::mc_filter_c10_p_l1 -+ m_filter_c_p 1, 10 -+ -+ -+::mc_filter_c10_b -+ m_filter_c_b 10 -+ -+# Even if these fns are the same as for other bit depths we want our own copy -+# to keep the code we are using in a single lump to avoid (direct map) cache -+# thrashing -+.set v_quads10, N_QPU_16 / 4 -+ -+::mc_sync10_q0 -+ m_sync_q 0, v_quads10 -+::mc_sync10_q1 -+ m_sync_q 1, v_quads10 -+::mc_sync10_q2 -+ m_sync_q 2, v_quads10 -+::mc_sync10_q3 -+ m_sync_q 3, v_quads10 -+::mc_sync10_q4 -+ m_sync_q 4, v_quads10 -+::mc_sync10_q5 -+ m_sync_q 5, v_quads10 -+::mc_sync10_q6 -+ m_sync_q 6, v_quads10 -+::mc_sync10_q7 -+ m_sync_q 7, v_quads10 -+::mc_sync10_q8 -+ m_sync_q 8, v_quads10 -+::mc_sync10_q9 -+ m_sync_q 9, v_quads10 
-+::mc_sync10_q10 -+ m_sync_q 10, v_quads10 -+::mc_sync10_q11 -+ m_sync_q 11, v_quads10 -+ -+::mc_exit_y10_q0 -+::mc_exit_c10_q0 -+ m_exit_q0 -+ -+::mc_exit_y10_qn -+::mc_exit_c10_qn -+ m_exit_qn -+ -+::mc_setup_y10_q0 -+ m_setup_q0 -+::mc_setup_y10_qn -+ m_setup_y 10 -+ -+:per_block_setup_10 -+ m_per_block_setup 10 -+ -+::mc_filter_y10_pxx -+ m_filter_y_pxx 10 -+ -+::mc_filter_y10_p00 -+ m_filter_y_p00 10 -+ -+::mc_filter_y10_bxx -+ m_filter_y_bxx 10 -+ -+::mc_filter_y10_b00 -+ m_filter_y_b00 10 -+ -+ -+ -+::mc_end -+# Do not add code here because mc_end must appear after all other code. -diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h -new file mode 100644 -index 0000000000..2f06987bb9 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_cmd.h -@@ -0,0 +1,128 @@ -+#ifndef RPI_SHADER_CMD_H -+#define RPI_SHADER_CMD_H -+ -+#pragma pack(push, 4) -+ -+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y -+// If mixed then we are just confused and get a lot of warnings.... -+typedef const uint8_t * qpu_mc_src_addr_t; -+typedef uint8_t * qpu_mc_dst_addr_t; -+#else -+typedef uint32_t qpu_mc_src_addr_t; -+typedef uint32_t qpu_mc_dst_addr_t; -+#endif -+ -+typedef struct qpu_mc_src_s -+{ -+ int16_t y; -+ int16_t x; -+ qpu_mc_src_addr_t base; -+} qpu_mc_src_t; -+ -+ -+typedef struct qpu_mc_pred_c_p_s { -+ qpu_mc_src_t next_src; -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ qpu_mc_dst_addr_t dst_addr_c; -+ uint32_t next_fn; -+} qpu_mc_pred_c_p_t; -+ -+typedef struct qpu_mc_pred_c_b_s { -+ qpu_mc_src_t next_src1; -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x1; -+ uint32_t coeffs_y1; -+ int16_t weight_u1; -+ int16_t weight_v1; -+ qpu_mc_src_t next_src2; -+ uint32_t coeffs_x2; -+ uint32_t coeffs_y2; -+ uint32_t wo_u2; -+ uint32_t wo_v2; -+ qpu_mc_dst_addr_t dst_addr_c; -+ uint32_t next_fn; -+} qpu_mc_pred_c_b_t; -+ -+typedef struct qpu_mc_pred_c_s_s { -+ qpu_mc_src_t next_src1; -+ uint32_t pic_cw; // C Width (== Y width / 2) -+ uint32_t pic_ch; // C Height (== Y Height / 2) -+ uint32_t stride2; -+ uint32_t stride1; -+ qpu_mc_src_t next_src2; -+ uint32_t next_fn; -+} qpu_mc_pred_c_s_t; -+ -+typedef struct qpu_mc_pred_c_s { -+ union { -+ qpu_mc_pred_c_p_t p; -+ qpu_mc_pred_c_b_t b; -+ qpu_mc_pred_c_s_t s; -+ }; -+} qpu_mc_pred_c_t; -+ -+ -+typedef struct qpu_mc_pred_y_p_s { -+ qpu_mc_src_t next_src1; -+ qpu_mc_src_t next_src2; -+ uint16_t h; -+ uint16_t w; -+ uint32_t mymx21; -+ uint32_t wo1; -+ uint32_t wo2; -+ qpu_mc_dst_addr_t dst_addr; -+ uint32_t next_fn; -+} qpu_mc_pred_y_p_t; -+ -+typedef struct qpu_mc_pred_y_p00_s { -+ qpu_mc_src_t next_src1; -+ uint16_t h; -+ uint16_t w; -+ uint32_t wo1; -+ qpu_mc_dst_addr_t dst_addr; -+ uint32_t next_fn; -+} qpu_mc_pred_y_p00_t; -+ -+typedef struct qpu_mc_pred_y_s_s { -+ qpu_mc_src_t next_src1; -+ qpu_mc_src_t next_src2; -+ uint16_t pic_h; -+ uint16_t pic_w; -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t next_fn; -+} qpu_mc_pred_y_s_t; -+ -+// Only a useful structure in that it allows us to return something other than a void * -+typedef struct qpu_mc_pred_y_s { -+ union { -+ qpu_mc_pred_y_p_t p; -+ qpu_mc_pred_y_p00_t p00; -+ qpu_mc_pred_y_s_t s; -+ }; -+} qpu_mc_pred_y_t; -+ -+typedef union qpu_mc_pred_cmd_u { -+ qpu_mc_pred_y_t y; -+ qpu_mc_pred_c_t c; -+ uint32_t data[1]; -+} qpu_mc_pred_cmd_t; -+ -+#define QPU_MC_PRED_N_Y8 12 -+#define QPU_MC_PRED_N_C8 12 -+ -+#define QPU_MC_PRED_N_Y10 12 -+#define QPU_MC_PRED_N_C10 12 -+ -+#define QPU_MC_DENOM 7 -+ -+#pragma pack(pop) 
-+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c -new file mode 100644 -index 0000000000..577850a6b4 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template.c -@@ -0,0 +1,61 @@ -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "rpi_hevc_shader_cmd.h" -+#include "rpi_hevc_shader_template.h" -+ -+typedef struct shader_track_s -+{ -+ const union qpu_mc_pred_cmd_u *qpu_mc_curr; -+ const struct qpu_mc_src_s *last_l0; -+ const struct qpu_mc_src_s *last_l1; -+ uint32_t width; // pic_width * PW -+ uint32_t height; -+ uint32_t stride2; -+ uint32_t stride1; -+} shader_track_t; -+ -+static int wtoidx(const unsigned int w) -+{ -+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; -+ return pel_weight[w]; -+} -+ -+static const int fctom(uint32_t x) -+{ -+ int rv; -+ // As it happens we can take the 2nd filter term & divide it by 8 -+ // (dropping fractions) to get the fractional move -+ rv = 8 - ((x >> 11) & 0xf); -+ av_assert2(rv >= 0 && rv <= 7); -+ return rv; -+} -+ -+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) -+{ -+ return (x << shl) >> shr; -+} -+ -+static inline int woff_p(HEVCRpiContext *const s, int32_t x) -+{ -+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); -+} -+ -+static inline int woff_b(HEVCRpiContext *const s, int32_t x) -+{ -+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); -+} -+ -+static inline int wweight(int32_t x) -+{ -+ return ext(x, 16, 16); -+} -+ -+ -+#define PW 1 -+#include "rpi_hevc_shader_template_fn.h" -+ -+#undef PW -+#define PW 2 -+#include "rpi_hevc_shader_template_fn.h" -+ -diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h -new file mode 100644 -index 0000000000..304d73ea4a ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template.h -@@ -0,0 +1,22 @@ -+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H -+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H -+ -+struct HEVCRpiContext; -+struct HEVCRpiInterPredEnv; -+ -+void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s, -+ const struct HEVCRpiInterPredEnv *const ipe_y, -+ const struct HEVCRpiInterPredEnv *const ipe_c); -+ -+void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s, -+ const struct HEVCRpiInterPredEnv *const ipe_y, -+ const struct HEVCRpiInterPredEnv *const ipe_c); -+ -+void rpi_sand_dump8(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); -+ -+void rpi_sand_dump16(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); -+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h -new file mode 100644 -index 0000000000..59b00d537b ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template_fn.h -@@ -0,0 +1,475 @@ -+#define STRCAT(x,y) x##y -+ -+#if PW == 1 -+#define pixel uint8_t -+#define FUNC(f) STRCAT(f, 8) -+#elif PW == 2 -+#define pixel uint16_t -+#define FUNC(f) STRCAT(f, 16) -+#else -+#error Unexpected PW -+#endif -+ -+#define PATCH_STRIDE (16 * PW) -+ -+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) -+{ -+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { -+ const pixel s = *(const pixel *)src; -+ pixel * d = (pixel *)dst; -+ for 
(unsigned int j = 0; j < w; j += PW) { -+ *d++ = s; -+ } -+ } -+} -+ -+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) -+{ -+ for (unsigned int i = 0; i != h; ++i, dst += stride) { -+ memcpy(dst, src, w); -+ } -+} -+ -+static void FUNC(get_patch_y)(const shader_track_t * const st, -+ uint8_t * dst, const unsigned int dst_stride, -+ const qpu_mc_src_t *src, -+ unsigned int _w, unsigned int _h) -+{ -+ int x = src->x * PW; -+ int y = src->y; -+ int w = _w * PW; -+ int h = _h; -+ int dl = 0; -+ int dr = 0; -+ int dt = 0; -+ int db = 0; -+ -+ if (x < 0) { -+ if (-x >= w) -+ x = PW - w; -+ dl = -x; -+ w += x; -+ x = 0; -+ } -+ if (x + w > st->width) { -+ if (x >= st->width) -+ x = st->width - PW; -+ dr = (x + w) - st->width; -+ w = st->width - x; -+ } -+ -+ // Y -+ if (y < 0) { -+ if (-y >= h) -+ y = 1 - h; -+ dt = -y; -+ h += y; -+ y = 0; -+ } -+ if (y + h > st->height) { -+ if (y >= st->height) -+ y = st->height - 1; -+ db = (y + h) - st->height; -+ h = st->height - y; -+ } -+ -+ dst += dl + dt * dst_stride; -+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); -+ -+ // Edge dup -+ if (dl != 0) -+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); -+ if (dr != 0) -+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); -+ w += dl + dr; -+ dst -= dl; -+ -+ if (dt != 0) -+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); -+ if (db != 0) -+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); -+} -+ -+ -+ -+static void FUNC(get_patch_c)(const shader_track_t * const st, -+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, -+ const qpu_mc_src_t *src, -+ unsigned int _w, unsigned int _h) -+{ -+ int x = src->x * PW; -+ int y = src->y; -+ int w = _w * PW; -+ int h = _h; -+ int dl = 0; -+ int dr = 0; -+ int dt = 0; -+ int db = 0; -+ const int width = st->width; -+ const int height = st->height; -+ -+ if (x < 0) { -+ if (-x >= w) -+ x = PW - w; -+ dl = -x; -+ w += x; -+ x = 0; -+ } -+ if (x + w > width) { -+ if (x >= width) -+ x = width - PW; -+ dr = (x + w) - width; -+ w = width - x; -+ } -+ -+ // Y -+ if (y < 0) { -+ if (-y >= h) -+ y = 1 - h; -+ dt = -y; -+ h += y; -+ y = 0; -+ } -+ if (y + h > height) { -+ if (y >= height) -+ y = height - 1; -+ db = (y + h) - height; -+ h = height - y; -+ } -+ -+ dst_u += dl + dt * dst_stride; -+ dst_v += dl + dt * dst_stride; -+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); -+ -+ // Edge dup -+ if (dl != 0) -+ { -+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); -+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); -+ } -+ if (dr != 0) -+ { -+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); -+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); -+ } -+ w += dl + dr; -+ dst_u -= dl; -+ dst_v -= dl; -+ -+ if (dt != 0) -+ { -+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); -+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); -+ } -+ if (db != 0) -+ { -+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); -+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); -+ } -+} -+ -+// w, y, w, h in pixels -+// stride1, stride2 in bytes -+void FUNC(rpi_sand_dump)(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, 
const int is_c) -+{ -+ const int mask = stride2 == 0 ? ~0 : stride1 - 1; -+ -+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); -+ -+ if (is_c) { -+ x *= 2; -+ w *= 2; -+ } -+ -+ for (int i = y; i != y + h; ++i) { -+ for (int j = x; j != x + w; ++j) { -+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; -+ char sep = is_c && (j & 1) == 0 ? ':' : ' '; -+#if PW == 1 -+ if (j < 0 || i < 0) -+ printf("..%c", sep); -+ else -+ printf("%02x%c", *(const pixel*)p, sep); -+#else -+ if (j < 0 || i < 0) -+ printf("...%c", sep); -+ else -+ printf("%03x%c", *(const pixel*)p, sep); -+#endif -+ } -+ printf("\n"); -+ } -+} -+ -+ -+void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s, -+ const HEVCRpiInterPredEnv *const ipe_y, -+ const HEVCRpiInterPredEnv *const ipe_c) -+{ -+ for (int c_idx = 0; c_idx < 2; ++c_idx) -+ { -+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; -+ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; -+ unsigned int exit_n = 0; -+ -+ if (ipe == NULL || !ipe->used) { -+ continue; -+ } -+ -+ do { -+ for (unsigned int i = 0; i != ipe->n; ++i) { -+ const HEVCRpiInterPredQ * const q = ipe->q + i; -+ shader_track_t * const st = tracka + i; -+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; -+ -+ for (;;) { -+ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1]; -+ -+ if (link == q->code_setup) { -+ if (c_idx == 0) { -+ // Luma -+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; -+ -+ st->height = c->pic_h; -+ st->width = c->pic_w * PW; -+ st->stride1 = c->stride1; -+ st->stride2 = c->stride2; -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else { -+ // Chroma -+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; -+ -+ st->height = c->pic_ch; -+ st->width = c->pic_cw * PW; -+ st->stride1 = c->stride1; -+ st->stride2 = c->stride2; -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ } -+ else if (link == s->qpu.y_pxx) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ const int w1 = FFMIN(c->w, 8); -+ const int w2 = c->w - w1; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ if (w2 > 0) { -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h + 7); -+ } -+ -+ // wo[offset] = offset*2+1 -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); -+ if (w2 > 0) { -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( -+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); -+ } -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_bxx) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t 
patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h + 7); -+ -+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( -+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); -+ -+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), -+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_p00) { -+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ -+ // wo[offset] = offset*2+1 -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); -+ -+ st->last_l0 = &c->next_src1; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_b00) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ av_assert0(c->w <= 16 && c->h <= 64); -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h); -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h); -+ -+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( -+ patch_y3, patch_y1, PATCH_STRIDE, -+ c->h, 0, 0, c->w); -+ -+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), -+ 0, woff_b(s, c->wo2), 0, 0, c->w); -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_pxx) { -+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; -+ const int mx = fctom(c->coeffs_x); -+ const int my = fctom(c->coeffs_y); -+ -+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ 
-+ st->last_l0 = &c->next_src; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_pxx_l1) { -+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; -+ const int mx = fctom(c->coeffs_x); -+ const int my = fctom(c->coeffs_y); -+ -+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ -+ st->last_l1 = &c->next_src; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_bxx) { -+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; -+ const int mx1 = fctom(c->coeffs_x1); -+ const int my1 = fctom(c->coeffs_y1); -+ const int mx2 = fctom(c->coeffs_x2); -+ const int my2 = fctom(c->coeffs_y2); -+ -+ uint8_t patch_u1[PATCH_STRIDE * 72]; -+ uint8_t patch_v1[PATCH_STRIDE * 72]; -+ uint8_t patch_u2[PATCH_STRIDE * 72]; -+ uint8_t patch_v2[PATCH_STRIDE * 72]; -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; -+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); -+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( -+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, mx1, my1, c->w); -+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( -+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, mx1, my1, c->w); -+ -+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( -+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, -+ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2), -+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); -+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( -+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, -+ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2), -+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == q->code_sync) { -+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); -+ break; -+ } -+ else if (link == q->code_exit) { -+ // We expect exit to occur without other sync -+ av_assert0(i == exit_n); -+ ++exit_n; -+ break; -+ } -+ else { -+ av_assert0(0); -+ } -+ } -+ -+ st->qpu_mc_curr = cmd; -+ } -+ } while (exit_n == 0); -+ } -+} -+ -+#undef FUNC -+#undef pixel -+ -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000000..3caef20137 ---- 
/dev/null
-+++ b/libavcodec/rpi_hevc_transform.s
-@@ -0,0 +1,444 @@
-+# ******************************************************************************
-+# Argon Design Ltd.
-+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
-+#
-+# Module : HEVC
-+# Author : Peter de Rivaz
-+# ******************************************************************************
-+
-+# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack)
-+# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions)
-+.set USE_STACK, 0
-+
-+# Lines that fail to assemble start with #:
-+# The script insert_magic_opcodes.sh inserts the machine code directly for these.
-+# HEVC VPU Transform
-+#
-+# Transform matrix can be thought of as
-+#   output row vector = input row vector * transMatrix2
-+#
-+# The even rows of the matrix are symmetric
-+# The odd rows of the matrix are antisymmetric
-+#
-+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
-+#
-+# EXAMPLE
-+#   (a b c d) (1  2  2  1)
-+#             (3  4 -4 -3)
-+#             (5  6  6  5)
-+#             (7  8 -8 -7)
-+#
-+# x=(a c)(1 2) = 1a+5c 2a+6c
-+#        (5 6)
-+#
-+# y=(b d)(3 4) = 3b+7d 4b+8d
-+#        (7 8)
-+#
-+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
-+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
-+#
-+# Final results are (u , v[::-1])
-+#
-+#
-+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
-+# Apply the even matrix first and stop before rounding
-+# Then apply the odd matrix in a full manner:
-+#
-+# First step is to compute partial products with the first input (16 cycles)
-+# 1a  3b  5c  7d   16x1 input coefficients produce 16x16 output
-+# 2a  4b  6c  8d
-+# 2a -4b  6c -8d
-+# 1a -3b  5c -7d
-+#
-+# Second step is to sum partial products into final position (8 cycles)
-+# 1a+3b+5c+7d
-+# 2a+4b+6c+8d
-+# 2a-4b+6c-8d
-+# 1a-3b+5c-7d
-+#
-+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
-+#
-+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
-+#
-+# For 8x8 we could compute two in parallel.
-+#
-+#
-+
-+# Columns are transformed first
-+#
-+# Store top left half of transMatrix2 in HX(32,0)
-+# Store bottom left half of transMatrix2 in HX(32,32)
-+#
-+# For 16x16
-+# HX(0:15,0) contains input data before transform
-+# HY(0:15,0) contains 32bit output data after transform
-+# HX(32,0) contains even rows of left half of transMatrix2
-+# HX(32,32) contains odd rows of left half of transMatrix2
-+# HY(48,0) contains partial products ready for summing
-+#
-+
-+
-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# coeffs32 -+# num32: number of 32x32 transforms -+# command 0 for transform, 1 for memclear16(int16_t *dst,num16) -+# -+ -+.equ TRANS_SHIFT, 20 - BIT_DEPTH -+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) -+.equ TRANS_ASL2, 16 - TRANS_SHIFT -+ -+ -+hevc_trans_16x16: -+ push r6-r15, lr # TODO cut down number of used registers -+ mov r14,r3 # coeffs32 -+ mov r15,r4 # num32 -+ mov r3, 16*2 # Stride of transMatrix2 in bytes -+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix -+ -+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix -+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ # Now use r0 to describe which matrix we are working on. -+ # Allows us to prefetch the next block of coefficients for efficiency. -+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,TRANS_RND2 # Constant used for rounding second pass -+ -+ sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack -+ -+ add r11,sp,64 # Space for 32 bytes before, and rounding -+ lsr r11,5 -+ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32 -+ -+ lsr r10, r2, 16 # Number of compressed blocks stored in top short -+ extu r2,16 -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+ # r0 VRF location of current block -+ # r1 address of current block -+ # r2 number of 16*16 transforms to do -+ # r3 Stride of coefficients (==32) -+ # r4 TRANS_RND1 (64) -+ # r5 TRANS_RND2 -+ # r6 temporary used inside col_trans16 -+ # r7 16*16*2 total bytes in block -+ # r8 64*16 VRF switch locations -+ # r9 temporary in unpack_coeff for index -+ # r10 number of 16x16 transforms using compression -+ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer) -+ # r12 temporary counter in unpack_coeff -+ # r13 -+ # r14 Save information for 32 bit transform (coeffs location) -+ # r15 Save information for 32 bit transform (number of transforms) -+ cmp r2,0 -+ beq done16x16s -+block_loop: -+ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests -+ cmp r10,0 -+ mov r6, r1 -+ beq not_compressed -+ sub r10, 1 -+ bl unpack16x16 -+not_compressed: -+ #mov r6,r1 # DEBUG without compress -+ vldh HX(0++,0)+r0,(r6 += r3) REP 16 -+ #eor r0,r8 -+ #add r1,r7 -+ # Prefetch the next block -+ #bl unpack16x16 -+ #vldh HX(0++,0)+r0,(r6 += r3) REP 16 -+ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG -+ #eor r0,r8 -+ #sub r1,r7 -+ -+ # Transform the current block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? 
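The even/odd decomposition described in the header comment above is easiest to check in scalar form: because the even matrix rows are symmetric and the odd rows antisymmetric, a full N-point product reduces to two N/2-point products plus one butterfly. A minimal C model of the comment's 4-point example (illustrative only; the VPU code vectorises this 16 lanes at a time and keeps results saturated in 16-bit halves):

    #include <stdint.h>

    // out = in * M for the example matrix with rows
    // (1 2 2 1), (3 4 -4 -3), (5 6 6 5), (7 8 -8 -7):
    // x = even part from (a c), y = odd part from (b d),
    // u = x + y, v = x - y, result = (u, reverse(v)).
    static void trans4_butterfly(const int16_t in[4], int32_t out[4])
    {
        const int32_t x0 = 1 * in[0] + 5 * in[2];
        const int32_t x1 = 2 * in[0] + 6 * in[2];
        const int32_t y0 = 3 * in[1] + 7 * in[3];
        const int32_t y1 = 4 * in[1] + 8 * in[3];
        out[0] = x0 + y0;  // u0 = 1a+3b+5c+7d
        out[1] = x1 + y1;  // u1 = 2a+4b+6c+8d
        out[2] = x1 - y1;  // v1 = 2a-4b+6c-8d
        out[3] = x0 - y0;  // v0 = 1a-3b+5c-7d
    }

The rounding constants follow the usual two-stage HEVC inverse-transform scaling: pass 1 adds r4 = 64 and shifts down by 7 (the vasl by 9 leaves the saturated value in the top 16 bits of each 32-bit lane, hence the "9+7=16" comment), and pass 2 adds TRANS_RND2 and shifts by TRANS_SHIFT = 20 - BIT_DEPTH. The rpi_hevc_transform8.h and rpi_hevc_transform10.h byte tables later in this patch appear to be this source assembled for BIT_DEPTH 8 and 10 respectively, differing only in the rounding and shift immediates (e.g. TRANS_RND2 = 0x800 vs 0x200 in the mov at offset 0x30).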
-+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+done16x16s: -+ -+ add sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+# This returns a value in r6 that says where to load the data from. -+# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it. -+unpack16x16: -+# Clear out destination -+ vmov HX(0,0)+r0,0 -+ mov r6, r11 -+ vsth HX(0,0)+r0,(r6 += r3) REP 16 -+ mov r5, r1 # Moving pointer to input coefficients -+unpack_outer_loop: -+ # Loop until we find the end -+ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous? -+ sub r6,r11,32 -+ #add r6,pc,packed_data-$ # Packed data -+ vsth HX(0,0)+r0,(r6) # Store into packed data -+ mov r12,0 -+unpack_loop: -+ ld r4,(r6) -+ add r6,r6,4 -+ lsr r9,r4,16 # r9 is destination value -+ cmp r4,0 # {value,index} -+ extu r4,8 -+ beq done_unpack -+ sth r9,(r11, r4) -+ addcmpblt r12,1,8,unpack_loop -+# # Read next 16 -+ add r5,32 -+ b unpack_outer_loop -+done_unpack: -+# # Set new load location -+ mov r6, r11 -+ #add r6,pc,unpacked_data-$ -+# # Restore constants -+ mov r4,64 -+ mov r5,TRANS_RND2 -+# pop r6-r15, pc -+ b lr -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# r1/r10 input pointer -+# r0,r4,r5,r6 free -+# r8/r9 output storage -+# -+# Store packed coefficients at r9-32 -+# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows) -+unpack32x32: -+# Clear out destination -+ vmov HX(0,0),0 -+ add r0, r9, 32*32*2 # Unpacked buffer -+ mov r4, 32 -+ vsth HX(0,0),(r0 += r4) REP 64 -+unpack_outer_loop32: -+ # Loop until we find the end -+ vldh HX(0,0),(r1) # TODO would prefetch help here 
while unpacking previous? -+ sub r6,r9,32 -+ #add r6,pc,packed_data-$ # Packed data -+ vsth HX(0,0),(r6) # Store into packed data -+ mov r8,0 -+unpack_loop32: -+ ld r4,(r6) -+ add r6,r6,4 -+ lsr r5,r4,16 # r5 is destination value -+ cmp r4,0 # {value,index} -+ extu r4,10 -+ beq done_unpack -+ sth r5,(r0, r4) -+ addcmpblt r8,1,8,unpack_loop32 -+# # Read next 16 -+ add r1,32 -+ b unpack_outer_loop32 -+done_unpack32: -+ b lr -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done in low 16, number of packed in high 16 -+# -+# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first! -+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ lsr r15,r15,16 # Number that are packed -+ extu r2,16 # Total number -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ -+.if USE_STACK -+ # Stack base allocation -+ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking -+ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it -+ add r8,sp,63 -+ lsr r8,5 -+ lsl r8,5 -+.else -+#:version r8 -+ .half 0x00e8 #AUTOINSERTED -+ btst r8,16 -+#:add r8,pc,intermediate_results-$ -+ .half 0xbfe8 -+ .half intermediate_results-($-2) -+ beq on_vpu1 -+ add r8,r8,32*32*2*2+16*2 # Move to secondary storage -+on_vpu1: -+.endif -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+ -+ cmp r2,0 -+ beq done32x32s -+block_loop32: -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed) -+ cmp r2,r15 -+ bgt not_compressed_32 -+ bl unpack32x32 -+ add r1,r9,32*32*2 # Uncompressed into temporary storage -+ mov r8,r9 # Transform into here -+not_compressed_32: -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, TRANS_RND2 # Constant used for rounding second pass -+ mov r5, TRANS_ASL2 # left shift used for rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+done32x32s: -+ -+.if USE_STACK -+ add sp,sp,32*32*4+64# Restore stack -+.endif -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # 
First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 ,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+.if USE_STACK == 0 -+ .balign 32 -+ -+# .space directives generate 0's in the bin so avoid unnecessary padding by -+# just setting to appropriate value -+.equ intermediate_results, $+16*2 -+ -+# Layout goes: -+# -+#packed_buffer: -+# .space 16*2 -+#intermediate_results: -+# .space 32*32*2 -+#unpacked_buffer: -+# .space 32*32*2 -+# -+#packed_buffer2: -+# .space 16*2 -+#intermediate_results2: -+# .space 32*32*2 -+#unpacked_buffer2: -+# .space 32*32*2 -+.endif -+ -+ -diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h -new file mode 100644 -index 0000000000..1c364492d0 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform10.h -@@ -0,0 +1,94 @@ -+static const unsigned char rpi_hevc_transform10 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 
0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 -+0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 
0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 -+}; -diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h -new file mode 100644 -index 0000000000..1128a2c054 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform8.h -@@ -0,0 +1,94 @@ -+static const unsigned char rpi_hevc_transform8 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 -+0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 
-+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 -+}; -diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c -new file mode 100644 -index 0000000000..c874222ebb ---- /dev/null -+++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,6016 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Mickael Raulet -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2012 - 2013 Wassim Hamidouche -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/common.h" -+#include "libavutil/display.h" -+#include "libavutil/internal.h" -+#include "libavutil/mastering_display_metadata.h" -+#include "libavutil/md5.h" -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/stereo3d.h" -+ -+#include "bswapdsp.h" -+#include "bytestream.h" -+#include "golomb.h" -+#include "hevc.h" -+#include "rpi_hevc_data.h" -+#include "rpi_hevc_parse.h" -+#include "rpi_hevcdec.h" -+#include "rpi_hevc_cabac_fns.h" -+#include "profiles.h" -+#include "hwaccel.h" -+ -+#include "rpi_qpu.h" -+#include "rpi_hevc_shader.h" -+#include "rpi_hevc_shader_cmd.h" -+#include "rpi_hevc_shader_template.h" -+#include "rpi_zc.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#include "pthread.h" -+#include -+ -+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards -+ -+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) -+ -+#ifndef av_mod_uintp2 -+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) -+{ -+ return a & ((1 << p) - 1); -+} -+# define av_mod_uintp2 av_mod_uintp2_c -+#endif -+ -+const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; -+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); -+ -+#define MC_DUMMY_X (-32) -+#define MC_DUMMY_Y (-32) -+ -+// UV & Y both have min 4x4 pred (no 2x2 chroma) -+// Allow for even spread +1 for setup, +1 for rounding -+// As we have load sharing this can (in theory) be exceeded so we have to -+// check after each CTU, but it is a good base size -+ -+// Worst case (all 4x4) commands per CTU -+#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) -+#define QPU_C_CMD_PER_CTU_MAX (8 * 8) -+ -+#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) -+ -+#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP) -+#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) -+ -+#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) -+#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) -+ -+// Total cmds to allocate - allow for slack & setup -+#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX) -+#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX) -+ -+#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2)) -+#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2)) -+ -+// The QPU code for UV blocks only works up to a block width of 8 -+#define RPI_CHROMA_BLOCK_WIDTH 8 -+ -+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) -+ -+ -+// Actual filter goes -ve, +ve, +ve, -ve using these values -+static const uint32_t rpi_filter_coefs[8] = { -+ ENCODE_COEFFS( 0, 64, 0, 0), -+ ENCODE_COEFFS( 2, 58, 10, 2), -+ ENCODE_COEFFS( 4, 54, 16, 2), -+ ENCODE_COEFFS( 6, 46, 28, 4), -+ ENCODE_COEFFS( 4, 36, 36, 4), -+ ENCODE_COEFFS( 4, 28, 46, 6), -+ ENCODE_COEFFS( 2, 16, 54, 4), -+ ENCODE_COEFFS( 2, 10, 58, 2) -+}; -+ -+// Function arrays by QPU -+ -+static const int * const inter_pred_setup_c_qpu[12] = { -+ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, -+ mc_setup_c_qn, mc_setup_c_qn, 
mc_setup_c_qn, mc_setup_c_qn, -+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn -+}; -+ -+static const int * const inter_pred_setup_c10_qpu[12] = { -+ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, -+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, -+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn -+}; -+ -+static const int * const inter_pred_setup_y_qpu[12] = { -+ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, -+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, -+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn -+}; -+ -+static const int * const inter_pred_setup_y10_qpu[12] = { -+ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, -+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, -+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn -+}; -+ -+static const int * const inter_pred_sync_qpu[12] = { -+ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, -+ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, -+ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 -+}; -+ -+static const int * const inter_pred_sync10_qpu[12] = { -+ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, -+ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, -+ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 -+}; -+ -+static const int * const inter_pred_exit_c_qpu[12] = { -+ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, -+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, -+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn -+}; -+ -+static const int * const inter_pred_exit_c10_qpu[12] = { -+ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, -+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, -+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn -+}; -+ -+static const int * const inter_pred_exit_y_qpu[12] = { -+ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, -+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, -+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn -+}; -+ -+static const int * const inter_pred_exit_y10_qpu[12] = { -+ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, -+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, -+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn -+}; -+ -+typedef struct ipe_chan_info_s -+{ -+ const uint8_t bit_depth; -+ const uint8_t n; -+ const int * const * setup_fns; -+ const int * const * sync_fns; -+ const int * const * exit_fns; -+} ipe_chan_info_t; -+ -+typedef struct ipe_init_info_s -+{ -+ ipe_chan_info_t luma; -+ ipe_chan_info_t chroma; -+} ipe_init_info_t; -+ -+static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a) -+{ -+ switch (ln) -+ { -+ default: // normally 0 -+ *b = a; -+ break; -+ case 1: -+ a |= a << 8; -+ *(uint16_t *)b = a; -+ b += stride; -+ *(uint16_t *)b = a; -+ break; -+ case 2: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ break; -+ case 3: -+ { -+ unsigned int i; -+ uint64_t d; -+ a |= a << 8; -+ a |= a << 16; -+ d = ((uint64_t)a << 32) | a; -+ for (i = 0; i != 8; ++i, b += stride) -+ *(uint64_t *)b = d; -+ break; -+ } -+ case 4: -+ { -+ unsigned int i; -+ uint64_t d; -+ a |= a << 8; -+ a |= a << 16; -+ d = ((uint64_t)a << 32) | a; -+ for (i = 0; i != 16; ++i, b += stride) -+ { -+ *(uint64_t *)b = d; -+ 
*(uint64_t *)(b + 8) = d;
-+        }
-+        break;
-+    }
-+    }
-+}
-+
-+// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3
-+// (4 not required)
-+static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
-+{
-+    switch (ln)
-+    {
-+    default: // 0 or -1
-+        *b_u = a;
-+        *b_l = a;
-+        break;
-+    case 1:
-+        a |= a << 8;
-+        *(uint16_t *)b_u = a;
-+        *(uint16_t *)b_l = a;
-+        break;
-+    case 2:
-+        a |= a << 8;
-+        a |= a << 16;
-+        *(uint32_t *)b_u = a;
-+        *(uint32_t *)b_l = a;
-+        break;
-+    case 3:
-+        a |= a << 8;
-+        a |= a << 16;
-+        *(uint32_t *)b_u = a;
-+        *(uint32_t *)(b_u + 4) = a;
-+        *(uint32_t *)b_l = a;
-+        *(uint32_t *)(b_l + 4) = a;
-+        break;
-+    case 4:
-+        a |= a << 8;
-+        a |= a << 16;
-+        *(uint32_t *)b_u = a;
-+        *(uint32_t *)(b_u + 4) = a;
-+        *(uint32_t *)(b_u + 8) = a;
-+        *(uint32_t *)(b_u + 12) = a;
-+        *(uint32_t *)b_l = a;
-+        *(uint32_t *)(b_l + 4) = a;
-+        *(uint32_t *)(b_l + 8) = a;
-+        *(uint32_t *)(b_l + 12) = a;
-+        break;
-+    }
-+}
-+
-+static void zap_cabac_stash(uint8_t * b, const int ln)
-+{
-+    switch (ln)
-+    {
-+    default: // 0
-+        *b = 0;
-+        break;
-+    case 1:
-+        *(uint16_t *)b = 0;
-+        break;
-+    case 2:
-+        *(uint32_t *)b = 0;
-+        break;
-+    case 3:
-+        *(uint32_t *)b = 0;
-+        *(uint32_t *)(b + 4) = 0;
-+        break;
-+    }
-+}
-+
-+
-+
-+// Set a small square block of bits in a bitmap
-+// Bits must be aligned on their size boundary (which will be true of all split CBs)
-+static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
-+{
-+    unsigned int n;
-+    const unsigned int sh = (x & 7);
-+
-+    f += (x >> 3);
-+
-+    av_assert2(ln <= 3);
-+    av_assert2((x & ((1 << ln) - 1)) == 0);
-+
-+    switch (ln)
-+    {
-+    default: // 1
-+        f[0] |= 1 << sh;
-+        break;
-+    case 1: // 3 * 2
-+        n = 3 << sh;
-+        f[0] |= n;
-+        f[stride] |= n;
-+        break;
-+    case 2: // 0xf * 4
-+        n = 0xf << sh;
-+        f[0] |= n;
-+        f[stride] |= n;
-+        f[stride * 2] |= n;
-+        f[stride * 3] |= n;
-+        break;
-+    case 3: // 0xff * 8
-+        for (n = 0; n != 8; ++n, f += stride)
-+            *f = 0xff;
-+        break;
-+    }
-+}
-+
-+static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
-+    { // 8
-+        .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
-+        .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
-+    },
-+    { // 9
-+        .luma = {0},
-+        .chroma = {0}
-+    },
-+    { // 10
-+        .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
-+        .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
-+    }
-+
-+};
-+
-+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
-+{
-+    const unsigned int n = ici->n;
-+    const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
-+
-+    ipe->n = n;
-+    ipe->max_fill = q1_size - ipe->min_gap;
-+    for(unsigned int i = 0; i < n; i++) {
-+        HEVCRpiInterPredQ * const q = ipe->q + i;
-+        q->qpu_mc_curr = q->qpu_mc_base =
-+            (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
-+        q->code_setup = qpu_fn(ici->setup_fns[i]);
-+        q->code_sync = qpu_fn(ici->sync_fns[i]);
-+        q->code_exit = qpu_fn(ici->exit_fns[i]);
-+    }
-+}
-+
-+static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
-+{
-+    av_assert0(bit_depth >= 8 && bit_depth <= 16);
-+
-+    rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
-+}
-+
-+// Unsigned Trivial MOD
-+static inline unsigned int utmod(const unsigned int x, const unsigned int n)
-+{
-+    return x >= n ? x - n : x;
-+}
-+
-+// returns pq->job_n++
-+static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
-+{
-+    unsigned int const x2 = pq->job_n;
-+    pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);
-+    return x2;
-+}
-+
-+static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
-+{
-+    pq->terminate = 0;
-+    pq->job_n = 0;
-+    pq->context = s;
-+    pq->worker = worker;
-+    pq->psem_out = psem_out;
-+    pq->pass_n = n;
-+    pq->started = 0;
-+    sem_init(&pq->sem_in, 0, 0);
-+}
-+
-+static void pass_queue_kill(HEVCRpiPassQueue * const pq)
-+{
-+    sem_destroy(&pq->sem_in);
-+}
-+
-+static inline void rpi_sem_wait(sem_t * const sem)
-+{
-+    while (sem_wait(sem) != 0) {
-+        av_assert0(errno == EINTR);
-+    }
-+}
-+
-+static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
-+{
-+    sem_post(&pq->sem_in);
-+}
-+
-+static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+    // Do the various passes - common with the worker code
-+    for (unsigned int i = 0; i != RPI_PASSES; ++i) {
-+        s->passq[i].worker(s, jb);
-+    }
-+}
-+
-+
-+#if 0
-+static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
-+{
-+    int x;
-+    sem_getvalue((sem_t *)&jbc->sem_out, &x);
-+    printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
-+}
-+#endif
-+
-+
-+static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
-+{
-+    HEVCRpiJob * jb;
-+    HEVCRpiJobGlobal * const jbg = jbc->jbg;
-+
-+    pthread_mutex_lock(&jbg->lock);
-+    // Check local 1st
-+    if ((jb = jbc->jb1) != NULL)
-+    {
-+        // Only 1 - very easy :-)
-+        jbc->jb1 = NULL;
-+    }
-+    else
-+    {
-+        // Now look for global free chain
-+        if ((jb = jbg->free1) != NULL)
-+        {
-+            // Found one - unlink it
-+            jbg->free1 = jb->next;
-+            jb->next = NULL;
-+        }
-+        else
-+        {
-+            // Out of places to look - wait for one to become free - add to Qs
-+
-+            // Global
-+            // If "good" lc then add after the last "good" el in the chain
-+            // otherwise add to the tail
-+            if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
-+            {
-+                // Add to end as we had to wait last time or wait Q empty
-+                if ((lc->jw_prev = jbg->wait_tail) == NULL)
-+                    jbg->wait_head = lc;
-+                else
-+                    lc->jw_prev->jw_next = lc;
-+                lc->jw_next = NULL;
-+                jbg->wait_tail = lc;
-+            }
-+            else
-+            {
-+                // This is a "good" lc that we need to poke into the middle
-+                // of the Q
-+                // We know that the Q isn't empty and there is at least one
-+                // !last_progress_good el in it from the previous test
-+
-+                HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
-+
-+                if (p == NULL)
-+                {
-+                    // No current good els - add to head
-+                    lc->jw_next = jbg->wait_head;
-+                    jbg->wait_head = lc;
-+                }
-+                else
-+                {
-+                    lc->jw_next = p->jw_next;
-+                    p->jw_next = lc;
-+                }
-+
-+                lc->jw_next->jw_prev = lc;
-+                lc->jw_prev = p;
-+            }
-+
-+            // If "good" then we are now the last good waiting el
-+            if (lc->last_progress_good)
-+                jbg->wait_good = lc;
-+
-+            // Local
-+            if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
-+                jbc->lcw_head = lc;
-+            else
-+                lc->ljw_prev->ljw_next = lc;
-+            lc->ljw_next = NULL;
-+            jbc->lcw_tail = lc;
-+        }
-+    }
-+
-+    pthread_mutex_unlock(&jbg->lock);
-+
-+    if (jb == NULL) // Need to wait
-+    {
-+        rpi_sem_wait(&lc->jw_sem);
-+        jb = lc->jw_job; // Set by free code
-+    }
-+
-+    return jb;
-+}
-+
-+
-+static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
-+{
-+    HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock
-+    HEVCRpiJobCtl * jbc = jb->jbc_local;
-+    HEVCRpiLocalContext * lc = NULL;
-+
-+    pthread_mutex_lock(&jbg->lock);
-+
-+    if (jbc != NULL)
-+    {
-+        av_assert1(jbc->jb1 == NULL);
-+
-+        // Release to Local if nothing waiting there
-+        if ((lc = jbc->lcw_head) == NULL)
-+            jbc->jb1 = jb;
-+    }
-+    else
-+    {
-+        // Release to global if nothing waiting there
-+        if ((lc = jbg->wait_head) == NULL)
-+        {
-+            jb->next = jbg->free1;
-+            jbg->free1 = jb;
-+        }
-+        else
-+        {
-+            // ? seems somehow mildly ugly...
-+            jbc = lc->context->jbc;
-+        }
-+    }
-+
-+    if (lc != NULL)
-+    {
-+        // Something was waiting
-+
-+        // Unlink
-+        // Global
-+        if (lc->jw_next == NULL)
-+            jbg->wait_tail = lc->jw_prev;
-+        else
-+            lc->jw_next->jw_prev = lc->jw_prev;
-+
-+        if (lc->jw_prev == NULL)
-+            jbg->wait_head = lc->jw_next;
-+        else
-+            lc->jw_prev->jw_next = lc->jw_next;
-+
-+        // Local
-+        if (lc->ljw_next == NULL)
-+            jbc->lcw_tail = lc->ljw_prev;
-+        else
-+            lc->ljw_next->ljw_prev = lc->ljw_prev;
-+
-+        if (lc->ljw_prev == NULL)
-+            jbc->lcw_head = lc->ljw_next;
-+        else
-+            lc->ljw_prev->ljw_next = lc->ljw_next;
-+
-+        // Update good if required
-+        if (jbg->wait_good == lc)
-+            jbg->wait_good = lc->jw_prev;
-+
-+        // Prod
-+        lc->jw_job = jb;
-+        sem_post(&lc->jw_sem);
-+    }
-+
-+    pthread_mutex_unlock(&jbg->lock);
-+}
-+
-+static void job_lc_kill(HEVCRpiLocalContext * const lc)
-+{
-+    sem_destroy(&lc->jw_sem);
-+}
-+
-+static void job_lc_init(HEVCRpiLocalContext * const lc)
-+{
-+    lc->jw_next = NULL;
-+    lc->jw_prev = NULL;
-+    lc->ljw_next = NULL;
-+    lc->ljw_prev = NULL;
-+    lc->jw_job = NULL;
-+    sem_init(&lc->jw_sem, 0, 0);
-+}
-+
-+// Returns:
-+//  0 if we have waited for MV or expect to wait for recon
-+//  1 if we haven't waited for MV & do not need to wait for recon
-+static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
-+{
-+    if (jb->waited) // reset by rpi_begin
-+        return 0;
-+    for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
-+    {
-+        if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
-+            ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
-+            return 0;
-+    }
-+    return 1;
-+}
-+
-+// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
-+static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
-+{
-+    HEVCRpiJobCtl *const jbc = s->jbc;
-+    HEVCRpiJob * const jb = lc->jb0;
-+
-+    av_assert1(jb != NULL);
-+
-+    if (jb->ctu_ts_last < 0) {
-+        return;
-+    }
-+
-+    lc->last_progress_good = progress_good(s, jb);
-+    jb->waited = !lc->last_progress_good;
-+    lc->jb0 = NULL;
-+
-+    if (s->offload_recon)
-+    {
-+        pthread_mutex_lock(&jbc->in_lock);
-+        jbc->offloadq[jbc->offload_in] = jb;
-+        jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
-+        pthread_mutex_unlock(&jbc->in_lock);
-+
-+        pass_queue_submit_job(s->passq + 0); // Consumes job eventually
-+    }
-+    else
-+    {
-+        pass_queue_do_all(s, jb); // Consumes job before return
-+    }
-+}
-+
-+
-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
-+// available to receive the next job.
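worker_submit_job above hands finished pass-0 jobs to the offload threads through a fixed-size ring (jbc->offloadq) indexed modulo RPI_MAX_JOBS, with the pass queue's sem_in counting submitted jobs and rpi_sem_wait retrying sem_wait on EINTR. A stripped-down model of that producer/consumer handoff, with illustrative names (the real code runs one such queue per pass and adds the last_progress_good ordering seen in job_alloc):

    #include <pthread.h>
    #include <semaphore.h>

    #define N_JOBS 8  // stands in for RPI_MAX_JOBS

    typedef struct job_ring_s {
        void *q[N_JOBS];
        unsigned int in, out;     // producer/consumer cursors, mod N_JOBS
        pthread_mutex_t in_lock;  // producers may race; the consumer is single
        sem_t sem_in;             // counts jobs available to the consumer
    } job_ring_t;

    static void ring_submit(job_ring_t * const r, void * const jb)
    {
        pthread_mutex_lock(&r->in_lock);
        r->q[r->in] = jb;
        r->in = (r->in + 1) % N_JOBS;
        pthread_mutex_unlock(&r->in_lock);
        sem_post(&r->sem_in);  // wake the worker thread
    }

    static void *ring_take(job_ring_t * const r)
    {
        void *jb;
        while (sem_wait(&r->sem_in) != 0)
            /* retry on EINTR, as rpi_sem_wait does */;
        jb = r->q[r->out];
        r->out = (r->out + 1) % N_JOBS;
        return jb;
    }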
-+// -+// Now safe against multiple callers - needed for tiles -+// "normal" and WPP will only call here one at a time -+static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ -+ // It is legit for us to already have a job allocated - do nothing in this case -+ if (lc->jb0 != NULL) -+ return; -+ -+ if (s->offload_recon) -+ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much -+ -+ lc->jb0 = job_alloc(jbc, lc); -+ -+ rpi_begin(s, lc->jb0, lc->ts); -+} -+ -+// Free up a job without submission -+static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ HEVCRpiJob * const jb = lc->jb0; -+ -+ if (jb == NULL) { -+ return; -+ } -+ -+ lc->jb0 = NULL; -+ -+ job_free(jbc, jb); -+ -+ // If offload then poke sem_out too -+ if (s->offload_recon) { -+ sem_post(&jbc->sem_out); -+ } -+} -+ -+ -+// Call this to wait for all jobs to have completed at the end of a frame -+// Slightly icky as there is no clean way to wait for a sem to count up -+// Not reentrant - call on main thread only -+static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ int i = 0; -+ -+ // We shouldn't reach here with an unsubmitted job -+ av_assert1(lc->jb0 == NULL); -+ -+ // If no offload then there can't be anything to wait for -+ if (!s->offload_recon) { -+ return; -+ } -+ -+ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) -+ { -+ for (i = 0; i != RPI_MAX_JOBS; ++i) { -+ rpi_sem_wait(&jbc->sem_out); -+ } -+ for (i = 0; i != RPI_MAX_JOBS; ++i) { -+ sem_post(&jbc->sem_out); -+ } -+ } -+} -+ -+static void * pass_worker(void *arg) -+{ -+ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; -+ HEVCRpiContext *const s = pq->context; -+ -+ for (;;) -+ { -+ rpi_sem_wait(&pq->sem_in); -+ -+ if (pq->terminate) -+ break; -+ -+ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]); -+ // * should really set jb->passes_done here -+ -+ sem_post(pq->psem_out); -+ } -+ return NULL; -+} -+ -+static void pass_queues_start_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); -+ pqs[i].started = 1; -+ } -+} -+ -+static void pass_queues_term_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ pqs[i].terminate = 1; -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ if (pqs[i].started) -+ sem_post(&pqs[i].sem_in); -+ } -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ if (pqs[i].started) { -+ pthread_join(pqs[i].thread, NULL); -+ pqs[i].started = 0; -+ } -+ } -+} -+ -+static void pass_queues_kill_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ pass_queue_kill(pqs + i); -+} -+ -+ -+static void worker_pic_free_one(HEVCRpiJob * const jb) -+{ -+ // Free coeff stuff - allocation not the same for all buffers -+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ -+ if (cf->s[0].buf != NULL) -+ av_freep(&cf->mptr); -+ if (cf->s[2].buf != NULL) -+ gpu_free(&cf->gptr); -+ memset(cf, 0, sizeof(*cf)); -+} -+ -+static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) -+{ -+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ -+ if 
(gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
-+        goto fail;
-+    cf->s[2].buf = (int16_t *)cf->gptr.arm;
-+    cf->s[3].buf = cf->s[2].buf + coeff_count;
-+
-+    // Must be 64 byte aligned for our zero zapping code so over-allocate &
-+    // round
-+    if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
-+        goto fail;
-+    cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
-+    return 0;
-+
-+fail:
-+    av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
-+    worker_pic_free_one(jb);
-+    return -1;
-+}
-+
-+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
-+{
-+    unsigned int i;
-+    for (i = 0; i != 4; ++i) {
-+        cf->s[i].n = 0;
-+#if RPI_COMPRESS_COEFFS
-+        cf->s[i].packed = 1;
-+        cf->s[i].packed_n = 0;
-+#endif
-+    }
-+}
-+
-+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
-+{
-+    HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
-+    int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
-+    cfe->n += n;
-+    return coeffs;
-+}
-+
-+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCRpiFrame * const ref, const int val, const int field)
-+{
-+    if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
-+        HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
-+        HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
-+        sem_t * sem = NULL;
-+
-+        av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
-+        if (((volatile int *)ref->tf.progress->data)[field] < val) {
-+            HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
-+
-+            av_assert1(pwait->req == -1 && pwait->next == NULL);
-+            jb->waited = 1; // Remember that we had to wait for later scheduling
-+
-+            pwait->req = val;
-+            pwait->next = NULL;
-+            if (pstate->first == NULL)
-+                pstate->first = pwait;
-+            else
-+                pstate->last->next = pwait;
-+            pstate->last = pwait;
-+            sem = &pwait->sem;
-+        }
-+        pthread_mutex_unlock(&pstate->lock);
-+
-+        if (sem != NULL) {
-+            rpi_sem_wait(sem);
-+        }
-+    }
-+}
-+
-+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
-+{
-+    HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
-+
-+    ((int *)s->ref->tf.progress->data)[field] = val;
-+
-+    av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
-+    {
-+        HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
-+        HEVCRpiFrameProgressWait * pwait;
-+
-+        while ((pwait = *ppwait) != NULL) {
-+            if (pwait->req > val)
-+            {
-+                ppwait = &pwait->next;
-+                pstate->last = pwait;
-+            }
-+            else
-+            {
-+                *ppwait = pwait->next;
-+                pwait->req = -1;
-+                pwait->next = NULL;
-+                sem_post(&pwait->sem);
-+            }
-+        }
-+    }
-+    pthread_mutex_unlock(&pstate->lock);
-+}
-+
-+static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
-+{
-+    pstate->first = NULL;
-+    pstate->last = NULL;
-+    pthread_mutex_init(&pstate->lock, NULL);
-+}
-+
-+static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
-+{
-+    pwait->req = -1;
-+    pwait->next = NULL;
-+    sem_init(&pwait->sem, 0, 0);
-+}
-+
-+static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
-+{
-+    av_assert1(pstate->first == NULL);
-+    pthread_mutex_destroy(&pstate->lock);
-+}
-+
-+static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
-+{
-+    sem_destroy(&pwait->sem);
-+}
-+
-+
-+/**
-+ * NOTE: Each function hls_foo corresponds to the function foo in the
-+ * specification
(HLS stands for High Level Syntax). -+ */ -+ -+/** -+ * Section 5.7 -+ */ -+ -+// Realloc the entry point arrays -+static int alloc_entry_points(RpiSliceHeader * const sh, const int n) -+{ -+ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0) -+ { -+ // Round up alloc to multiple of 32 -+ int a = (n + 31) & ~31; -+ -+ // We don't care about the previous contents so probably fastest to simply discard -+ av_freep(&sh->entry_point_offset); -+ av_freep(&sh->offset); -+ av_freep(&sh->size); -+ -+ if (a != 0) -+ { -+ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned)); -+ sh->offset = av_malloc_array(a, sizeof(int)); -+ sh->size = av_malloc_array(a, sizeof(int)); -+ -+ if (!sh->entry_point_offset || !sh->offset || !sh->size) { -+ sh->num_entry_point_offsets = 0; -+ sh->offsets_allocated = 0; -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ sh->offsets_allocated = a; -+ } -+ -+ return 0; -+} -+ -+/* free everything allocated by pic_arrays_init() */ -+static void pic_arrays_free(HEVCRpiContext *s) -+{ -+ av_freep(&s->sao); -+ av_freep(&s->deblock); -+ -+ av_freep(&s->cabac_stash_up); -+ s->cabac_stash_left = NULL; // freed with _up -+ -+ av_freep(&s->mvf_up); -+ av_freep(&s->mvf_left); -+ -+ av_freep(&s->is_pcm); -+ av_freep(&s->is_intra_store); -+ s->is_intra = NULL; -+ av_freep(&s->rpl_tab); -+ s->rpl_tab_size = 0; -+ -+ av_freep(&s->qp_y_tab); -+ av_freep(&s->tab_slice_address); -+ av_freep(&s->filter_slice_edges); -+ -+ av_freep(&s->bs_horizontal); -+ s->bs_vertical = NULL; // freed with H -+ av_freep(&s->bsf_stash_left); -+ av_freep(&s->bsf_stash_up); -+ -+ av_freep(&s->rpl_up); -+ av_freep(&s->rpl_left); -+ -+ alloc_entry_points(&s->sh, 0); -+ -+ av_buffer_pool_uninit(&s->col_mvf_pool); -+} -+ -+/* allocate arrays that depend on frame dimensions */ -+static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps) -+{ -+ const unsigned int log2_min_cb_size = sps->log2_min_cb_size; -+ const unsigned int width = sps->width; -+ const unsigned int height = sps->height; -+ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) * -+ ((height >> log2_min_cb_size) + 1); -+ const unsigned int ctb_count = sps->ctb_size; -+ -+ { -+ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); -+ unsigned int h = ((height + 15) & ~15); -+ -+ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size -+ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols -+ } -+ -+ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly -+ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); -+ if (!s->sao || !s->deblock) -+ goto fail; -+ -+ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3)); -+ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3); -+ if (s->cabac_stash_up == NULL) -+ goto fail; -+ -+ // Round width up to max ctb size -+ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); -+ // * Only needed if we have H tiles -+ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); -+ -+ // We can overread by 1 line & one byte in deblock so alloc & zero -+ // We don't need to zero the extra @ start of frame as it will never be -+ // written -+ s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); -+ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); -+ if (s->is_pcm == NULL || 
s->is_intra_store == NULL) -+ goto fail; -+ -+ s->filter_slice_edges = av_mallocz(ctb_count); -+ s->tab_slice_address = av_malloc_array(ctb_count, -+ sizeof(*s->tab_slice_address)); -+ s->qp_y_tab = av_malloc_array(pic_size_in_cb, -+ sizeof(*s->qp_y_tab)); -+ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) -+ goto fail; -+ -+ s->bs_horizontal = av_mallocz(s->bs_size * 2); -+ s->bs_vertical = s->bs_horizontal + s->bs_size; -+ if (s->bs_horizontal == NULL) -+ goto fail; -+ -+ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up)); -+ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left)); -+ if (s->rpl_left == NULL || s->rpl_up == NULL) -+ goto fail; -+ -+ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || -+ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL) -+ goto fail; -+ -+ s->col_mvf_stride = (width + 15) >> 4; -+ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField), -+ av_buffer_allocz); -+ if (s->col_mvf_pool == NULL) -+ goto fail; -+ -+ return 0; -+ -+fail: -+ pic_arrays_free(s); -+ return AVERROR(ENOMEM); -+} -+ -+static void default_pred_weight_table(HEVCRpiContext * const s) -+{ -+ unsigned int i; -+ const unsigned int wt = 1 << QPU_MC_DENOM; -+ s->sh.luma_log2_weight_denom = 0; -+ s->sh.chroma_log2_weight_denom = 0; -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { -+ s->sh.luma_weight_l0[i] = wt; -+ s->sh.luma_offset_l0[i] = 0; -+ s->sh.chroma_weight_l0[i][0] = wt; -+ s->sh.chroma_weight_l0[i][1] = wt; -+ s->sh.chroma_offset_l0[i][0] = 0; -+ s->sh.chroma_offset_l0[i][1] = 0; -+ } -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { -+ s->sh.luma_weight_l1[i] = wt; -+ s->sh.luma_offset_l1[i] = 0; -+ s->sh.chroma_weight_l1[i][0] = wt; -+ s->sh.chroma_weight_l1[i][1] = wt; -+ s->sh.chroma_offset_l1[i][0] = 0; -+ s->sh.chroma_offset_l1[i][1] = 0; -+ } -+} -+ -+static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb, -+ const unsigned int refs, -+ int16_t * luma_weight, int16_t * luma_offset, -+ int16_t * chroma_weight, int16_t * chroma_offset) -+{ -+ unsigned int luma_flags; -+ unsigned int chroma_flags; -+ unsigned int i; -+ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8); -+ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range; -+ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM; -+ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM; -+ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom); -+ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom); -+ -+ if (refs == 0) -+ return 0; -+ -+ luma_flags = get_bits(gb, refs); -+ chroma_flags = ctx_cfmt(s) == 0 ? 
0 : get_bits(gb, refs); -+ i = 1 << (refs - 1); -+ -+ do -+ { -+ if ((luma_flags & i) != 0) -+ { -+ const int delta_weight = get_se_golomb(gb); -+ const int offset = get_se_golomb(gb); -+ if (delta_weight < -128 || delta_weight > 127 || -+ offset < -wp_offset_half_range || offset >= wp_offset_half_range) -+ { -+ return AVERROR_INVALIDDATA; -+ } -+ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift); -+ *luma_offset++ = offset << wp_offset_bd_shift; -+ } -+ else -+ { -+ *luma_weight++ = luma_weight_base; -+ *luma_offset++ = 0; -+ } -+ -+ if ((chroma_flags & i) != 0) -+ { -+ unsigned int j; -+ for (j = 0; j != 2; ++j) -+ { -+ const int delta_weight = get_se_golomb(gb); -+ const int delta_offset = get_se_golomb(gb); -+ -+ if (delta_weight < -128 || delta_weight > 127 || -+ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range) -+ { -+ return AVERROR_INVALIDDATA; -+ } -+ -+ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift); -+ *chroma_offset++ = av_clip( -+ wp_offset_half_range + delta_offset - -+ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom), -+ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift; -+ } -+ } -+ else -+ { -+ *chroma_weight++ = chroma_weight_base; -+ *chroma_weight++ = chroma_weight_base; -+ *chroma_offset++ = 0; -+ *chroma_offset++ = 0; -+ } -+ } while ((i >>= 1) != 0); -+ -+ return 0; -+} -+ -+static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) -+{ -+ int err; -+ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb); -+ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb); -+ -+ if (luma_log2_weight_denom > 7 || -+ chroma_log2_weight_denom > 7) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n", -+ luma_log2_weight_denom, chroma_log2_weight_denom); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; -+ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; -+ -+ if ((err = get_weights(s, gb, s->sh.nb_refs[L0], -+ s->sh.luma_weight_l0, s->sh.luma_offset_l0, -+ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 || -+ (err = get_weights(s, gb, s->sh.nb_refs[L1], -+ s->sh.luma_weight_l1, s->sh.luma_offset_l1, -+ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n"); -+ return err; -+ } -+ -+ return 0; -+} -+ -+static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb) -+{ -+ const HEVCRpiSPS *sps = s->ps.sps; -+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; -+ int prev_delta_msb = 0; -+ unsigned int nb_sps = 0, nb_sh; -+ int i; -+ -+ rps->nb_refs = 0; -+ if (!sps->long_term_ref_pics_present_flag) -+ return 0; -+ -+ if (sps->num_long_term_ref_pics_sps > 0) -+ nb_sps = get_ue_golomb_long(gb); -+ nb_sh = get_ue_golomb_long(gb); -+ -+ if (nb_sps > sps->num_long_term_ref_pics_sps) -+ return AVERROR_INVALIDDATA; -+ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc)) -+ return AVERROR_INVALIDDATA; -+ -+ rps->nb_refs = nb_sh + nb_sps; -+ -+ for (i = 0; i < rps->nb_refs; i++) { -+ uint8_t delta_poc_msb_present; -+ -+ if (i < nb_sps) { -+ uint8_t lt_idx_sps = 0; -+ -+ if (sps->num_long_term_ref_pics_sps > 1) -+ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps)); -+ -+ rps->poc[i] = 
sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps]; -+ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps]; -+ } else { -+ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb); -+ rps->used[i] = get_bits1(gb); -+ } -+ -+ delta_poc_msb_present = get_bits1(gb); -+ if (delta_poc_msb_present) { -+ int64_t delta = get_ue_golomb_long(gb); -+ int64_t poc; -+ -+ if (i && i != nb_sps) -+ delta += prev_delta_msb; -+ -+ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb; -+ if (poc != (int32_t)poc) -+ return AVERROR_INVALIDDATA; -+ rps->poc[i] = poc; -+ prev_delta_msb = delta; -+ } -+ } -+ -+ return 0; -+} -+ -+static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps, -+ const HEVCRpiSPS *sps) -+{ -+ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; -+ const HEVCRpiWindow *ow = &sps->output_window; -+ unsigned int num = 0, den = 0; -+ -+ avctx->pix_fmt = sps->pix_fmt; -+ avctx->coded_width = sps->width; -+ avctx->coded_height = sps->height; -+ avctx->width = sps->width - ow->left_offset - ow->right_offset; -+ avctx->height = sps->height - ow->top_offset - ow->bottom_offset; -+ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics; -+ avctx->profile = sps->ptl.general_ptl.profile_idc; -+ avctx->level = sps->ptl.general_ptl.level_idc; -+ -+ ff_set_sar(avctx, sps->vui.sar); -+ -+ if (sps->vui.video_signal_type_present_flag) -+ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG -+ : AVCOL_RANGE_MPEG; -+ else -+ avctx->color_range = AVCOL_RANGE_MPEG; -+ -+ if (sps->vui.colour_description_present_flag) { -+ avctx->color_primaries = sps->vui.colour_primaries; -+ avctx->color_trc = sps->vui.transfer_characteristic; -+ avctx->colorspace = sps->vui.matrix_coeffs; -+ } else { -+ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; -+ avctx->color_trc = AVCOL_TRC_UNSPECIFIED; -+ avctx->colorspace = AVCOL_SPC_UNSPECIFIED; -+ } -+ -+ if (vps->vps_timing_info_present_flag) { -+ num = vps->vps_num_units_in_tick; -+ den = vps->vps_time_scale; -+ } else if (sps->vui.vui_timing_info_present_flag) { -+ num = sps->vui.vui_num_units_in_tick; -+ den = sps->vui.vui_time_scale; -+ } -+ -+ if (num != 0 && den != 0) -+ av_reduce(&avctx->framerate.den, &avctx->framerate.num, -+ num, den, 1 << 30); -+} -+ -+static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps) -+{ -+ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts; -+ -+ // Admit to no h/w formats -+ -+ *fmt++ = sps->pix_fmt; -+ *fmt = AV_PIX_FMT_NONE; -+ -+ return pix_fmts[0] == AV_PIX_FMT_NONE ? 
AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts); -+} -+ -+static int is_sps_supported(const HEVCRpiSPS * const sps) -+{ -+ return av_rpi_is_sand_format(sps->pix_fmt) && -+ sps->width <= HEVC_RPI_MAX_WIDTH && -+ sps->height <= HEVC_RPI_MAX_HEIGHT; -+} -+ -+static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps, -+ const enum AVPixelFormat pix_fmt) -+{ -+ int ret; -+ -+ pic_arrays_free(s); -+ s->ps.sps = NULL; -+ s->ps.vps = NULL; -+ -+ if (sps == NULL) -+ return 0; -+ -+ if (!is_sps_supported(sps)) -+ return AVERROR_DECODER_NOT_FOUND; -+ -+ ret = pic_arrays_init(s, sps); -+ if (ret < 0) -+ goto fail; -+ -+ export_stream_params(s->avctx, &s->ps, sps); -+ -+ s->avctx->pix_fmt = pix_fmt; -+ -+ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth); -+ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth); -+ -+ // * We don't support cross_component_prediction_enabled_flag but as that -+ // must be 0 unless we have 4:4:4 there is no point testing for it as we -+ // only deal with sand which is never 4:4:4 -+ // [support wouldn't be hard] -+ -+ rpi_hevc_qpu_set_fns(s, sps->bit_depth); -+ -+ av_freep(&s->sao_pixel_buffer_h[0]); -+ av_freep(&s->sao_pixel_buffer_v[0]); -+ -+ if (sps->sao_enabled) -+ { -+ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1; -+ unsigned int c_idx; -+ size_t vsize[3] = {0}; -+ size_t hsize[3] = {0}; -+ -+ for(c_idx = 0; c_idx < c_count; c_idx++) { -+ int w = sps->width >> ctx_hshift(s, c_idx); -+ int h = sps->height >> ctx_vshift(s, c_idx); -+ // ctb height & width are a min of 8 so this must be a multiple of 16 -+ // so no point rounding up! -+ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; -+ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; -+ } -+ -+ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] -+ // when we have plaited chroma -+ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); -+ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); -+ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; -+ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; -+ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; -+ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; -+ } -+ -+ s->ps.sps = sps; -+ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; -+ -+ return 0; -+ -+fail: -+ pic_arrays_free(s); -+ s->ps.sps = NULL; -+ return ret; -+} -+ -+static inline int qp_offset_valid(const int qp_offset) -+{ -+ return qp_offset >= -12 && qp_offset <= 12; -+} -+ -+static int hls_slice_header(HEVCRpiContext * const s) -+{ -+ GetBitContext * const gb = &s->HEVClc->gb; -+ RpiSliceHeader * const sh = &s->sh; -+ int i, ret; -+ -+ // Coded parameters -+ sh->first_slice_in_pic_flag = get_bits1(gb); -+ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) { -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ if (IS_IDR(s)) -+ ff_hevc_rpi_clear_refs(s); -+ } -+ sh->no_output_of_prior_pics_flag = 0; -+ if (IS_IRAP(s)) -+ sh->no_output_of_prior_pics_flag = get_bits1(gb); -+ -+ sh->pps_id = get_ue_golomb_long(gb); -+ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) { -+ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ if (!sh->first_slice_in_pic_flag && -+ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) { -+ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n"); -+ return AVERROR_INVALIDDATA; -+ }
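The "single lump" SAO allocation in set_sps() above is worth a note: the three per-plane line buffers in each direction are sized individually but backed by one av_malloc, with planes 1 and 2 derived as interior pointers so that chroma can run on contiguously from luma when the frame uses plaited chroma. A minimal sketch of the pattern, using plain malloc in place of FFmpeg's allocator (alloc_planes_lump is a name invented here, not code from the patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* One allocation backs all three planes; planes 1 and 2 are interior
     * pointers, so only plane[0] may ever be passed to free(). */
    static int alloc_planes_lump(uint8_t *plane[3], const size_t size[3])
    {
        plane[0] = malloc(size[0] + size[1] + size[2]);
        if (plane[0] == NULL)
            return -1;
        plane[1] = plane[0] + size[0];
        plane[2] = plane[1] + size[1];
        return 0;
    }

This is also why the cleanup paths only av_freep() index [0] of sao_pixel_buffer_h/v.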
-+ s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data; -+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1) -+ sh->no_output_of_prior_pics_flag = 1; -+ -+ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { -+ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; -+ const HEVCRpiSPS *last_sps = s->ps.sps; -+ enum AVPixelFormat pix_fmt; -+ -+ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { -+ if (sps->width != last_sps->width || sps->height != last_sps->height || -+ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering != -+ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering) -+ sh->no_output_of_prior_pics_flag = 0; -+ } -+ ff_hevc_rpi_clear_refs(s); -+ -+ ret = set_sps(s, sps, sps->pix_fmt); -+ if (ret < 0) -+ return ret; -+ -+ pix_fmt = get_format(s, sps); -+ if (pix_fmt < 0) -+ return pix_fmt; -+ -+// ret = set_sps(s, sps, pix_fmt); -+// if (ret < 0) -+// return ret; -+ -+ s->avctx->pix_fmt = pix_fmt; -+ -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ } -+ -+ sh->dependent_slice_segment_flag = 0; -+ if (!sh->first_slice_in_pic_flag) { -+ int slice_address_length; -+ -+ if (s->ps.pps->dependent_slice_segments_enabled_flag) -+ sh->dependent_slice_segment_flag = get_bits1(gb); -+ -+ slice_address_length = av_ceil_log2(s->ps.sps->ctb_size); -+ sh->slice_segment_addr = get_bitsz(gb, slice_address_length); -+ if (sh->slice_segment_addr >= s->ps.sps->ctb_size) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid slice segment address: %u.\n", -+ sh->slice_segment_addr); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (!sh->dependent_slice_segment_flag) { -+ sh->slice_addr = sh->slice_segment_addr; -+ s->slice_idx++; -+ } -+ } else { -+ sh->slice_segment_addr = sh->slice_addr = 0; -+ s->slice_idx = 0; -+ s->slice_initialized = 0; -+ } -+ -+ if (!sh->dependent_slice_segment_flag) { -+ s->slice_initialized = 0; -+ -+ for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++) -+ skip_bits(gb, 1); // slice_reserved_undetermined_flag[] -+ -+ sh->slice_type = get_ue_golomb_long(gb); -+ if (!(sh->slice_type == HEVC_SLICE_I || -+ sh->slice_type == HEVC_SLICE_P || -+ sh->slice_type == HEVC_SLICE_B)) { -+ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n", -+ sh->slice_type); -+ return AVERROR_INVALIDDATA; -+ } -+ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) { -+ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // when flag is not present, picture is inferred to be output -+ sh->pic_output_flag = 1; -+ if (s->ps.pps->output_flag_present_flag) -+ sh->pic_output_flag = get_bits1(gb); -+ -+ if (s->ps.sps->separate_colour_plane_flag) -+ sh->colour_plane_id = get_bits(gb, 2); -+ -+ if (!IS_IDR(s)) { -+ int poc, pos; -+ -+ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb); -+ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type); -+ if (!sh->first_slice_in_pic_flag && poc != s->poc) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc); -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return AVERROR_INVALIDDATA; -+ poc = s->poc; -+ } -+ s->poc = poc; -+ -+ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb); -+ pos = get_bits_left(gb); -+ if (!sh->short_term_ref_pic_set_sps_flag) { -+ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1); -+ if (ret < 0) -+ 
return ret; -+ -+ sh->short_term_rps = &sh->slice_rps; -+ } else { -+ int numbits, rps_idx; -+ -+ if (!s->ps.sps->nb_st_rps) { -+ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ numbits = av_ceil_log2(s->ps.sps->nb_st_rps); -+ rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0; -+ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx]; -+ } -+ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb); -+ -+ pos = get_bits_left(gb); -+ ret = decode_lt_rps(s, &sh->long_term_rps, gb); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n"); -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return AVERROR_INVALIDDATA; -+ } -+ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb); -+ -+ if (s->ps.sps->sps_temporal_mvp_enabled_flag) -+ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb); -+ else -+ sh->slice_temporal_mvp_enabled_flag = 0; -+ } else { -+ s->sh.short_term_rps = NULL; -+ s->poc = 0; -+ } -+ -+ /* 8.3.1 */ -+ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 && -+ s->nal_unit_type != HEVC_NAL_TRAIL_N && -+ s->nal_unit_type != HEVC_NAL_TSA_N && -+ s->nal_unit_type != HEVC_NAL_STSA_N && -+ s->nal_unit_type != HEVC_NAL_RADL_N && -+ s->nal_unit_type != HEVC_NAL_RADL_R && -+ s->nal_unit_type != HEVC_NAL_RASL_N && -+ s->nal_unit_type != HEVC_NAL_RASL_R) -+ s->pocTid0 = s->poc; -+ -+ if (s->ps.sps->sao_enabled) { -+ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb); -+ if (ctx_cfmt(s) != 0) { -+ sh->slice_sample_adaptive_offset_flag[1] = -+ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb); -+ } -+ } else { -+ sh->slice_sample_adaptive_offset_flag[0] = 0; -+ sh->slice_sample_adaptive_offset_flag[1] = 0; -+ sh->slice_sample_adaptive_offset_flag[2] = 0; -+ } -+ -+ sh->nb_refs[L0] = sh->nb_refs[L1] = 0; -+ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) { -+ int nb_refs; -+ -+ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active; -+ -+ if (get_bits1(gb)) { // num_ref_idx_active_override_flag -+ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1; -+ } -+ if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) { -+ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n", -+ sh->nb_refs[L0], sh->nb_refs[L1]); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->rpl_modification_flag[0] = 0; -+ sh->rpl_modification_flag[1] = 0; -+ nb_refs = ff_hevc_rpi_frame_nb_refs(s); -+ if (!nb_refs) { -+ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) { -+ sh->rpl_modification_flag[0] = get_bits1(gb); -+ if (sh->rpl_modification_flag[0]) { -+ for (i = 0; i < sh->nb_refs[L0]; i++) -+ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs)); -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) { -+ sh->rpl_modification_flag[1] = get_bits1(gb); -+ if (sh->rpl_modification_flag[1] == 1) -+ for (i = 0; i < sh->nb_refs[L1]; i++) -+ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs)); -+ } -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->mvd_l1_zero_flag = get_bits1(gb); -+ -+ if (s->ps.pps->cabac_init_present_flag) -+ sh->cabac_init_flag = get_bits1(gb); -+ else -+ sh->cabac_init_flag = 0; -+ -+ sh->collocated_ref_idx = 0; -+ if 
(sh->slice_temporal_mvp_enabled_flag) { -+ sh->collocated_list = L0; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->collocated_list = !get_bits1(gb); -+ -+ if (sh->nb_refs[sh->collocated_list] > 1) { -+ sh->collocated_ref_idx = get_ue_golomb_long(gb); -+ if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid collocated_ref_idx: %d.\n", -+ sh->collocated_ref_idx); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ } -+ -+ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) || -+ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) -+ { -+ if ((ret = pred_weight_table(s, gb)) != 0) -+ return ret; -+ } -+ else -+ { -+ // Give us unit weights -+ default_pred_weight_table(s); -+ } -+ -+ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); -+ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid number of merging MVP candidates: %d.\n", -+ sh->max_num_merge_cand); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ sh->slice_qp_delta = get_se_golomb(gb); -+ -+ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) { -+ sh->slice_cb_qp_offset = get_se_golomb(gb); -+ sh->slice_cr_qp_offset = get_se_golomb(gb); -+ if (!qp_offset_valid(sh->slice_cb_qp_offset) || -+ !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) || -+ !qp_offset_valid(sh->slice_cr_qp_offset) || -+ !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset)) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d)\n", -+ s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset, -+ sh->slice_cb_qp_offset, sh->slice_cr_qp_offset); -+ return AVERROR_INVALIDDATA; -+ } -+ } else -+ { -+ sh->slice_cb_qp_offset = 0; -+ sh->slice_cr_qp_offset = 0; -+ } -+ -+ if (s->ps.pps->chroma_qp_offset_list_enabled_flag) -+ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb); -+ else -+ sh->cu_chroma_qp_offset_enabled_flag = 0; -+ -+ if (s->ps.pps->deblocking_filter_control_present_flag) { -+ int deblocking_filter_override_flag = 0; -+ -+ if (s->ps.pps->deblocking_filter_override_enabled_flag) -+ deblocking_filter_override_flag = get_bits1(gb); -+ -+ if (deblocking_filter_override_flag) { -+ sh->disable_deblocking_filter_flag = get_bits1(gb); -+ if (!sh->disable_deblocking_filter_flag) { -+ int beta_offset_div2 = get_se_golomb(gb); -+ int tc_offset_div2 = get_se_golomb(gb); -+ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 || -+ tc_offset_div2 < -6 || tc_offset_div2 > 6) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid deblock filter offsets: %d, %d\n", -+ beta_offset_div2, tc_offset_div2); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->beta_offset = beta_offset_div2 * 2; -+ sh->tc_offset = tc_offset_div2 * 2; -+ } -+ } else { -+ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf; -+ sh->beta_offset = s->ps.pps->beta_offset; -+ sh->tc_offset = s->ps.pps->tc_offset; -+ } -+ } else { -+ sh->disable_deblocking_filter_flag = 0; -+ sh->beta_offset = 0; -+ sh->tc_offset = 0; -+ } -+ -+ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag && -+ (sh->slice_sample_adaptive_offset_flag[0] || -+ sh->slice_sample_adaptive_offset_flag[1] || -+ !sh->disable_deblocking_filter_flag)) { -+ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb); -+ } else { -+ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag; -+ } -+ sh->no_dblk_boundary_flags = -+ (sh->slice_loop_filter_across_slices_enabled_flag ?
0 : -+ BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) | -+ (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 : -+ BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE); -+ -+ -+ } else if (!s->slice_initialized) { -+ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->num_entry_point_offsets = 0; -+ sh->offload_wpp = 0; -+ sh->offload_tiles = 0; -+ -+ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { -+ unsigned num_entry_point_offsets = get_ue_golomb_long(gb); -+ // It would be possible to bound this tighter but this is simpler -+ if (num_entry_point_offsets > get_bits_left(gb)) { -+ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->num_entry_point_offsets = num_entry_point_offsets; -+ if (sh->num_entry_point_offsets > 0) { -+ int offset_len = get_ue_golomb_long(gb) + 1; -+ -+ if (offset_len < 1 || offset_len > 32) { -+ sh->num_entry_point_offsets = 0; -+ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); -+ return ret; -+ } -+ -+ for (i = 0; i < sh->num_entry_point_offsets; i++) { -+ uint32_t val_minus1 = get_bits_long(gb, offset_len); -+ if (val_minus1 > (1 << 28)) -+ { -+ // We can declare offsets of > 2^28 bad without loss of generality -+ // Will check actual bounds wrt NAL later, but this keeps -+ // the values within bounds we can deal with easily -+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size -+ } -+ -+ // Do we want to offload this? -+ if (s->threads_type != 0) -+ { -+ sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) && -+ s->ps.pps->num_tile_columns > 1; -+ // * We only cope with WPP in a single column -+ // Probably want to deal with that case as tiles rather than WPP anyway -+ // ??
Not actually sure that the main code deals with WPP + multi-col correctly -+ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->ps.pps->num_tile_columns == 1; -+ } -+ } -+ } -+ -+ if (s->ps.pps->slice_header_extension_present_flag) { -+ unsigned int length = get_ue_golomb_long(gb); -+ if (length*8LL > get_bits_left(gb)) { -+ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < length; i++) -+ skip_bits(gb, 8); // slice_header_extension_data_byte -+ } -+ -+ // Inferred parameters -+ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta; -+ if (sh->slice_qp > 51 || -+ sh->slice_qp < -s->ps.sps->qp_bd_offset) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "The slice_qp %d is outside the valid range " -+ "[%d, 51].\n", -+ sh->slice_qp, -+ -s->ps.sps->qp_bd_offset); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Overread slice header by %d bits\n", -get_bits_left(gb)); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ s->slice_initialized = 1; -+ return 0; -+} -+ -+static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry) -+{ -+ RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width; -+ int c_idx, i; -+ -+ if (s->sh.slice_sample_adaptive_offset_flag[0] || -+ s->sh.slice_sample_adaptive_offset_flag[1]) { -+ if ((lc->ctb_avail & AVAIL_L) != 0) -+ { -+ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); -+ if (sao_merge_left_flag) { -+ *sao = sao[-1]; -+ return; -+ } -+ } -+ if ((lc->ctb_avail & AVAIL_U) != 0) -+ { -+ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); -+ if (sao_merge_up_flag) { -+ *sao = sao[-(int)s->ps.sps->ctb_width]; -+ return; -+ } -+ } -+ } -+ -+ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) { -+ const unsigned int log2_sao_offset_scale = c_idx == 0 ? 
s->ps.pps->log2_sao_offset_scale_luma : -+ s->ps.pps->log2_sao_offset_scale_chroma; -+ int offset_abs[4]; -+ char offset_sign[4] = {0}; -+ -+ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { -+ sao->type_idx[c_idx] = SAO_NOT_APPLIED; -+ continue; -+ } -+ -+ if (c_idx == 2) { -+ sao->type_idx[2] = sao->type_idx[1]; -+ sao->eo_class[2] = sao->eo_class[1]; -+ } else { -+ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc); -+ } -+ -+ // ** Could use BY22 here quite plausibly - this is all bypass stuff -+ // though only per CTB so not very timing critical -+ -+ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) -+ continue; -+ -+ for (i = 0; i < 4; i++) -+ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc); -+ -+ if (sao->type_idx[c_idx] == SAO_BAND) { -+ for (i = 0; i < 4; i++) { -+ if (offset_abs[i] != 0) -+ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc); -+ } -+ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc); -+ } else if (c_idx != 2) { -+ sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc); -+ } -+ -+ // Inferred parameters -+ sao->offset_val[c_idx][0] = 0; -+ for (i = 0; i < 4; i++) { -+ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; -+ if (sao->type_idx[c_idx] == SAO_EDGE) { -+ if (i > 1) -+ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; -+ } else if (offset_sign[i]) { -+ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; -+ } -+ } -+ } -+} -+ -+#if 0 -+static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) { -+ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4 -+ -+ if (log2_res_scale_abs_plus1 != 0) { -+ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx); -+ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * -+ (1 - 2 * res_scale_sign_flag); -+ } else { -+ lc->tu.res_scale_val = 0; -+ } -+ -+ -+ return 0; -+} -+#endif -+ -+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) -+{ -+ return jb->intra.cmds + jb->intra.n++; -+} -+ -+#define A0(x, y, U, L, UL, UR, DL) \ -+ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? 
AVAIL_DL : 0)) -+ -+#define A1(x, y, U, L, UL, UR, DL) \ -+ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) -+ -+#define A2(x, y, U, L, UL, UR, DL) \ -+ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) -+ -+#define A3(x, y, U, L, UL, UR, DL) \ -+ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) -+ -+#define A4(x, y, U, L, UL, UR, DL) \ -+ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) -+ -+static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; -+ -+unsigned int ff_hevc_rpi_tb_avail_flags( -+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) -+{ -+ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size; -+ const unsigned int tb_x = x & ~ctb_mask; -+ const unsigned int tb_y = y & ~ctb_mask; -+ const unsigned int ctb_avail = lc->ctb_avail; -+ -+ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; -+ -+ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); -+ -+ // This deals with both the U & L edges -+ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0) -+ f |= AVAIL_UL; -+ -+ if (x + w < lc->end_of_ctb_x) -+ f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR; -+ else if (tb_y == 0) -+ f |= (ctb_avail & AVAIL_UR); -+#if AVAIL_S_U - AVAIL_S_UR < 0 -+#error Shift problem -+#endif -+ -+ // Never any D if Y beyond eoctb -+ if (y + h < lc->end_of_ctb_y) -+ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL; -+#if AVAIL_S_DL - AVAIL_S_L < 0 -+#error Shift problem -+#endif -+ -+// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, -+// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], -+// lc->end_of_ctb_x, lc->end_of_ctb_y); -+ -+ return f; -+} -+ -+#undef A0 -+#undef A1 -+#undef A2 -+#undef A3 -+#undef A4 -+ -+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, -+ unsigned int avail) -+{ -+ // If rpi_enabled then sand - U & V done on U call -+ if (c_idx <= 1) -+ { -+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_INTRA + c_idx; -+ cmd->size = log2_trafo_size; -+ cmd->avail = avail; -+ cmd->i_pred.x = x0; -+ cmd->i_pred.y = y0; -+ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ -+// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); -+ } -+} -+ -+#define CBF_CB0_S 0 -+#define CBF_CB1_S 1 // CB1 must be CB0 + 1 -+#define CBF_CR0_S 2 -+#define CBF_CR1_S 3 -+ -+#define CBF_CB0 (1 << CBF_CB0_S) -+#define CBF_CR0 (1 << CBF_CR0_S) -+#define CBF_CB1 (1 << CBF_CB1_S) -+#define CBF_CR1 (1 << CBF_CR1_S) -+ -+// * Only good for chroma_idx == 1 -+static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size, const unsigned int log2_trafo_size, -+ const unsigned int blk_idx, const int cbf_luma, -+ const unsigned int const cbf_chroma) -+{ -+ const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1); -+ const unsigned int x0_c = x0 & ~7; -+ const unsigned int y0_c = y0 & ~7; -+ -+ enum ScanType scan_idx = SCAN_DIAG; -+ enum ScanType scan_idx_c = SCAN_DIAG; -+ -+ if (lc->cu.pred_mode == MODE_INTRA) -+ { -+ const unsigned int trafo_size = 1 << log2_trafo_size; -+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size); -+ -+ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail); -+ -+ if (log2_trafo_size > 2) -+ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail); -+ else if (blk_idx == 3) -+ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, -+ ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8)); -+ -+ if (log2_trafo_size < 4) { -+ if (lc->tu.intra_pred_mode >= 6 && -+ lc->tu.intra_pred_mode <= 14) { -+ scan_idx = SCAN_VERT; -+ } else if (lc->tu.intra_pred_mode >= 22 && -+ lc->tu.intra_pred_mode <= 30) { -+ scan_idx = SCAN_HORIZ; -+ } -+ -+ if (lc->tu.intra_pred_mode_c >= 6 && -+ lc->tu.intra_pred_mode_c <= 14) { -+ scan_idx_c = SCAN_VERT; -+ } else if (lc->tu.intra_pred_mode_c >= 22 && -+ lc->tu.intra_pred_mode_c <= 30) { -+ scan_idx_c = SCAN_HORIZ; -+ } -+ } -+ } -+ -+ if (!cbf_luma && cbf_chroma == 0) -+ return 0; -+ -+ if (lc->tu.is_cu_qp_delta_wanted) -+ { -+ const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc); -+ const unsigned int cb_mask = ~0U << log2_cb_size; -+ -+ if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) || -+ qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1))) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "The cu_qp_delta %d is outside the valid range " -+ "[%d, %d].\n", -+ qp_delta, -+ -(26 + (s->ps.sps->qp_bd_offset >> 1)), -+ (25 + (s->ps.sps->qp_bd_offset >> 1))); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ lc->tu.is_cu_qp_delta_wanted = 0; -+ lc->tu.cu_qp_delta = qp_delta; -+ ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask); -+ } -+ -+ // * Not main profile & untested due to no conform streams -+ if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma && -+ !lc->cu.cu_transquant_bypass_flag) { -+ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc); -+ if (cu_chroma_qp_offset_flag) { -+ int cu_chroma_qp_offset_idx = 0; -+ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { -+ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc); -+ } -+ lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; -+ lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; -+ } -+ lc->tu.cu_chroma_qp_offset_wanted = 0; -+ } -+ -+ if (cbf_luma) -+ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); -+ -+ if (log2_trafo_size > 2 || blk_idx == 3) -+ { -+ if ((cbf_chroma & CBF_CB0) != 0) -+ 
ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, -+ log2_trafo_size_c, scan_idx_c, 1); -+ if ((cbf_chroma & CBF_CR0) != 0) -+ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, -+ log2_trafo_size_c, scan_idx_c, 2); -+ } -+ -+ return 0; -+} -+ -+static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size) -+{ -+ set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3); -+} -+ -+ -+static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_trafo_size, -+ const unsigned int trafo_depth, const unsigned int blk_idx, -+ const unsigned int cbf_c0) -+{ -+ // When trafo_size == 2 hls_transform_unit uses c0 so put in c1 -+ unsigned int cbf_c1 = cbf_c0; -+ int split_transform_flag; -+ int ret; -+ -+ if (lc->cu.intra_split_flag) { -+ if (trafo_depth == 1) { -+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx]; -+ if (ctx_cfmt(s) == 3) { -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx]; -+ } else { -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; -+ } -+ } -+ } else { -+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0]; -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; -+ } -+ -+ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size && -+ log2_trafo_size > s->ps.sps->log2_min_tb_size && -+ trafo_depth < lc->cu.max_trafo_depth && -+ !(lc->cu.intra_split_flag && trafo_depth == 0)) -+ { -+ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size); -+ } else { -+ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && -+ lc->cu.pred_mode == MODE_INTER && -+ lc->cu.part_mode != PART_2Nx2N && -+ trafo_depth == 0; -+ -+ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size || -+ (lc->cu.intra_split_flag && trafo_depth == 0) || -+ inter_split; -+ } -+ -+ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) -+ { -+ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3); -+ cbf_c1 = 0; -+ -+ if ((cbf_c0 & CBF_CB0) != 0) -+ { -+ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S; -+ if (wants_c1) -+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S; -+ } -+ -+ if ((cbf_c0 & CBF_CR0) != 0) -+ { -+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S; -+ if (wants_c1) -+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S; -+ } -+ } -+ -+ if (split_transform_flag) { -+ const int trafo_size_split = 1 << (log2_trafo_size - 1); -+ const int x1 = x0 + trafo_size_split; -+ const int y1 = y0 + trafo_size_split; -+ -+#define SUBDIVIDE(x, y, idx) \ -+do { \ -+ ret = hls_transform_tree(s, lc, x, y, \ -+ log2_trafo_size - 1, trafo_depth + 1, idx, \ -+ cbf_c1); \ -+ if (ret < 0) \ -+ return ret; \ -+} while (0) -+ -+ SUBDIVIDE(x0, y0, 0); -+ SUBDIVIDE(x1, y0, 1); -+ SUBDIVIDE(x0, y1, 2); -+ SUBDIVIDE(x1, y1, 3); -+ -+#undef SUBDIVIDE -+ } else { -+ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have -+ // trafo_size == 2 with depth == 0 the issue is moot -+ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) || -+ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth)); -+ -+ ret = 
hls_transform_unit(s, lc, x0, y0, -+ log2_trafo_size + trafo_depth, log2_trafo_size, -+ blk_idx, cbf_luma, cbf_c1); -+ if (ret < 0) -+ return ret; -+ -+ if (!s->sh.disable_deblocking_filter_flag) { -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma); -+ } -+ } -+ return 0; -+} -+ -+ -+static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) -+{ -+ GetBitContext gb; -+ int ret; -+ -+ ret = init_get_bits(&gb, pcm, length); -+ if (ret < 0) -+ return ret; -+ -+ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), -+ frame_stride1(s->frame, 0), -+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); -+ -+ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)), -+ s->frame->linesize[1], -+ cb_size >> ctx_hshift(s, 1), -+ cb_size >> ctx_vshift(s, 1), -+ &gb, s->ps.sps->pcm.bit_depth_chroma); -+ -+ return 0; -+} -+ -+ -+// x * 2^(y*2) -+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) -+{ -+ return x << (y * 2); -+} -+ -+static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) -+{ -+ // Length in bits -+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + -+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) + -+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2)); -+ -+ const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3); -+ -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ -+ // Copy coeffs -+ { -+ const int blen = (length + 7) >> 3; -+ // Round allocated bytes up to nearest 32 to avoid alignment confusion -+ // Allocation is in int16_t s -+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per -+ // sample this rounding doesn't affect the total size we need to allocate for -+ // the coeff buffer -+ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); -+ memcpy(coeffs, pcm, blen); -+ -+ // Our coeff stash assumes that any partially allocated 64byte lump -+ // is zeroed so make that true. 
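The tail-zeroing block below uses the negate-and-mask idiom: for a power-of-two block size B, -(uintptr_t)p & (B - 1) gives the number of bytes from p up to the next B-byte boundary, and 0 when p is already aligned. A small self-contained check of the idiom, separate from the decoder (pad_to_64 is a name invented for this sketch; 64 matches the lump size the coeff stash assumes):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Bytes needed to pad address p up to the next 64-byte boundary. */
    static size_t pad_to_64(const uintptr_t p)
    {
        return (size_t)(-p & 63);
    }

    int main(void)
    {
        assert(pad_to_64(64) == 0);   /* already aligned */
        assert(pad_to_64(65) == 63);  /* just past a boundary */
        assert(pad_to_64(127) == 1);  /* one byte short of one */
        return 0;
    }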
-+ { -+ uint8_t * const eopcm = (uint8_t *)coeffs + blen; -+ if ((-(intptr_t)eopcm & 63) != 0) -+ memset(eopcm, 0, -(intptr_t)eopcm & 63); -+ } -+ -+ // Add command -+ { -+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_I_PCM; -+ cmd->size = log2_cb_size; -+ cmd->i_pcm.src = coeffs; -+ cmd->i_pcm.x = x0; -+ cmd->i_pcm.y = y0; -+ cmd->i_pcm.src_len = length; -+ } -+ return 0; -+ } -+} -+ -+ -+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, -+ const MvXY xy, const int y0, const int height) -+{ -+ if (s->threads_type != 0) { -+ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); -+ -+ // Progress has to be attached to current job as the actual wait -+ // is in worker_core which can't use lc -+ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no; -+ if (*pr < y) { -+ *pr = y; -+ } -+ } -+} -+ -+static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, const int nPbW, -+ const int nPbH, -+ HEVCRpiMvField * const mv) -+{ -+ enum InterPredIdc inter_pred_idc = PRED_L0; -+ int mvp_flag; -+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); -+ -+ mv->pred_flag = 0; -+ if (s->sh.slice_type == HEVC_SLICE_B) -+ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); -+ -+ if (inter_pred_idc != PRED_L1) { -+ MvXY mvd; -+ -+ if (s->sh.nb_refs[L0]) -+ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); -+ -+ mv->pred_flag = PF_L0; -+ mvd = ff_hevc_rpi_hls_mvd_coding(lc); -+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, -+ mv, mvp_flag, 0); -+ mv->xy[0] = mvxy_add(mv->xy[0], mvd); -+ } -+ -+ if (inter_pred_idc != PRED_L0) { -+ MvXY mvd = 0; -+ -+ if (s->sh.nb_refs[L1]) -+ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); -+ -+ if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) -+ mvd = ff_hevc_rpi_hls_mvd_coding(lc); -+ -+ mv->pred_flag += PF_L1; -+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, -+ mv, mvp_flag, 1); -+ mv->xy[1] = mvxy_add(mv->xy[1], mvd); -+ } -+} -+ -+ -+static HEVCRpiInterPredQ * -+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) -+{ -+ HEVCRpiInterPredQ * yp = NULL; -+ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr; -+ const unsigned int max_fill = ipe->max_fill; -+ unsigned int load = UINT_MAX; -+ -+ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) { -+ // We will always have enough room between the Qs but if we are -+ // running critically low due to poor scheduling then use fill size -+ // rather than load to determine QPU. This has obvious dire -+ // performance implications but (a) it is better than crashing -+ // and (b) it should (almost) never happen -+ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; -+ const unsigned int tload = tfill > max_fill ? 
tfill + 0x1000000 : ypt->load; -+ -+ if (tload < load) -+ { -+ yp = ypt; -+ load = tload; -+ } -+ } -+ -+ yp->load += load_val; -+ ipe->used_grp = 1; -+ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd -+ -+ return yp; -+} -+ -+ -+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) -+{ -+ for (unsigned int i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; -+ -+ q->qpu_mc_curr->data[-1] = q->code_sync; -+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); -+ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage -+ } -+} -+ -+// Returns 0 on success -+// We no longer check for Q fullness as we have emergency code in ctu alloc -+// * However it might be an idea to have some means of spotting that we've used it -+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) -+{ -+ if (!ipe->used_grp) -+ return 0; -+ -+ if ((ipe->curr += ipe->n_grp) >= ipe->n) -+ { -+ ipe->curr = 0; -+ rpi_inter_pred_sync(ipe); -+ } -+ ipe->used = 1; -+ ipe->used_grp = 0; -+ -+ return 0; -+} -+ -+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ -+ ipe->curr = 0; -+ ipe->used = 0; -+ ipe->used_grp = 0; -+ for (i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base; -+ q->load = 0; -+ q->last_l0 = NULL; -+ q->last_l1 = NULL; -+ } -+} -+ -+static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, -+ const unsigned int n_max, const unsigned int n_grp, -+ const unsigned int total_size, const unsigned int min_gap) -+{ -+ memset(ipe, 0, sizeof(*ipe)); -+ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); -+ ipe->n_grp = n_grp; -+ ipe->min_gap = min_gap; -+ -+ gpu_malloc_cached(total_size, &ipe->gptr); -+} -+ -+ -+#if RPI_QPU_EMU_Y -+#define get_mc_address_y(f) ((f)->data[0]) -+#else -+#define get_mc_address_y(f) get_vc_address_y(f) -+#endif -+#if RPI_QPU_EMU_C -+#define get_mc_address_u(f) ((f)->data[1]) -+#else -+#define get_mc_address_u(f) get_vc_address_u(f) -+#endif -+ -+static inline uint32_t pack_wo_p(const int off, const int mul) -+{ -+ return PACK2(off * 2 + 1, mul); -+} -+ -+static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) -+{ -+ return PACK2(off0 + off1 + 1, mul); -+} -+ -+ -+static void -+rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const MvXY mv_xy, -+ const int weight_mul, -+ const int weight_offset, -+ AVFrame *const src_frame) -+{ -+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const unsigned int mx = MV_X(mv_xy) & 3; -+ const unsigned int my = MV_Y(mv_xy) & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); -+ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; -+ const uint32_t wo = pack_wo_p(weight_offset, weight_mul); -+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); -+ -+ if (my_mx == 0) -+ { -+ const int x1 = x0 + (MV_X(mv_xy) >> 2); -+ const int y1 = y0 + (MV_Y(mv_xy) >> 2); -+ const int bh = nPbH; -+ -+ for (int start_x = 0; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp =
rpi_nxt_pred(ipe, bh, s->qpu.y_p00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; -+ -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ ++ts->y_pred1_x0y0; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src_vc_address_y; -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->wo1 = wo; -+ cmd_y->dst_addr = dst_addr + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+ else -+ { -+ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; -+ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3; -+ const unsigned int bh = nPbH; -+ int start_x = 0; -+ -+#if 1 -+ // As Y-pred operates on two independent 8-wide src blocks we can merge -+ // this pred with the previous one if the previous one is 8 pel wide, -+ // the same height as the current block, immediately to the left of our -+ // current dest block and mono-pred. -+ -+ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p; -+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) -+ { -+ const int bw = FFMIN(nPbW, 8); -+ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; -+ -+ last_y8_src2->x = x1_m3; -+ last_y8_src2->y = y1_m3; -+ last_y8_src2->base = src_vc_address_y; -+ last_y8_p->w += bw; -+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); -+ last_y8_p->wo2 = wo; -+ -+ jb->last_y8_p = NULL; -+ jb->last_y8_l1 = NULL; -+ start_x = bw; -+#if RPI_TSTATS -+ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge; -+#endif -+ } -+#endif -+ -+ for (; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ if (mx == 0 && my == 0) -+ ++ts->y_pred1_x0y0; -+ else if (mx == 0) -+ ++ts->y_pred1_x0; -+ else if (my == 0) -+ ++ts->y_pred1_y0; -+ else -+ ++ts->y_pred1_xy; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ src1->x = x1_m3 + start_x; -+ src1->y = y1_m3; -+ src1->base = src_vc_address_y; -+ if (bw <= 8) -+ { -+ src2->x = MC_DUMMY_X; -+ src2->y = MC_DUMMY_Y; -+#if RPI_QPU_EMU_Y -+ src2->base = s->qpu_dummy_frame_emu; -+#else -+ src2->base = s->qpu_dummy_frame_qpu; -+#endif -+ } -+ else -+ { -+ src2->x = x1_m3 + start_x + 8; -+ src2->y = y1_m3; -+ src2->base = src_vc_address_y; -+ } -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo; -+ cmd_y->wo2 = wo; -+ cmd_y->dst_addr = dst_addr + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ -+ if (bw == 8) { -+ jb->last_y8_l1 = src2; -+ jb->last_y8_p = cmd_y; -+ } -+ } -+ } -+} -+ -+static void -+rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const struct HEVCRpiMvField *const mv_field, -+ const AVFrame *const src_frame, -+ const AVFrame *const src_frame2) -+{ -+ const unsigned int y_off
= av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const MvXY const mv = mv_field->xy[0]; -+ const MvXY const mv2 = mv_field->xy[1]; -+ -+ const unsigned int mx = MV_X(mv) & 3; -+ const unsigned int my = MV_Y(mv) & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = MV_X(mv2) & 3; -+ const unsigned int my2 = MV_Y(mv2) & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]); -+ -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); -+ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; -+ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); -+ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; -+ -+ if (my2_mx2_my_mx == 0) -+ { -+ const int x1 = x0 + (MV_X(mv) >> 2); -+ const int y1 = y0 + (MV_Y(mv) >> 2); -+ const int x2 = x0 + (MV_X(mv2) >> 2); -+ const int y2 = y0 + (MV_Y(mv2) >> 2); -+ const int bh = nPbH; -+ -+ // Can do chunks a full 16 wide if we don't want the H filter -+ for (int start_x=0; start_x < nPbW; start_x += 16) -+ { -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ ++ts->y_pred2_x0y0; -+ -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 16); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = 0; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+ else -+ { -+ // Filter requires a run-up of 3 -+ const int x1 = x0 + (MV_X(mv) >> 2) - 3; -+ const int y1 = y0 + (MV_Y(mv) >> 2) - 3; -+ const int x2 = x0 + (MV_X(mv2) >> 2) - 3; -+ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3; -+ const int bh = nPbH; -+ -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ { // B blocks work 8 at a time -+ // B weights aren't doubled as the QPU code does the same -+ // amount of work as it does for P -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; -+ -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2; -+ src2->base = 
src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 8); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+} -+ -+// h/v shifts fixed at one as that is all the qasm copes with -+static void -+rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const unsigned int lx, const int x0_c, const int y0_c, -+ const int nPbW_c, const int nPbH_c, -+ const MvXY const mv, -+ const int16_t * const c_weights, -+ const int16_t * const c_offsets, -+ AVFrame * const src_frame) -+{ -+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = 1; // = s->ps.sps->hshift[1]; -+ const int vshift = 1; // = s->ps.sps->vshift[1]; -+ -+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; -+ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]); -+ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]); -+ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; -+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; -+ const unsigned int bh = nPbH_c; -+ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; -+ -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); -+ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; -+ qpu_mc_src_t ** const plast_lx = (lx == 0) ? 
&cp->last_l0 : &cp->last_l1; -+ qpu_mc_src_t * const last_lx = *plast_lx; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ last_lx->x = x1_c + start_x; -+ last_lx->y = y1_c; -+ last_lx->base = src_base_u; -+ cmd_c->h = bh; -+ cmd_c->w = bw; -+ cmd_c->coeffs_x = x_coeffs; -+ cmd_c->coeffs_y = y_coeffs; -+ cmd_c->wo_u = wo_u; -+ cmd_c->wo_v = wo_v; -+ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); -+ *plast_lx = &cmd_c->next_src; -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); -+ } -+ return; -+} -+ -+// h/v shifts fixed at one as that is all the qasm copes with -+static void -+rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const int x0_c, const int y0_c, -+ const int nPbW_c, const int nPbH_c, -+ const struct HEVCRpiMvField * const mv_field, -+ const int16_t * const c_weights, -+ const int16_t * const c_offsets, -+ const int16_t * const c_weights2, -+ const int16_t * const c_offsets2, -+ AVFrame * const src_frame, -+ AVFrame * const src_frame2) -+{ -+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = 1; // s->ps.sps->hshift[1]; -+ const int vshift = 1; // s->ps.sps->vshift[1]; -+ const MvXY const mv = mv_field->xy[0]; -+ const MvXY const mv2 = mv_field->xy[1]; -+ -+ const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift); -+ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; -+ -+ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector -+ -+ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; -+ -+ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); -+ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]); -+ -+ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; -+ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); -+ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; -+ const unsigned int bh = nPbH_c; -+ -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) -+ { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); -+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; -+ qpu_mc_src_t * const src_l0 = cp->last_l0; -+ qpu_mc_src_t * const src_l1 = cp->last_l1; -+ -+ src_l0->x = x1_c + start_x; -+ src_l0->y = y1_c; -+ src_l0->base = src1_base; -+ src_l1->x = x2_c + start_x; -+ src_l1->y = y2_c; -+ src_l1->base = src2_base; -+ -+ u[0].h = bh; -+ u[0].w = bw; -+ u[0].coeffs_x1 = coefs0_x; -+ u[0].coeffs_y1 = coefs0_y; -+ u[0].weight_u1 = c_weights[0]; // Weight L0 U -+ u[0].weight_v1 = c_weights[1]; // Weight L0 V -+ u[0].coeffs_x2 = coefs1_x; -+ u[0].coeffs_y2 = coefs1_y; -+ u[0].wo_u2 = wo_u2; -+ u[0].wo_v2 = wo_v2; -+ u[0].dst_addr_c = dst_base_u + 
(start_x << xshl); -+ -+ cp->last_l0 = &u[0].next_src1; -+ cp->last_l1 = &u[0].next_src2; -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); -+ } -+} -+ -+ -+static inline void -+col_stash(const HEVCRpiContext * const s, -+ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0, -+ const HEVCRpiMvField * const mvf) -+{ -+ ColMvField * const col_mvf = s->ref->col_mvf; -+ const unsigned int x = (x0 + 15) >> 4; -+ const unsigned int y = (y0 + 15) >> 4; -+ const unsigned int w = ((x0 + 15 + w0) >> 4) - x; -+ const unsigned int h = ((y0 + 15 + h0) >> 4) - y; -+ -+ if (col_mvf != NULL && w != 0 && h != 0) -+ { -+ // Only record MV from the top left of the 16x16 block -+ -+ const RefPicList * const rpl = s->refPicList; -+ const ColMvField cmv = { -+ .L = { -+ { -+ .poc = (mvf->pred_flag & PF_L0) == 0 ? -+ COL_POC_INTRA : -+ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]), -+ .xy = mvf->xy[0] -+ }, -+ { -+ .poc = (mvf->pred_flag & PF_L1) == 0 ? -+ COL_POC_INTRA : -+ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]), -+ .xy = mvf->xy[1] -+ } -+ } -+ }; -+ -+ ColMvField * p = col_mvf + y * s->col_mvf_stride + x; -+ const unsigned int stride = s->col_mvf_stride - w; -+ unsigned int j = h; -+ -+ do -+ { -+ unsigned int k = w; -+ do -+ { -+ *p++ = cmv; -+ } while (--k != 0); -+ p += stride; -+ } while (--j != 0); -+ } -+} -+ -+static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) -+{ -+ HEVCRpiJob * const jb = lc->jb0; -+ -+ struct HEVCRpiMvField current_mv = {{0}}; -+ const RefPicList *const refPicList = s->refPicList; -+ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL; -+ -+ if (lc->cu.pred_mode != MODE_SKIP) -+ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc); -+ -+ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) { -+ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 
0 : -+ ff_hevc_rpi_merge_idx_decode(s, lc); -+ -+ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, -+ partIdx, merge_idx, &current_mv); -+ } else { -+ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv); -+ } -+ -+ { -+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); -+ unsigned int i, j; -+ -+ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++) -+ { -+ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++) -+ p[i] = current_mv; -+ p += MVF_STASH_WIDTH_PU; -+ } -+ } -+ -+ col_stash(s, x0, y0, nPbW, nPbH, &current_mv); -+ -+ if (current_mv.pred_flag & PF_L0) { -+ ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; -+ if (!ref0) -+ return; -+ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); -+ } -+ if (current_mv.pred_flag & PF_L1) { -+ ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; -+ if (!ref1) -+ return; -+ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH); -+ } -+ -+ if (current_mv.pred_flag == PF_L0) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0], -+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], -+ ref0->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0], -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], -+ ref0->frame); -+ return; -+ } -+ } else if (current_mv.pred_flag == PF_L1) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1], -+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], -+ ref1->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1], -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], -+ ref1->frame); -+ return; -+ } -+ } else if (current_mv.pred_flag == PF_BI) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, -+ &current_mv, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], -+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], -+ ref0->frame, -+ ref1->frame); -+ return; -+ } -+ } -+} -+ -+static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size, -+ const unsigned int ipm) -+{ -+ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE; -+ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE; -+ -+ { -+ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); -+ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm); -+ } -+ -+ // If IRAP then everything is Intra & we avoid ever looking at these -+ // stashes so don't bother setting them -+ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA) -+ { -+ if (s->is_intra
!= NULL) -+ { -+ set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE); -+ } -+ -+ { -+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); -+ const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1 -+ unsigned int n = size_in_pus; -+ -+ do -+ { -+ memset(p, 0, size_in_pus * sizeof(*p)); -+ p += MVF_STASH_WIDTH_PU; -+ } while (--n != 0); -+ } -+ -+ -+ if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0) -+ { -+ // Only record top left stuff -+ // Blocks should always be aligned on size boundaries -+ // so cannot have overflow from a small block -+ -+ ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4); -+ const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4)); -+ const unsigned int stride = s->col_mvf_stride - size_in_col; -+ unsigned int j = size_in_col; -+ -+ do -+ { -+ unsigned int k = size_in_col; -+ do -+ { -+ p->L[0].poc = COL_POC_INTRA; -+ p->L[0].xy = 0; -+ p->L[1].poc = COL_POC_INTRA; -+ p->L[1].xy = 0; -+ ++p; -+ } while (--k != 0); -+ p += stride; -+ } while (--j != 0); -+ } -+ } -+} -+ -+static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size) -+{ -+ set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC); -+} -+ -+ -+/** -+ * 8.4.1 -+ */ -+static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ int x0, int y0, int log2_pu_size, -+ int prev_intra_luma_pred_flag, -+ const unsigned int idx) -+{ -+ const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; -+ const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; -+ -+ // Up does not cross boundaries so, as we always scan 1 slice-tile-line in an -+ // lc, we can just keep 1 CTB of lR stashes -+ // Left is reset to DC @ Start of Line/Tile/Slice in fill_job -+ const unsigned int cand_up = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu]; -+ const unsigned int cand_left = lc->ipm_left[yb_pu]; -+ -+ unsigned int intra_pred_mode; -+ unsigned int a, b, c; -+ -+ if (cand_left == cand_up) { -+ if (cand_left < 2) { -+ a = INTRA_PLANAR; -+ b = INTRA_DC; -+ c = INTRA_ANGULAR_26; -+ } else { -+ a = cand_left; -+ b = 2 + ((cand_left - 2 - 1 + 32) & 31); -+ c = 2 + ((cand_left - 2 + 1) & 31); -+ } -+ } else { -+ a = cand_left; -+ b = cand_up; -+ c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ? -+ INTRA_PLANAR : -+ (cand_left != INTRA_DC && cand_up != INTRA_DC) ? -+ INTRA_DC : -+ INTRA_ANGULAR_26; -+ } -+ -+ if (prev_intra_luma_pred_flag) { -+ intra_pred_mode = idx == 0 ? a : idx == 1 ?
b : c; -+ } else { -+ // Sort lowest 1st -+ if (a > b) -+ FFSWAP(int, a, b); -+ if (a > c) -+ FFSWAP(int, a, c); -+ if (b > c) -+ FFSWAP(int, b, c); -+ -+ intra_pred_mode = idx; -+ if (intra_pred_mode >= a) -+ intra_pred_mode++; -+ if (intra_pred_mode >= b) -+ intra_pred_mode++; -+ if (intra_pred_mode >= c) -+ intra_pred_mode++; -+ } -+ -+ /* write the intra prediction units into the mv array */ -+ set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode); -+ return intra_pred_mode; -+} -+ -+static const uint8_t tab_mode_idx[] = { -+ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, -+ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; -+ -+static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size) -+{ -+ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; -+ uint8_t prev_intra_luma_pred_flag[4]; -+ int split = lc->cu.part_mode == PART_NxN; -+ const unsigned int split_size = (1 << (log2_cb_size - 1)); -+ int chroma_mode; -+ const unsigned int n = split ? 4 : 1; -+ unsigned int i; -+ -+ for (i = 0; i != n; i++) -+ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc); -+ -+ for (i = 0; i < n; i++) { -+ // depending on mode idx is mpm or luma_pred_mode -+ const unsigned int idx = prev_intra_luma_pred_flag[i] ? -+ ff_hevc_rpi_mpm_idx_decode(lc) : -+ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc); -+ -+ lc->pu.intra_pred_mode[i] = -+ luma_intra_pred_mode(s, lc, -+ x0 + ((i & 1) == 0 ? 0 : split_size), -+ y0 + ((i & 2) == 0 ? 0 : split_size), -+ log2_cb_size - split, -+ prev_intra_luma_pred_flag[i], idx); -+ } -+ -+ if (ctx_cfmt(s) == 3) { -+ for (i = 0; i < n; i++) { -+ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode]) -+ lc->pu.intra_pred_mode_c[i] = 34; -+ else -+ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode]; -+ } else { -+ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i]; -+ } -+ } -+ } else if (ctx_cfmt(s) == 2) { -+ int mode_idx; -+ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) -+ mode_idx = 34; -+ else -+ mode_idx = intra_chroma_table[chroma_mode]; -+ } else { -+ mode_idx = lc->pu.intra_pred_mode[0]; -+ } -+ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; -+ } else if (ctx_cfmt(s) != 0) { -+ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) -+ lc->pu.intra_pred_mode_c[0] = 34; -+ else -+ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode]; -+ } else { -+ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0]; -+ } -+ } -+} -+ -+static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size) -+{ -+ const unsigned int cb_size = 1 << log2_cb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int min_cb_width = s->ps.sps->min_cb_width; -+ const unsigned int x_cb = x0 >> log2_min_cb_size; -+ const unsigned int y_cb = y0 >> log2_min_cb_size; -+ const unsigned int idx = log2_cb_size - 2; -+ const unsigned int qp_block_mask = (1 << 
s->ps.pps->log2_min_cu_qp_delta_size) - 1; -+ int skip_flag = 0; -+ -+ lc->cu.x = x0; -+ lc->cu.y = y0; -+ lc->cu.x_split = x0; -+ lc->cu.y_split = y0; -+ -+ lc->cu.pred_mode = MODE_INTRA; -+ lc->cu.part_mode = PART_2Nx2N; -+ lc->cu.intra_split_flag = 0; -+ lc->cu.cu_transquant_bypass_flag = 0; -+ lc->pu.intra_pred_mode[0] = 1; -+ lc->pu.intra_pred_mode[1] = 1; -+ lc->pu.intra_pred_mode[2] = 1; -+ lc->pu.intra_pred_mode[3] = 1; -+ -+ if (s->ps.pps->transquant_bypass_enable_flag) { -+ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc); -+ if (lc->cu.cu_transquant_bypass_flag) -+ set_deblocking_bypass(s, x0, y0, log2_cb_size); -+ } -+ -+ if (s->sh.slice_type != HEVC_SLICE_I) { -+ lc->cu.pred_mode = MODE_INTER; -+ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); -+ } -+ -+ if (skip_flag) { -+ lc->cu.pred_mode = MODE_SKIP; -+ -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ } else { -+ int pcm_flag = 0; -+ -+ if (s->sh.slice_type != HEVC_SLICE_I) -+ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc); -+ if (lc->cu.pred_mode != MODE_INTRA || -+ log2_cb_size == s->ps.sps->log2_min_cb_size) { -+ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size); -+ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && -+ lc->cu.pred_mode == MODE_INTRA; -+ } -+ -+ if (lc->cu.pred_mode == MODE_INTRA) { -+ if (lc->cu.part_mode == PART_2Nx2N && -+ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled -+ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && -+ ff_hevc_rpi_pcm_flag_decode(lc) != 0) -+ { -+ int ret; -+ pcm_flag = 1; -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0) -+ return ret; -+ -+ if (s->ps.sps->pcm.loop_filter_disable_flag) -+ set_deblocking_bypass(s, x0, y0, log2_cb_size); -+ } else { -+ intra_prediction_unit(s, lc, x0, y0, log2_cb_size); -+ } -+ } else { -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ switch (lc->cu.part_mode) { -+ case PART_2Nx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -+ break; -+ case PART_2NxN: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); -+ break; -+ case PART_Nx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); -+ lc->cu.x_split = x0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); -+ break; -+ case PART_2NxnU: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 4; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx); -+ break; -+ case PART_2NxnD: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 4 * 3; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx); -+ break; -+ case PART_nLx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); -+ lc->cu.x_split = x0 + cb_size / 
4; -+ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); -+ break; -+ case PART_nRx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2); -+ lc->cu.x_split = x0 + cb_size / 4 * 3; -+ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); -+ break; -+ case PART_NxN: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); -+ lc->cu.x_split = x0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); -+ lc->cu.y_split = y0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1); -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); -+ break; -+ } -+ } -+ -+ if (!pcm_flag) { -+ int rqt_root_cbf = 1; -+ -+ if (lc->cu.pred_mode != MODE_INTRA && -+ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { -+ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc); -+ } -+ if (rqt_root_cbf) { -+ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0); -+ int ret; -+ -+ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ? -+ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag : -+ s->ps.sps->max_transform_hierarchy_depth_inter; -+ // transform_tree does deblock_boundary_strengths -+ ret = hls_transform_tree(s, lc, x0, y0, -+ log2_cb_size, 0, 0, cbf_c); -+ if (ret < 0) -+ return ret; -+ } else { -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ } -+ } -+ } -+ -+ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here -+ if (lc->tu.is_cu_qp_delta_wanted) -+ ff_hevc_rpi_set_qPy(s, lc, x0, y0); -+ -+ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 && -+ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) -+ { -+ lc->qPy_pred = lc->qp_y; -+ } -+ -+ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff); -+ -+ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); -+ -+ return 0; -+} -+ -+// Returns: -+// < 0 Error -+// 0 More data wanted -+// 1 EoSlice / EoPicture -+static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int log2_cb_size, const unsigned int cb_depth) -+{ -+ const int cb_size = 1 << log2_cb_size; -+ int ret; -+ int split_cu; -+ -+ lc->ct_depth = cb_depth; -+ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); -+ if (x0 + cb_size <= s->ps.sps->width && -+ y0 + cb_size <= s->ps.sps->height && -+ split_cu) -+ { -+ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0); -+ } -+ -+ // Qp delta (and offset) need to remain wanted if cb_size < min until -+ // a coded block is found so we still set the initial state at depth 0 (outside -+ // this fn) and only reset here -+ if (s->ps.pps->cu_qp_delta_enabled_flag && -+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) -+ { -+ lc->tu.is_cu_qp_delta_wanted = 1; -+ lc->tu.cu_qp_delta = 0; -+ } -+ if (s->sh.cu_chroma_qp_offset_enabled_flag && -+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) -+ { -+ lc->tu.cu_chroma_qp_offset_wanted = 1; -+ } -+ -+ lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0]; -+ lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset; -+ lc->tu.qp_divmod6[2] =
s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset; -+ -+ if (split_cu) { -+ int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; -+ const int cb_size_split = cb_size >> 1; -+ const int x1 = x0 + cb_size_split; -+ const int y1 = y0 + cb_size_split; -+ -+ int more_data = 0; -+ -+ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ -+ if (more_data && x1 < s->ps.sps->width) { -+ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ if (more_data && y1 < s->ps.sps->height) { -+ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ if (more_data && x1 < s->ps.sps->width && -+ y1 < s->ps.sps->height) { -+ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ -+ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 && -+ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) -+ lc->qPy_pred = lc->qp_y; -+ -+ if (more_data) -+ return ((x1 + cb_size_split) < s->ps.sps->width || -+ (y1 + cb_size_split) < s->ps.sps->height); -+ else -+ return 0; -+ } else { -+ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size); -+ if (ret < 0) -+ return ret; -+ if ((!((x0 + cb_size) % -+ (1 << (s->ps.sps->log2_ctb_size))) || -+ (x0 + cb_size >= s->ps.sps->width)) && -+ (!((y0 + cb_size) % -+ (1 << (s->ps.sps->log2_ctb_size))) || -+ (y0 + cb_size >= s->ps.sps->height))) { -+ int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc); -+ return !end_of_slice_flag; -+ } else { -+ return 1; -+ } -+ } -+ -+ return 0; // NEVER -+} -+ -+static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x_ctb, const int y_ctb, const int ctb_addr_ts) -+{ -+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice -+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; -+ const unsigned int line_w = s->ps.sps->ctb_width; -+ -+ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; -+ -+ lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width); -+ lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); -+ -+ lc->boundary_flags = 0; -+ -+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0) -+ lc->boundary_flags |= BOUNDARY_LEFT_TILE; -+ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) -+ lc->boundary_flags |= BOUNDARY_LEFT_SLICE; -+ if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0) -+ lc->boundary_flags |= BOUNDARY_UPPER_TILE; -+ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) -+ lc->boundary_flags |= BOUNDARY_UPPER_SLICE; -+ -+ // Use line width rather than tile width for addr_in_slice test as -+ // addr_in_slice is in raster units -+ -+ lc->ctb_avail = -+ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) | -+ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) | -+ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && -+ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) | -+ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && -+ (ctb_addr_rs_in_slice + 1 >= line_w) ?
AVAIL_UR : 0); -+ // Down-left never avail at CTB level -+} -+ -+ -+static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds, -+ (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0); -+ -+ // Signal -+ if (y > 0) { -+ // Cast away const as progress is held in s, but this really shouldn't confuse anything -+ ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1); -+ } -+ -+ // Job done now -+ // ? Move outside this fn -+ job_free(s->jbc, jb); -+} -+ -+// I-pred, transform_and_add for all block types done here -+// All ARM -+static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ unsigned int i; -+ HEVCRpiIntraPredEnv * const iap = &jb->intra; -+ const HEVCPredCmd *cmd = iap->cmds; -+ -+#if !RPI_WORKER_WAIT_PASS_0 -+ rpi_sem_wait(&jb->sem); -+ rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1 -+#endif -+ -+ for (i = iap->n; i > 0; i--, cmd++) -+ { -+ switch (cmd->type) -+ { -+ case RPI_PRED_INTRA: -+ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail); -+ break; -+ case RPI_PRED_INTRA_C: -+ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail); -+ break; -+ case RPI_PRED_ADD_RESIDUAL: -+ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+ break; -+ case RPI_PRED_ADD_DC: -+ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_C: -+ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+ break; -+ case RPI_PRED_ADD_DC_U: -+ case RPI_PRED_ADD_DC_V: -+ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); -+ break; -+ -+ case RPI_PRED_I_PCM: -+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); -+ break; -+ -+ default: -+ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); -+ abort(); -+ } -+ } -+ -+ // Mark done -+ iap->n = 0; -+} -+ -+ -+// Set initial uniform job values & zero ctu_count -+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first) -+{ -+ unsigned int i; -+ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; -+ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; -+ const HEVCRpiSPS * const sps = s->ps.sps; -+ -+ const uint16_t pic_width_y = sps->width; -+ const uint16_t pic_height_y = sps->height; -+ -+ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1); -+ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1); -+ -+ // We expect the pointer to change if we use another sps -+ if (sps != jb->sps) -+ { -+ worker_pic_free_one(jb); -+ -+ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma); -+ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma); -+ -+ { -+ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH; -+ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1)); -+ worker_pic_alloc_one(jb, coefs_per_luma +
coefs_per_chroma); -+ } -+ -+ jb->sps = sps; -+ } -+ -+ jb->waited = 0; -+ jb->ctu_ts_first = ctu_ts_first; -+ jb->ctu_ts_last = -1; -+ -+ rpi_inter_pred_reset(cipe); -+ for (i = 0; i < cipe->n; i++) { -+ HEVCRpiInterPredQ * const cp = cipe->q + i; -+ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; -+ -+ u->next_src1.x = 0; -+ u->next_src1.y = 0; -+ u->next_src1.base = 0; -+ u->pic_cw = pic_width_c; -+ u->pic_ch = pic_height_c; -+ u->stride2 = av_rpi_sand_frame_stride2(s->frame); -+ u->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ cp->last_l0 = &u->next_src1; -+ -+ u->next_fn = 0; -+ u->next_src2.x = 0; -+ u->next_src2.y = 0; -+ u->next_src2.base = 0; -+ cp->last_l1 = &u->next_src2; -+ -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); -+ } -+ -+ rpi_inter_pred_reset(yipe); -+ for (i = 0; i < yipe->n; i++) { -+ HEVCRpiInterPredQ * const yp = yipe->q + i; -+ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; -+ -+ y->next_src1.x = 0; -+ y->next_src1.y = 0; -+ y->next_src1.base = 0; -+ y->next_src2.x = 0; -+ y->next_src2.y = 0; -+ y->next_src2.base = 0; -+ y->pic_h = pic_height_y; -+ y->pic_w = pic_width_y; -+ y->stride2 = av_rpi_sand_frame_stride2(s->frame); -+ y->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ y->next_fn = 0; -+ yp->last_l0 = &y->next_src1; -+ yp->last_l1 = &y->next_src2; -+ -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); -+ } -+ -+ jb->last_y8_p = NULL; -+ jb->last_y8_l1 = NULL; -+ -+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { -+ jb->progress_req[i] = -1; -+ } -+ -+ worker_pic_reset(&jb->coeffs); -+} -+ -+ -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s, -+ const vpu_qpu_job_h vqj, -+ rpi_cache_flush_env_t * const rfe, -+ HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; -+ unsigned int max_block = 0; -+ -+ if (!ipe->used) { -+ return 0; -+ } -+ -+ if (ipe->curr != 0) { -+ rpi_inter_pred_sync(ipe); -+ } -+ -+ // Add final commands to Q -+ for(i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const yp = ipe->q + i; -+ qpu_mc_src_t *const p0 = yp->last_l0; -+ qpu_mc_src_t *const p1 = yp->last_l1; -+ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; -+ -+ if (block_size > max_block) -+ max_block = block_size; -+ -+ yp->qpu_mc_curr->data[-1] = yp->code_exit; -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->x = MC_DUMMY_X; -+ p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame_qpu; -+ p1->x = MC_DUMMY_X; -+ p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame_qpu; -+ -+ yp->last_l0 = NULL; -+ yp->last_l1 = NULL; -+ -+ // Add to mailbox list -+ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); -+ mail[i][1] = yp->code_setup; -+ } -+ -+ // We don't need invalidate here as the uniforms aren't changed by the QPU -+ // and leaving them in ARM cache avoids (pointless) pre-reads when writing -+ // new values which seems to give us a small performance advantage -+ // -+ // In most cases we will not have a completely packed set of uniforms and as -+ // we have a 2d invalidate we writeback all uniform Qs to the depth of the -+ // fullest -+ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, -+ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, -+ ipe->n, ipe->max_fill + ipe->min_gap); -+ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); -+ -+ return 1; -+} -+#endif -+ -+#if RPI_QPU_EMU_Y || 
RPI_QPU_EMU_C -+static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s, -+ const vpu_qpu_job_h vqj, -+ rpi_cache_flush_env_t * const rfe, -+ HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ if (!ipe->used) { -+ return 0; -+ } -+ -+ if (ipe->curr != 0) { -+ rpi_inter_pred_sync(ipe); -+ } -+ -+ // Add final commands to Q -+ for(i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const yp = ipe->q + i; -+ qpu_mc_src_t *const p0 = yp->last_l0; -+ qpu_mc_src_t *const p1 = yp->last_l1; -+ -+ yp->qpu_mc_curr->data[-1] = yp->code_exit; -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->x = MC_DUMMY_X; -+ p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame_emu; -+ p1->x = MC_DUMMY_X; -+ p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame_emu; -+ -+ yp->last_l0 = NULL; -+ yp->last_l1 = NULL; -+ } -+ -+ return 1; -+} -+#endif -+ -+ -+#if RPI_QPU_EMU_Y -+#define mc_terminate_add_y mc_terminate_add_emu -+#else -+#define mc_terminate_add_y mc_terminate_add_qpu -+#endif -+#if RPI_QPU_EMU_C -+#define mc_terminate_add_c mc_terminate_add_emu -+#else -+#define mc_terminate_add_c mc_terminate_add_qpu -+#endif -+ -+ -+static void flush_frame(HEVCRpiContext *s,AVFrame *frame) -+{ -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ rpi_cache_flush_finish(rfe); -+} -+ -+static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; -+ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; -+ const unsigned int ctb_width = s->ps.sps->ctb_width; -+ RpiBlk *const bounds = &jb->bounds; -+ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); -+ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; -+ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; -+ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; -+ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; -+} -+ -+#if RPI_PASSES == 2 -+static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s, jb); -+ -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s, jb); -+} -+#endif -+ -+// Core execution tasks -+static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ int pred_y, pred_c; -+ vpu_qpu_job_env_t qvbuf; -+ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); -+#if RPI_WORKER_WAIT_PASS_0 -+ int do_wait; -+#endif -+ -+ { -+ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ if (cf->s[3].n + cf->s[2].n != 0) -+ { -+ const unsigned int csize = sizeof(cf->s[3].buf[0]); -+ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; -+ unsigned int n16 = (cf->s[2].n >> 8); -+ unsigned int n32 = (cf->s[3].n >> 10); -+#if RPI_COMPRESS_COEFFS -+ if (cf->s[2].packed) { -+ n16 = n16 | (n16<<16); -+ } else { -+ const unsigned int npack16 = (cf->s[2].packed_n>>8); -+ n16 = n16 | (npack16<<16); -+ } -+ if (cf->s[3].packed) { -+ n32 = n32 | (n32<<16); -+ } else { -+ const unsigned int npack32 = (cf->s[3].packed_n>>10); -+ n32 = n32 | (npack32<<16); -+ } -+#endif -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(s->ps.sps->bit_depth), -+ vpu_get_constants(), -+ cf->gptr.vc, -+ n16, -+ cf->gptr.vc + offset32, -+ n32, -+ 0); -+ -+ 
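For reference, the n16/n32 values handed to vpu_qpu_job_add_vpu above are block counts rather than coefficient counts: a 16x16 transform carries 256 (1 << 8) coefficients and a 32x32 carries 1024 (1 << 10), hence the >> 8 and >> 10 shifts, and with RPI_COMPRESS_COEFFS the packed-block count rides in the top 16 bits of the same word. A minimal sketch of that packing, with hypothetical names:

    #include <stdint.h>

    // Hypothetical helper mirroring the packing above: the low 16 bits carry
    // the raw block count, the high 16 bits the packed-block count (which
    // equals the raw count when the whole set is still packed).
    static inline uint32_t vpu_pack_block_counts(unsigned int n_coeffs,
                                                 unsigned int log2_block_coeffs, // 8 for 16x16, 10 for 32x32
                                                 int is_packed,
                                                 unsigned int packed_coeffs)
    {
        const uint32_t n  = n_coeffs >> log2_block_coeffs;
        const uint32_t np = is_packed ? n : (packed_coeffs >> log2_block_coeffs);
        return n | (np << 16);
    }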
rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); -+ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); -+ } -+ } -+ -+ pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip); -+ -+// We could take a sync here and try to locally overlap QPU processing with ARM -+// but testing showed a slightly negative benefit with noticeable extra complexity -+ -+ pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip); -+ -+ // Returns 0 if nothing to do, 1 if sync added -+#if RPI_WORKER_WAIT_PASS_0 -+ do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem); -+#else -+ if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0) -+ sem_post(&jb->sem); -+#endif -+ -+ rpi_cache_flush_execute(jb->rfe); -+ -+ // Await progress as required -+ // jb->waited will only be clear if we have already tested the progress values -+ // (in worker_submit_job) and found we don't have to wait -+ if (jb->waited) -+ { -+ unsigned int i; -+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { -+ if (jb->progress_req[i] >= 0) { -+ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]); -+ } -+ } -+ } -+ -+ vpu_qpu_job_finish(vqj); -+ -+ // We always work on a rectangular block -+ if (pred_y || pred_c) -+ { -+ rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, -+ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h, -+ ctx_vshift(s, 1), pred_y, pred_c); -+ } -+ -+ // If we have emulated VPU ops - do it here -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ if (av_rpi_is_sand8_frame(s->frame)) -+ { -+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C -+ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); -+#elif RPI_QPU_EMU_Y -+ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL); -+#else -+ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip); -+#endif -+ } -+ else -+ { -+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C -+ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); -+#elif RPI_QPU_EMU_Y -+ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL); -+#else -+ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip); -+#endif -+ } -+#endif -+ -+#if RPI_WORKER_WAIT_PASS_0 -+ if (do_wait) -+ rpi_sem_wait(&jb->sem); -+ rpi_cache_flush_execute(jb->rfe); -+#endif -+} -+ -+ -+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) -+{ -+ av_freep(&ipe->q); -+ gpu_free(&ipe->gptr); -+} -+ -+static HEVCRpiJob * job_new(void) -+{ -+ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob)); -+ -+ sem_init(&jb->sem, 0, 0); -+ jb->rfe = rpi_cache_flush_init(&jb->flush_buf); -+ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); -+ -+ jb->intra.n = 0; -+ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); -+ -+ // * Sizeof the union structure might be overkill but at the moment it -+ // is correct (it certainly isn't going to be too small) -+ // Set max fill to slack/2 from the end of the Q -+ // If we exceed this in any Q then we will schedule by size (which should -+ // mean that we never use that Q again apart from syncs) -+ // * Given how aggressive the overflow response is we could maybe put the -+ // threshold even nearer the end, but I don't expect us to ever hit -+ // it on any real stream anyway. -+ // A toy model of this threshold is sketched below.
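A toy model of the fill threshold described in the comment above, assuming a queue of total_bytes with slack_bytes reserved at its end (illustrative only, not the decoder's API):

    // Illustrative only: a queue is treated as full once its write offset
    // passes (total_bytes - slack_bytes / 2); the remaining half of the
    // slack absorbs the commands already being assembled plus the exit code.
    static inline int q_past_max_fill(unsigned int bytes_used,
                                      unsigned int total_bytes,
                                      unsigned int slack_bytes)
    {
        return bytes_used > total_bytes - slack_bytes / 2;
    }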
-+ -+ rpi_inter_pred_alloc(&jb->chroma_ip, -+ QPU_N_MAX, QPU_N_GRP, -+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t), -+ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2); -+ rpi_inter_pred_alloc(&jb->luma_ip, -+ QPU_N_MAX, QPU_N_GRP, -+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t), -+ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2); -+ -+ return jb; -+} -+ -+static void job_delete(HEVCRpiJob * const jb) -+{ -+ worker_pic_free_one(jb); -+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); -+ av_freep(&jb->intra.cmds); -+ rpi_free_inter_pred(&jb->chroma_ip); -+ rpi_free_inter_pred(&jb->luma_ip); -+ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing -+ sem_destroy(&jb->sem); -+ av_free(jb); -+} -+ -+static void jbg_delete(HEVCRpiJobGlobal * const jbg) -+{ -+ HEVCRpiJob * jb; -+ -+ if (jbg == NULL) -+ return; -+ -+ jb = jbg->free1; -+ while (jb != NULL) -+ { -+ HEVCRpiJob * const jb2 = jb; -+ jb = jb2->next; -+ job_delete(jb2); -+ } -+ -+ pthread_mutex_destroy(&jbg->lock); -+ av_free(jbg); -+} -+ -+static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) -+{ -+ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); -+ if (jbg == NULL) -+ return NULL; -+ -+ pthread_mutex_init(&jbg->lock, NULL); -+ -+ while (job_count-- != 0) -+ { -+ HEVCRpiJob * const jb = job_new(); -+ if (jb == NULL) -+ goto fail; -+ -+ jb->next = jbg->free1; -+ jbg->free1 = jb; -+ } -+ -+ return jbg; -+ -+fail: -+ jbg_delete(jbg); -+ return NULL; -+} -+ -+static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) -+{ -+ HEVCRpiJobGlobal * jbg; -+ -+ if (jbc == NULL) -+ return; -+ -+ jbg = jbc->jbg; -+ -+ if (jbc->jb1 != NULL) -+ job_delete(jbc->jb1); -+ -+ pthread_mutex_destroy(&jbc->in_lock); -+ sem_destroy(&jbc->sem_out); -+ av_free(jbc); -+ -+ // Deref the global job context -+ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) -+ jbg_delete(jbg); -+} -+ -+static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg) -+{ -+ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl)); -+ -+ if (jbc == NULL) -+ return NULL; -+ -+ jbc->jbg = jbg; -+ atomic_fetch_add(&jbg->ref_count, 1); -+ -+ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS); -+ pthread_mutex_init(&jbc->in_lock, NULL); -+ -+ if ((jbc->jb1 = job_new()) == NULL) -+ goto fail; -+ jbc->jb1->jbc_local = jbc; -+ -+ return jbc; -+ -+fail: -+ rpi_job_ctl_delete(jbc); -+ return NULL; -+} -+ -+ -+ -+static av_cold void hevc_init_worker(HEVCRpiContext * const s) -+{ -+#if RPI_PASSES == 2 -+ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1); -+#elif RPI_PASSES == 3 -+ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2); -+ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1); -+#else -+#error Passes confused -+#endif -+ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0); -+ -+ pass_queues_start_all(s); -+} -+ -+static av_cold void hevc_exit_worker(HEVCRpiContext *s) -+{ -+ pass_queues_term_all(s); -+ -+ pass_queues_kill_all(s); -+ -+ rpi_job_ctl_delete(s->jbc); -+ s->jbc = NULL; -+} -+ -+ -+static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc) -+{ -+ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; -+ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns; -+ const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts]; -+ -+ // Check for obvious disasters -+ if (ctb_addr_ts == 0 && 
s->sh.dependent_slice_segment_flag) { -+ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // If dependent then ctb_addr_ts != 0 from previous check -+ if (s->sh.dependent_slice_segment_flag) { -+ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]; -+ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) { -+ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ if (!s->ps.pps->entropy_coding_sync_enabled_flag && -+ tile_id + s->sh.num_entry_point_offsets >= tiles) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // Tiled stuff must start at start of tile if it has multiple entry points -+ if (!s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->sh.num_entry_point_offsets != 0 && -+ ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id]) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ ff_hevc_rpi_cabac_init_decoder(lc); -+ -+ // Setup any required decode vars -+ lc->cabac_init_req = !s->sh.dependent_slice_segment_flag; -+ -+// printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot); -+ lc->qp_y = s->sh.slice_qp; -+ -+ // General setup -+ lc->bt_line_no = 0; -+ lc->ts = ctb_addr_ts; -+ return 0; -+} -+ -+static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal) -+{ -+ const GetBitContext * const gb = &s->HEVClc->gb; -+ RpiSliceHeader * const sh = &s->sh; -+ int i, j; -+ -+ const unsigned int length = nal->size; -+ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte -+ unsigned int cmpt; -+ unsigned int startheader; -+ -+ if (sh->num_entry_point_offsets == 0) { -+ s->data = NULL; -+ return 0; -+ } -+ -+ // offset in slice header includes emulation prevention bytes. -+ // Unfortunately those have been removed by the time we get here so we -+ // have to compensate. The nal layer keeps track of where they were. -+ // (A standalone sketch of this compensation is given below.)
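The loop that follows walks nal->skipped_bytes_pos[] to do that compensation. The same idea in isolation, with hypothetical names: count how many removed 0x03 escape bytes fell inside a raw-offset segment, so the segment can be shrunk to its unescaped size.

    // Sketch: count emulation-prevention bytes removed from the escaped
    // range [seg_start, seg_start + raw_len); each one found shrinks the
    // effective end of the segment by one byte, exactly as the loop below does.
    static unsigned int epb_in_segment(const int *skipped_pos, int n_skipped,
                                       unsigned int seg_start, unsigned int raw_len)
    {
        unsigned int end = seg_start + raw_len;
        unsigned int count = 0;
        for (int j = 0; j < n_skipped; j++) {
            if (skipped_pos[j] >= (int)seg_start && skipped_pos[j] < (int)end) {
                end--;
                count++;
            }
        }
        return count;
    }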
-+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) { -+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { -+ startheader--; -+ cmpt++; -+ } -+ } -+ -+ for (i = 1; i < sh->num_entry_point_offsets; i++) { -+ offset += (sh->entry_point_offset[i - 1] - cmpt); -+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) { -+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { -+ startheader--; -+ cmpt++; -+ } -+ } -+ if (sh->entry_point_offset[i] <= cmpt) { -+ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt; -+ sh->offset[i - 1] = offset; -+ } -+ -+ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt; -+ if (length < offset) { -+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->size[sh->num_entry_point_offsets - 1] = length - offset; -+ sh->offset[sh->num_entry_point_offsets - 1] = offset; -+ -+ // Remember data start pointer as we won't have nal later -+ s->data = nal->data; -+ return 0; -+} -+ -+ -+// Return -+// < 0 Error -+// 0 OK -+// -+// jb->ctu_ts_last < 0 Job still filling -+// jb->ctu_ts_last >= 0 Job ready -+ -+static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ HEVCRpiJob * const jb = lc->jb0; -+ int more_data = 1; -+ unsigned int ctb_addr_ts = lc->ts; -+ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size; -+ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size; -+ -+ lc->unit_done = 0; -+ -+ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) -+ { -+ int q_full; -+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; -+ -+ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); -+ -+ ff_hevc_rpi_cabac_init(s, lc, ctb_flags); -+ -+ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size); -+ -+ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; -+ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; -+ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+ -+ // Zap stashes if navail -+ if ((lc->ctb_avail & AVAIL_U) == 0) -+ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3); -+ if ((lc->ctb_avail & AVAIL_L) == 0) -+ { -+ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE); -+ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3); -+ } -+#if MVF_STASH_WIDTH > 64 -+ // Restore left mvf stash at start of tile if not at start of line -+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) -+ { -+ unsigned int i; -+ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0); -+ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); -+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) -+ { -+ *dst = *src++; -+ dst += MVF_STASH_WIDTH_PU; -+ } -+ } -+#endif -+ -+ // Set initial tu states -+ lc->tu.cu_qp_delta = 0; -+ lc->tu.is_cu_qp_delta_wanted = 0; -+ lc->tu.cu_chroma_qp_offset_wanted = 0; -+ -+ // Decode -+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0); -+ -+ if 
(ff_hevc_rpi_cabac_overflow(lc)) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n "); -+ more_data = AVERROR_INVALIDDATA; -+ } -+ -+ if (more_data < 0) { -+ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken -+ return more_data; -+ } -+ -+ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 || -+ (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0))) -+ { -+ if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 || -+ ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n "); -+ return -1; -+ } -+ } -+ -+ // --- Post CTB processing -+ -+ // Stash rpl top/left for deblock that needs to remember such things cross-slice -+ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList; -+ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList; -+ -+ if (!s->is_irap) -+ { -+ // Copy MVF up to up-left & stash to up -+ { -+ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1); -+ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE); -+ -+ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst); -+ -+ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE]; -+ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE); -+ } -+ // Stash sideways if end of tile line but not end of line (no point) -+ // ** Could/should do this @ end of fn -+#if MVF_STASH_WIDTH > 64 -+ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL) -+#endif -+ { -+ unsigned int i; -+ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0); -+ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); -+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) -+ { -+ *dst++ = *src; -+ src += MVF_STASH_WIDTH_PU; -+ } -+ } -+ } -+ -+ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0) -+ ff_hevc_rpi_save_states(s, lc); -+ -+ // Report progress so we can use our MVs in other frames -+ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0) -+ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1); -+ -+ // End of line || End of tile line || End of tile -+ // (EoL covers end of frame for our purposes here) -+ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0); -+ -+ // Allocate QPU chunks on fixed size 64 pel boundaries rather than -+ // whatever ctb_size is today. -+ // * We might quite like to continue to 64 pel vertical too but that -+ // currently confuses WPP -+ if (((x_ctb + ctb_size) & 63) == 0 || q_full) -+ { -+ int overflow = 0; -+ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0) -+ overflow = 1; -+ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0) -+ overflow = 1; -+ if (overflow) -+ { -+ // * This is very annoying (and slow) to cope with in WPP so -+ // we treat it as an error there (no known stream triggers this -+ // with the current buffer sizes). Non-wpp should cope fine. -+ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__); -+ q_full = 1; -+ } -+ } -+ -+ // Inc TS to next.
-+ ctb_addr_ts++; -+ ctb_addr_rs++; -+ x_ctb += ctb_size; -+ -+ if (q_full) -+ { -+ // Do job -+ // Prep for submission -+ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-incremented -+ job_gen_bounds(s, jb); -+ break; -+ } -+ -+ // If max_blocks started as 0 then this will never be true -+ if (--max_blocks == 0) -+ break; -+ } -+ -+ lc->unit_done = (more_data <= 0); -+ lc->ts = ctb_addr_ts; -+ return 0; -+} -+ -+static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n) -+{ -+ lc->context = s; -+ lc->jb0 = NULL; -+ lc->lc_n = n; -+ lc->bt_terminate = 0; -+ lc->bt_psem_out = NULL; -+ sem_init(&lc->bt_sem_in, 0, 0); -+} -+ -+#define TRACE_WPP 0 -+#if RPI_EXTRA_BIT_THREADS > 0 -+static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts) -+{ -+ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts]; -+ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]]; -+} -+ -+// Move local context parameters from an aux bit thread back to the main -+// thread at the end of a slice as processing is going to continue there. -+static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep) -+{ -+ if (src_lc == dst_lc) { -+ return; -+ } -+ -+ // Move the job -+ // We will still have an active job if the final line terminates early -+ // Dest should always be null by now -+ av_assert1(dst_lc->jb0 == NULL); -+ dst_lc->jb0 = src_lc->jb0; -+ src_lc->jb0 = NULL; -+ -+ // Always need to store where we are in the bitstream -+ dst_lc->ts = src_lc->ts; -+ dst_lc->gb = src_lc->gb; -+ // Cabac init request will be built at start of next slice -+ -+ // Need to store context if we might have a dependent seg -+ if (is_dep) -+ { -+ dst_lc->qPy_pred = src_lc->qPy_pred; -+ memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left)); -+ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state)); -+ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff)); -+ } -+} -+ -+static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc) -+{ -+ rpi_sem_wait(&lc->bt_sem_in); -+ return lc->bt_terminate; -+} -+ -+// Do one WPP line -+// Will not work correctly over horizontal tile boundaries - vertical should be OK -+static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first) -+{ -+ const int is_tile = lc->bt_is_tile; -+ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts]; -+ const unsigned int line = lc->bt_line_no; -+ const unsigned int line_inc = lc->bt_line_inc; -+ const int is_last = (line >= lc->bt_last_line); -+ -+ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width); -+ const unsigned int ts_next = -+ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ? -+ INT_MAX : -+ is_tile ? -+ s->ps.pps->tile_pos_ts[tile_id + line_inc] : -+ lc->ts + lc->bt_line_width * line_inc; -+ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work) -+ const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2; -+ unsigned int ts_prev; -+ int loop_n = 0; -+ int err = 0; -+ -+ av_assert1(line <= s->sh.num_entry_point_offsets); -+ -+#if TRACE_WPP -+ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__, -+ lc->lc_n, is_tile ?
"Tile" : "WPP", tile_id, -+ line, lc->bt_last_line, s->sh.num_entry_point_offsets, -+ lc->ts, ts_eol, ts_next, partial_size, lc->jb0); -+#endif -+ if (line != 0) -+ { -+ const uint8_t * const data = s->data + s->sh.offset[line - 1]; -+ const unsigned int len = s->sh.size[line - 1]; -+ if ((err = init_get_bits8(&lc->gb, data, len)) < 0) -+ return err; -+ -+ ff_init_cabac_decoder(&lc->cc, data, len); -+ } -+ -+ // We should never be processing a dependent slice here so reset is good -+ // ?? These probably shouldn't be needed (as they should be set by later -+ // logic) but do seem to be required -+ lc->qp_y = s->sh.slice_qp; -+ -+ do -+ { -+ if (!is_last && loop_n > 1) { -+#if TRACE_WPP -+ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out); -+#endif -+ sem_post(lc->bt_psem_out); -+ } -+ // The wait for loop_n == 0 has been done in bit_thread -+ if (!is_first && loop_n != 0) -+ { -+#if TRACE_WPP -+ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in); -+#endif -+ if (wait_bt_sem_in(lc) != 0) -+ return AVERROR_EXIT; -+ } -+ -+#if TRACE_WPP -+ { -+ int n; -+ sem_getvalue(&lc->bt_sem_in, &n); -+ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in); -+ } -+#endif -+ -+ ts_prev = lc->ts; -+ -+ // If we have had an error - do no further decode but do continue -+ // moving signals around so the other threads continue to operate -+ // correctly (or at least as correctly as they can with this line missing) -+ // -+ // Errors in WPP/Tile are less fatal than normal as we have a good idea -+ // of how to restart on the next line so there is no need to give up totally -+ if (err != 0) -+ { -+ lc->unit_done = 0; -+ lc->ts += partial_size; -+ } -+ else -+ { -+ worker_pass0_ready(s, lc); -+ -+ if ((err = fill_job(s, lc, partial_size)) < 0 || -+ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done))) -+ { -+ if (err == 0) { -+ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); -+ err = AVERROR_INVALIDDATA; -+ } -+ worker_free(s, lc); -+ lc->ts = ts_prev + partial_size; // Pretend we did all that -+ lc->unit_done = 0; -+ } -+ else if (is_tile) -+ { -+ worker_submit_job(s, lc); -+ } -+ } -+ -+ ++loop_n; -+ } while (lc->ts < ts_eol && !lc->unit_done); -+ -+ // If we are on the last line & we didn't get a whole line we must wait for -+ // and sink the sem_posts from the line above / tile to the left. 
-+ while ((ts_prev += partial_size) < ts_eol) -+ { -+#if TRACE_WPP -+ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in); -+#endif -+ if (wait_bt_sem_in(lc) != 0) -+ return AVERROR_EXIT; -+ } -+ -+ lc->bt_line_no += line_inc; -+ -+ if (!is_tile && err == 0) -+ worker_submit_job(s, lc); -+ -+ if (!is_last) { -+ lc->ts = ts_next; -+ -+#if TRACE_WPP -+ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out); -+#endif -+ sem_post(lc->bt_psem_out); -+ if (loop_n > 1) { -+#if TRACE_WPP -+ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out); -+#endif -+ sem_post(lc->bt_psem_out); -+ } -+ } -+ else -+ { -+ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT -+#if MVF_STASH_WIDTH > 64 -+ // Horrid calculations to work out what we want but luckily this should almost never execute -+ // **** Move to movlc -+ if (!s->is_irap) -+ { -+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts]; -+ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf -+ { -+ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1; -+ unsigned int i; -+ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); -+ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); -+ -+ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i) -+ { -+ *d_mvf = *s_mvf; -+ d_mvf += MVF_STASH_WIDTH_PU; -+ s_mvf += MVF_STASH_WIDTH_PU; -+ } -+ -+ } -+ } -+#endif -+ // When all done poke the thread 0 sem_in one final time -+#if TRACE_WPP -+ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in); -+#endif -+ sem_post(&s->HEVClcList[0]->bt_sem_in); -+ } -+ -+#if TRACE_WPP -+ printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag); -+#endif -+ return err; -+} -+ -+static void wpp_setup_lcs(HEVCRpiContext * const s) -+{ -+ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; -+ const unsigned int line_width = line_ts_width(s, ts); -+ -+ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i) -+ { -+ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; -+ lc->ts = ts; -+ lc->bt_is_tile = 0; -+ lc->bt_line_no = i; -+ lc->bt_line_width = line_width; -+ lc->bt_last_line = s->sh.num_entry_point_offsets; -+ lc->bt_line_inc = RPI_BIT_THREADS; -+ ts += line_width; -+ } -+} -+ -+ -+// Can only process a single tile row at once -+static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row) -+{ -+ const HEVCRpiPPS * const pps = s->ps.pps; -+ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; -+ const unsigned int tile0 = pps->tile_id[ts0]; -+ const unsigned int col0 = tile0 % pps->num_tile_columns; -+ -+ const unsigned int col = (slice_row == 0) ?
col0 : 0; -+ unsigned int line = slice_row * pps->num_tile_columns - col0 + col; -+ const unsigned int last_line = FFMIN( -+ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets); -+ -+ const unsigned int par = -+ FFMIN(RPI_BIT_THREADS, last_line + 1 - line); -+#if TRACE_WPP -+ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row, -+ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line); -+#endif -+ for (unsigned int i = 0; i != par; ++i, ++line) -+ { -+ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; -+ const unsigned int tile = tile0 + line; -+ -+ lc->ts = pps->tile_pos_ts[tile]; -+ lc->bt_line_no = line; -+ lc->bt_is_tile = 1; -+ lc->bt_line_width = line_ts_width(s, lc->ts); -+ lc->bt_last_line = last_line; -+ lc->bt_line_inc = par; -+ } -+} -+ -+ -+static void * bit_thread(void * v) -+{ -+ HEVCRpiLocalContext * const lc = v; -+ HEVCRpiContext *const s = lc->context; -+ -+ while (wait_bt_sem_in(lc) == 0) -+ { -+ int err; -+ -+ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp -+ if (lc->bt_terminate) { -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__); -+ break; -+ } -+ av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err); -+ } -+ } -+ -+ return NULL; -+} -+ -+static int bit_threads_start(HEVCRpiContext * const s) -+{ -+ if (s->bt_started) -+ return 0; -+ -+ for (int i = 1; i < RPI_BIT_THREADS; ++i) -+ { -+ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS] -+ if (s->HEVClcList[i] == NULL) { -+ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL) -+ return -1; -+ } -+ -+ bt_lc_init(s, s->HEVClcList[i], i); -+ job_lc_init(s->HEVClcList[i]); -+ } -+ -+ // Link the sems in a circle -+ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i) -+ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in; -+ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in; -+ -+ // Init all lc before starting any threads -+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) -+ { -+ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0) -+ return -1; -+ } -+ -+ s->bt_started = 1; -+ return 0; -+} -+ -+static int bit_threads_kill(HEVCRpiContext * const s) -+{ -+ if (!s->bt_started) -+ return 0; -+ s->bt_started = 0; -+ -+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) -+ { -+ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1]; -+ if (lc == NULL) -+ break; -+ -+ lc->bt_terminate = 1; -+ sem_post(&lc->bt_sem_in); -+ pthread_join(s->bit_threads[i], NULL); -+ -+ sem_destroy(&lc->bt_sem_in); -+ job_lc_kill(lc); -+ } -+ return 0; -+} -+#endif -+ -+ -+// If we are at EoT and the row is shorter than the number of jobs -+// we can Q we have to wait for it finish otherwise we risk cache/QPU -+// disasters -+static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n) -+{ -+ return -+ s->ps.pps->tile_wpp_inter_disable >= 2 && -+ s->sh.slice_type != HEVC_SLICE_I && -+ n >= 0 && -+ (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT; -+} -+ -+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread) -+{ -+ HEVCRpiContext * const s = avctxt->priv_data; -+ HEVCRpiLocalContext * const lc = s->HEVClc; -+ int err; -+ -+ // Start of slice -+ if ((err = slice_start(s, lc)) != 0) -+ return err; -+ -+#if RPI_EXTRA_BIT_THREADS > 0 -+ -+ if (s->sh.offload_tiles) -+ { -+ unsigned int slice_row = 0; 
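// The do/while below handles one row of tiles per pass:
// tile_one_row_setup_lcs() parcels the row's tiles out across the bit
// threads, the main thread decodes the first tile itself
// (rpi_run_one_line(s, lc, 1) seeds the semaphore ring), then services
// further tiles off its own bt_sem_in until the row's last line is done,
// and finally sinks one extra post so every worker has drained before the
// next row is set up.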
-+ -+#if TRACE_WPP -+ printf("%s: Do Tiles\n", __func__); -+#endif -+ // Generate & start extra bit threads if they aren't already running -+ bit_threads_start(s); -+ -+ do -+ { -+ // Reset lc lines etc. -+ tile_one_row_setup_lcs(s, slice_row); -+ -+#if TRACE_WPP -+ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n", -+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); -+#endif -+ -+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads -+#if TRACE_WPP -+ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n", -+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); -+#endif -+ -+ while (lc->bt_line_no <= lc->bt_last_line) { -+ rpi_sem_wait(&lc->bt_sem_in); -+ rpi_run_one_line(s, lc, 0); -+ } -+#if TRACE_WPP -+ printf("%s: Done body\n", __func__); -+#endif -+ -+ // Wait for everything else to finish -+ rpi_sem_wait(&lc->bt_sem_in); -+ -+ ++slice_row; -+ } while (lc->bt_last_line < s->sh.num_entry_point_offsets); -+ -+ -+#if TRACE_WPP -+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); -+#endif -+ } -+ else if (s->sh.offload_wpp) -+ { -+#if TRACE_WPP -+ printf("%s: Do WPP\n", __func__); -+#endif -+ // Generate & start extra bit threads if they aren't already running -+ bit_threads_start(s); -+ -+ // Reset lc lines etc. -+ wpp_setup_lcs(s); -+ -+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads -+#if TRACE_WPP -+ printf("%s: Done 1st\n", __func__); -+#endif -+ -+ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) { -+ rpi_sem_wait(&lc->bt_sem_in); -+ rpi_run_one_line(s, lc, 0); -+ } -+#if TRACE_WPP -+ printf("%s: Done body\n", __func__); -+#endif -+ -+ // Wait for everything else to finish -+ rpi_sem_wait(&lc->bt_sem_in); -+ -+#if TRACE_WPP -+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); -+#endif -+ } -+ else -+#endif -+ { -+#if TRACE_WPP -+ printf("%s: Single start: ts=%d\n", __func__, lc->ts); -+#endif -+ // Single bit thread -+ do { -+ // Make sure we have space to prepare the next job -+ worker_pass0_ready(s, lc); -+ -+ if ((err = fill_job(s, lc, 0)) < 0) -+ goto fail; -+ -+ worker_submit_job(s, lc); -+ -+ if (tile_needs_wait(s, lc->ts - 1)) -+ worker_wait(s, lc); -+ -+ } while (!lc->unit_done); -+ -+#if TRACE_WPP -+ printf("%s: Single end: ts=%d\n", __func__, lc->ts); -+#endif -+ } -+ -+ // If we have reached the end of the frame or -+ // then wait for the worker to finish all its jobs -+ if (lc->ts >= s->ps.sps->ctb_size) -+ worker_wait(s, lc); -+ -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ -+ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", -+ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, -+ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, -+ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, -+ ts->y_pred2_hgt16, ts->y_pred2_hle16); -+ memset(ts, 0, sizeof(*ts)); -+ } -+#endif -+ -+ return lc->ts; -+ -+fail: -+ // Cleanup -+ av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err); -+ // Free our job & wait for temination -+ worker_free(s, lc); -+ worker_wait(s, lc); -+ return err; -+} -+ -+ -+static void set_no_backward_pred(HEVCRpiContext * const s) -+{ -+ int i, j; -+ const RefPicList *const refPicList = s->refPicList; -+ -+ s->no_backward_pred_flag = 0; -+ if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag) -+ return; -+ -+ for (j = 0; j < 2; j++) { -+ for (i = 0; i < refPicList[j].nb_refs; 
i++) { -+ if (refPicList[j].list[i] > s->poc) { -+ s->no_backward_pred_flag = 1; -+ return; -+ } -+ } -+ } -+} -+ -+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal) -+{ -+ int err; -+ if ((err = gen_entry_points(s, nal)) < 0) -+ return err; -+ -+ set_no_backward_pred(s); -+ -+ return rpi_decode_entry(s->avctx, NULL); -+} -+ -+static int set_side_data(HEVCRpiContext *s) -+{ -+ AVFrame *out = s->ref->frame; -+ -+ if (s->sei.frame_packing.present && -+ s->sei.frame_packing.arrangement_type >= 3 && -+ s->sei.frame_packing.arrangement_type <= 5 && -+ s->sei.frame_packing.content_interpretation_type > 0 && -+ s->sei.frame_packing.content_interpretation_type < 3) { -+ AVStereo3D *stereo = av_stereo3d_create_side_data(out); -+ if (!stereo) -+ return AVERROR(ENOMEM); -+ -+ switch (s->sei.frame_packing.arrangement_type) { -+ case 3: -+ if (s->sei.frame_packing.quincunx_subsampling) -+ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX; -+ else -+ stereo->type = AV_STEREO3D_SIDEBYSIDE; -+ break; -+ case 4: -+ stereo->type = AV_STEREO3D_TOPBOTTOM; -+ break; -+ case 5: -+ stereo->type = AV_STEREO3D_FRAMESEQUENCE; -+ break; -+ } -+ -+ if (s->sei.frame_packing.content_interpretation_type == 2) -+ stereo->flags = AV_STEREO3D_FLAG_INVERT; -+ -+ if (s->sei.frame_packing.arrangement_type == 5) { -+ if (s->sei.frame_packing.current_frame_is_frame0_flag) -+ stereo->view = AV_STEREO3D_VIEW_LEFT; -+ else -+ stereo->view = AV_STEREO3D_VIEW_RIGHT; -+ } -+ } -+ -+ if (s->sei.display_orientation.present && -+ (s->sei.display_orientation.anticlockwise_rotation || -+ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) { -+ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16); -+ AVFrameSideData *rotation = av_frame_new_side_data(out, -+ AV_FRAME_DATA_DISPLAYMATRIX, -+ sizeof(int32_t) * 9); -+ if (!rotation) -+ return AVERROR(ENOMEM); -+ -+ av_display_rotation_set((int32_t *)rotation->data, angle); -+ av_display_matrix_flip((int32_t *)rotation->data, -+ s->sei.display_orientation.hflip, -+ s->sei.display_orientation.vflip); -+ } -+ -+ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 -+ // so the side data persists for the entire coded video sequence. 
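// For reference, the conversion below: the SEI carries chromaticities in
// increments of 0.00002 (hence chroma_den = 50000) and luminance in units
// of 0.0001 cd/m^2 (hence luma_den = 10000), with the primaries signalled
// in g,b,r order.  So a BT.2020 red primary of x = 0.708 arrives as the
// raw value 35400 in SEI slot 2 and lands in
// metadata->display_primaries[0][0] as 35400/50000, and a 1000 cd/m^2
// mastering peak arrives as the raw value 10000000 and lands in
// max_luminance as 10000000/10000.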
-+    if (s->sei.mastering_display.present > 0 &&
-+        IS_IRAP(s) && s->no_rasl_output_flag) {
-+        s->sei.mastering_display.present--;
-+    }
-+    if (s->sei.mastering_display.present) {
-+        // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
-+        const int mapping[3] = {2, 0, 1};
-+        const int chroma_den = 50000;
-+        const int luma_den = 10000;
-+        int i;
-+        AVMasteringDisplayMetadata *metadata =
-+            av_mastering_display_metadata_create_side_data(out);
-+        if (!metadata)
-+            return AVERROR(ENOMEM);
-+
-+        for (i = 0; i < 3; i++) {
-+            const int j = mapping[i];
-+            metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
-+            metadata->display_primaries[i][0].den = chroma_den;
-+            metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
-+            metadata->display_primaries[i][1].den = chroma_den;
-+        }
-+        metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
-+        metadata->white_point[0].den = chroma_den;
-+        metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
-+        metadata->white_point[1].den = chroma_den;
-+
-+        metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
-+        metadata->max_luminance.den = luma_den;
-+        metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
-+        metadata->min_luminance.den = luma_den;
-+        metadata->has_luminance = 1;
-+        metadata->has_primaries = 1;
-+
-+        av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
-+        av_log(s->avctx, AV_LOG_DEBUG,
-+               "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
-+               av_q2d(metadata->display_primaries[0][0]),
-+               av_q2d(metadata->display_primaries[0][1]),
-+               av_q2d(metadata->display_primaries[1][0]),
-+               av_q2d(metadata->display_primaries[1][1]),
-+               av_q2d(metadata->display_primaries[2][0]),
-+               av_q2d(metadata->display_primaries[2][1]),
-+               av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
-+        av_log(s->avctx, AV_LOG_DEBUG,
-+               "min_luminance=%f, max_luminance=%f\n",
-+               av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
-+    }
-+    // Decrement the content light flag when IRAP frame has no_rasl_output_flag=1
-+    // so the side data persists for the entire coded video sequence.
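// MaxCLL and MaxFALL below are carried as plain integers in cd/m^2; an
// HDR10 stream mastered at a 1000 nit peak would typically signal
// MaxCLL = 1000, with MaxFALL somewhere well under that.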
-+ if (s->sei.content_light.present > 0 && -+ IS_IRAP(s) && s->no_rasl_output_flag) { -+ s->sei.content_light.present--; -+ } -+ if (s->sei.content_light.present) { -+ AVContentLightMetadata *metadata = -+ av_content_light_metadata_create_side_data(out); -+ if (!metadata) -+ return AVERROR(ENOMEM); -+ metadata->MaxCLL = s->sei.content_light.max_content_light_level; -+ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level; -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n"); -+ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n", -+ metadata->MaxCLL, metadata->MaxFALL); -+ } -+ -+ if (s->sei.a53_caption.a53_caption) { -+ AVFrameSideData* sd = av_frame_new_side_data(out, -+ AV_FRAME_DATA_A53_CC, -+ s->sei.a53_caption.a53_caption_size); -+ if (sd) -+ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size); -+ av_freep(&s->sei.a53_caption.a53_caption); -+ s->sei.a53_caption.a53_caption_size = 0; -+ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS; -+ } -+ -+ if (s->sei.alternative_transfer.present && -+ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) && -+ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) { -+ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics; -+ } -+ -+ return 0; -+} -+ -+static int hevc_frame_start(HEVCRpiContext * const s) -+{ -+ int ret; -+ -+ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too -+ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); -+ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address)); -+ -+ // Only need to remember intra for CIP -+ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap) -+ s->is_intra = NULL; -+ else -+ { -+ s->is_intra = s->is_intra_store; -+ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); -+ } -+ -+ s->is_decoded = 0; -+ s->first_nal_type = s->nal_unit_type; -+ -+ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); -+ -+ if (s->pkt.nb_nals > s->rpl_tab_size) -+ { -+ // In most cases it will be faster to free & realloc as that doesn't -+ // require (an unwanted) copy -+ av_freep(&s->rpl_tab); -+ s->rpl_tab_size = 0; -+ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) -+ goto fail; -+ s->rpl_tab_size = s->pkt.nb_nals; -+ } -+ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab)); -+ -+ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc); -+ if (ret < 0) -+ goto fail; -+ -+ // Resize rpl_tab to max that we might want -+ ret = ff_hevc_rpi_frame_rps(s); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n"); -+ goto fail; -+ } -+ -+ s->ref->frame->key_frame = IS_IRAP(s); -+ -+ ret = set_side_data(s); -+ if (ret < 0) -+ goto fail; -+ -+ s->frame->pict_type = 3 - s->sh.slice_type; -+ -+ if (!IS_IRAP(s)) -+ ff_hevc_rpi_bump_frame(s); -+ -+ av_frame_unref(s->output_frame); -+ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0); -+ if (ret < 0) -+ goto fail; -+ -+ ff_thread_finish_setup(s->avctx); -+ -+ return 0; -+ -+fail: -+ if (s->ref) -+ ff_hevc_rpi_unref_frame(s, s->ref, ~0); -+ s->ref = NULL; -+ return ret; -+} -+ -+static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal) -+{ -+ GetBitContext * const gb = &s->HEVClc->gb; -+ int ctb_addr_ts, ret; -+ -+ *gb = nal->gb; -+ s->nal_unit_type = nal->type; -+ s->temporal_id = 
nal->temporal_id; -+ -+ switch (s->nal_unit_type) { -+ case HEVC_NAL_VPS: -+ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_SPS: -+ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps, -+ s->apply_defdispwin); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_PPS: -+ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_SEI_PREFIX: -+ case HEVC_NAL_SEI_SUFFIX: -+ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_TRAIL_R: -+ case HEVC_NAL_TRAIL_N: -+ case HEVC_NAL_TSA_N: -+ case HEVC_NAL_TSA_R: -+ case HEVC_NAL_STSA_N: -+ case HEVC_NAL_STSA_R: -+ case HEVC_NAL_BLA_W_LP: -+ case HEVC_NAL_BLA_W_RADL: -+ case HEVC_NAL_BLA_N_LP: -+ case HEVC_NAL_IDR_W_RADL: -+ case HEVC_NAL_IDR_N_LP: -+ case HEVC_NAL_CRA_NUT: -+ case HEVC_NAL_RADL_N: -+ case HEVC_NAL_RADL_R: -+ case HEVC_NAL_RASL_N: -+ case HEVC_NAL_RASL_R: -+ ret = hls_slice_header(s); -+ if (ret < 0) -+ return ret; -+ -+ // The definition of _N unit types is "non-reference for other frames -+ // with the same temporal_id" so they may/will be ref frames for pics -+ // with a higher temporal_id. -+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || -+ !(s->nal_unit_type == HEVC_NAL_TRAIL_N || -+ s->nal_unit_type == HEVC_NAL_TSA_N || -+ s->nal_unit_type == HEVC_NAL_STSA_N || -+ s->nal_unit_type == HEVC_NAL_RADL_N || -+ s->nal_unit_type == HEVC_NAL_RASL_N); -+ s->offload_recon = s->threads_type != 0 && s->used_for_ref; -+ s->is_irap = IS_IRAP(s); -+ -+#if DEBUG_DECODE_N -+ { -+ static int z = 0; -+ if (IS_IDR(s)) { -+ z = 1; -+ } -+ if (z != 0 && z++ > DEBUG_DECODE_N) { -+ s->is_decoded = 0; -+ break; -+ } -+ } -+#endif -+ if ( -+ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || -+ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) || -+ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || -+ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IDR(s))) -+ { -+ s->is_decoded = 0; -+ break; -+ } -+ -+ if (s->sh.first_slice_in_pic_flag) { -+ if (s->max_ra == INT_MAX) { -+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { -+ s->max_ra = s->poc; -+ } else { -+ if (IS_IDR(s)) -+ s->max_ra = INT_MIN; -+ } -+ } -+ -+ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) && -+ s->poc <= s->max_ra) { -+ s->is_decoded = 0; -+ break; -+ } else { -+ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra) -+ s->max_ra = INT_MIN; -+ } -+ -+ ret = hevc_frame_start(s); -+ if (ret < 0) -+ return ret; -+ } else if (!s->ref) { -+ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n"); -+ goto fail; -+ } -+ -+ if (s->nal_unit_type != s->first_nal_type) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Non-matching NAL types of the VCL NALUs: %d %d\n", -+ s->first_nal_type, s->nal_unit_type); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (!s->sh.dependent_slice_segment_flag && -+ s->sh.slice_type != HEVC_SLICE_I) { -+ ret = ff_hevc_rpi_slice_rpl(s); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Error constructing the reference lists for the current slice.\n"); -+ goto fail; -+ } -+ } -+ -+ ctb_addr_ts = hls_slice_data(s, nal); -+ if (ctb_addr_ts >= s->ps.sps->ctb_size) { -+ s->is_decoded = 1; -+ } -+ -+ if (ctb_addr_ts < 0) { -+ ret = ctb_addr_ts; -+ goto fail; -+ } -+ break; -+ case HEVC_NAL_EOS_NUT: -+ 
case HEVC_NAL_EOB_NUT: -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ break; -+ case HEVC_NAL_AUD: -+ case HEVC_NAL_FD_NUT: -+ break; -+ default: -+ av_log(s->avctx, AV_LOG_INFO, -+ "Skipping NAL unit %d\n", s->nal_unit_type); -+ } -+ -+ return 0; -+fail: -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return ret; -+ return 0; -+} -+ -+static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length) -+{ -+ int i, ret = 0; -+ int eos_at_start = 1; -+ -+ s->ref = NULL; -+ s->last_eos = s->eos; -+ s->eos = 0; -+ -+ /* split the input packet into NAL units, so we know the upper bound on the -+ * number of slices in the frame */ -+ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, -+ s->nal_length_size, s->avctx->codec_id, 0, 0); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Error splitting the input into NAL units.\n"); -+ return ret; -+ } -+ -+ for (i = 0; i < s->pkt.nb_nals; i++) { -+ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT || -+ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) { -+ if (eos_at_start) { -+ s->last_eos = 1; -+ } else { -+ s->eos = 1; -+ } -+ } else { -+ eos_at_start = 0; -+ } -+ } -+ -+ /* decode the NAL units */ -+ for (i = 0; i < s->pkt.nb_nals; i++) { -+ ret = decode_nal_unit(s, &s->pkt.nals[i]); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Error parsing NAL unit #%d.\n", i); -+ goto fail; -+ } -+ } -+ -+fail: // Also success path -+ if (s->ref != NULL) { -+ if (s->used_for_ref && s->threads_type != 0) { -+ ff_hevc_rpi_progress_signal_all_done(s); -+ } -+ else { -+ // Flush frame to real memory as we expect to be able to pass -+ // it straight on to mmal -+ flush_frame(s, s->frame); -+ } -+ } -+ return ret; -+} -+ -+static void print_md5(void *log_ctx, int level, uint8_t md5[16]) -+{ -+ int i; -+ for (i = 0; i < 16; i++) -+ av_log(log_ctx, level, "%02"PRIx8, md5[i]); -+} -+ -+static int verify_md5(HEVCRpiContext *s, AVFrame *frame) -+{ -+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); -+ int pixel_shift; -+ int i, j; -+ -+ if (!desc) -+ return AVERROR(EINVAL); -+ -+ pixel_shift = desc->comp[0].depth > 8; -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ", -+ s->poc); -+ -+ /* the checksums are LE, so we have to byteswap for >8bpp formats -+ * on BE arches */ -+#if HAVE_BIGENDIAN -+ if (pixel_shift && !s->checksum_buf) { -+ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size, -+ FFMAX3(frame->linesize[0], frame->linesize[1], -+ frame->linesize[2])); -+ if (!s->checksum_buf) -+ return AVERROR(ENOMEM); -+ } -+#endif -+ -+ for (i = 0; frame->data[i]; i++) { -+ int width = s->avctx->coded_width; -+ int height = s->avctx->coded_height; -+ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width; -+ int h = (i == 1 || i == 2) ? 
(height >> desc->log2_chroma_h) : height; -+ uint8_t md5[16]; -+ -+ av_md5_init(s->md5_ctx); -+ for (j = 0; j < h; j++) { -+ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); -+#if HAVE_BIGENDIAN -+ if (pixel_shift) { -+ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf, -+ (const uint16_t *) src, w); -+ src = s->checksum_buf; -+ } -+#endif -+ av_md5_update(s->md5_ctx, src, w << pixel_shift); -+ } -+ av_md5_final(s->md5_ctx, md5); -+ -+ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { -+ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); -+ print_md5(s->avctx, AV_LOG_DEBUG, md5); -+ av_log (s->avctx, AV_LOG_DEBUG, "; "); -+ } else { -+ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i); -+ print_md5(s->avctx, AV_LOG_ERROR, md5); -+ av_log (s->avctx, AV_LOG_ERROR, " != "); -+ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]); -+ av_log (s->avctx, AV_LOG_ERROR, "\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "\n"); -+ -+ return 0; -+} -+ -+static int all_sps_supported(const HEVCRpiContext * const s) -+{ -+ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ if (s->ps.sps_list[i] != NULL) -+ { -+ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; -+ if (!is_sps_supported(sps)) -+ return 0; -+ } -+ } -+ return 1; -+} -+ -+static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first) -+{ -+ int ret, i; -+ -+ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff, -+ &s->nal_length_size, s->avctx->err_recognition, -+ s->apply_defdispwin, s->avctx); -+ if (ret < 0) -+ return ret; -+ -+ /* export stream parameters from the first SPS */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ if (first && s->ps.sps_list[i]) { -+ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; -+ export_stream_params(s->avctx, &s->ps, sps); -+ break; -+ } -+ } -+ -+ return 0; -+} -+ -+static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output, -+ AVPacket *avpkt) -+{ -+ int ret; -+ int new_extradata_size; -+ uint8_t *new_extradata; -+ HEVCRpiContext *s = avctx->priv_data; -+ -+ if (!avpkt->size) { -+ ret = ff_hevc_rpi_output_frame(s, data, 1); -+ if (ret < 0) -+ return ret; -+ -+ *got_output = ret; -+ return 0; -+ } -+ -+ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, -+ &new_extradata_size); -+ if (new_extradata && new_extradata_size > 0) { -+ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0); -+ if (ret < 0) -+ return ret; -+ } -+ -+ s->ref = NULL; -+ ret = decode_nal_units(s, avpkt->data, avpkt->size); -+ if (ret < 0) -+ return ret; -+ -+ /* verify the SEI checksum */ -+ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded && -+ s->sei.picture_hash.is_md5) { -+ ret = verify_md5(s, s->ref->frame); -+ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) { -+ ff_hevc_rpi_unref_frame(s, s->ref, ~0); -+ return ret; -+ } -+ } -+ s->sei.picture_hash.is_md5 = 0; -+ -+ if (s->is_decoded) { -+ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc); -+ s->is_decoded = 0; -+ } -+ -+ if (s->output_frame->buf[0]) { -+ av_frame_move_ref(data, s->output_frame); -+ *got_output = 1; -+ } -+ -+ return avpkt->size; -+} -+ -+static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src) -+{ -+ int ret; -+ -+ ret = ff_thread_ref_frame(&dst->tf, &src->tf); -+ if (ret < 0) -+ return ret; -+ -+ if 
(src->col_mvf_buf != NULL) -+ { -+ dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf); -+ if (!dst->col_mvf_buf) -+ goto fail; -+ } -+ dst->col_mvf = src->col_mvf; -+ -+ dst->poc = src->poc; -+ dst->flags = src->flags; -+ dst->sequence = src->sequence; -+ return 0; -+ -+fail: -+ ff_hevc_rpi_unref_frame(s, dst, ~0); -+ return AVERROR(ENOMEM); -+} -+ -+ -+static av_cold int hevc_decode_free(AVCodecContext *avctx) -+{ -+ HEVCRpiContext * const s = avctx->priv_data; -+ int i; -+ -+ pic_arrays_free(s); -+ -+ av_freep(&s->md5_ctx); -+ -+ av_freep(&s->cabac_save); -+ -+#if RPI_EXTRA_BIT_THREADS -+ bit_threads_kill(s); -+#endif -+ -+ hevc_exit_worker(s); -+ vpu_qpu_term(); -+ for (i = 0; i != 2; ++i) { -+ ff_hevc_rpi_progress_kill_state(s->progress_states + i); -+ } -+ job_lc_kill(s->HEVClc); -+ -+ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] -+ av_freep(&s->sao_pixel_buffer_v[0]); -+ av_frame_free(&s->output_frame); -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); -+ av_frame_free(&s->DPB[i].frame); -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) -+ av_buffer_unref(&s->ps.vps_list[i]); -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) -+ av_buffer_unref(&s->ps.sps_list[i]); -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) -+ av_buffer_unref(&s->ps.pps_list[i]); -+ s->ps.sps = NULL; -+ s->ps.pps = NULL; -+ s->ps.vps = NULL; -+ -+ // Free separately from sLists as used that way by RPI WPP -+ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) { -+ av_freep(s->HEVClcList + i); -+ } -+ s->HEVClc = NULL; // Allocated as part of HEVClcList -+ -+ ff_h2645_packet_uninit(&s->pkt); -+ -+ // This must be after we free off the DPB -+ // * If the outer code is still holding any frames hopefully it will -+ // have its own ref to zc -+ av_rpi_zc_uninit(avctx); -+ -+ return 0; -+} -+ -+ -+static av_cold int hevc_init_context(AVCodecContext *avctx) -+{ -+ HEVCRpiContext *s = avctx->priv_data; -+ int i; -+ -+ s->avctx = avctx; -+ -+ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext)); -+ if (!s->HEVClc) -+ goto fail; -+ s->HEVClcList[0] = s->HEVClc; -+ -+ // Whilst FFmpegs init fn is only called once the close fn is called as -+ // many times as we have threads (init_thread_copy is called for the -+ // threads). 
So to match init & term put the init here where it will be -+ // called by both init & copy -+ av_rpi_zc_init(avctx); -+ -+ if (vpu_qpu_init() != 0) -+ goto fail; -+ -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ { -+ static const uint32_t dframe[1] = {0x80808080}; -+ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; -+ } -+#endif -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+ s->qpu_dummy_frame_qpu = qpu_dummy(); -+#endif -+ -+ bt_lc_init(s, s->HEVClc, 0); -+ job_lc_init(s->HEVClc); -+ -+ for (i = 0; i != 2; ++i) { -+ ff_hevc_rpi_progress_init_state(s->progress_states + i); -+ } -+ -+ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL) -+ goto fail; -+ -+ if ((s->output_frame = av_frame_alloc()) == NULL) -+ goto fail; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ s->DPB[i].frame = av_frame_alloc(); -+ if (!s->DPB[i].frame) -+ goto fail; -+ s->DPB[i].tf.f = s->DPB[i].frame; -+ s->DPB[i].dpb_no = i; -+ } -+ -+ s->max_ra = INT_MAX; -+ -+ if ((s->md5_ctx = av_md5_alloc()) == NULL) -+ goto fail; -+ -+ s->context_initialized = 1; -+ s->eos = 0; -+ -+ ff_hevc_rpi_reset_sei(&s->sei); -+ -+ return 0; -+ -+fail: -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__); -+ hevc_decode_free(avctx); -+ return AVERROR(ENOMEM); -+} -+ -+static int hevc_update_thread_context(AVCodecContext *dst, -+ const AVCodecContext *src) -+{ -+ HEVCRpiContext *s = dst->priv_data; -+ HEVCRpiContext *s0 = src->priv_data; -+ int i, ret; -+ -+ if (!s->context_initialized) { -+ ret = hevc_init_context(dst); -+ if (ret < 0) -+ return ret; -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); -+ if (s0->DPB[i].frame->buf[0]) { -+ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]); -+ if (ret < 0) -+ return ret; -+ } -+ } -+ -+ if (s->ps.sps != s0->ps.sps) -+ s->ps.sps = NULL; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) { -+ av_buffer_unref(&s->ps.vps_list[i]); -+ if (s0->ps.vps_list[i]) { -+ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]); -+ if (!s->ps.vps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ av_buffer_unref(&s->ps.sps_list[i]); -+ if (s0->ps.sps_list[i]) { -+ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]); -+ if (!s->ps.sps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) { -+ av_buffer_unref(&s->ps.pps_list[i]); -+ if (s0->ps.pps_list[i]) { -+ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]); -+ if (!s->ps.pps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ if (s->ps.sps != s0->ps.sps) -+ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0) -+ return ret; -+ -+ s->seq_decode = s0->seq_decode; -+ s->seq_output = s0->seq_output; -+ s->pocTid0 = s0->pocTid0; -+ s->max_ra = s0->max_ra; -+ s->eos = s0->eos; -+ s->no_rasl_output_flag = s0->no_rasl_output_flag; -+ -+ s->is_nalff = s0->is_nalff; -+ s->nal_length_size = s0->nal_length_size; -+ -+ s->threads_type = s0->threads_type; -+ -+ if (s0->eos) { -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ } -+ -+ s->sei.frame_packing = s0->sei.frame_packing; -+ s->sei.display_orientation = s0->sei.display_orientation; -+ s->sei.mastering_display = s0->sei.mastering_display; -+ s->sei.content_light = s0->sei.content_light; -+ s->sei.alternative_transfer = s0->sei.alternative_transfer; -+ -+ // * We do this here as it allows us to easily locate our parents -+ // global job pool, but there really should be a less nasty way -+ if (s->jbc == 
NULL) -+ { -+ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL); -+ hevc_init_worker(s); -+ } -+ -+ return 0; -+} -+ -+static av_cold int hevc_decode_init(AVCodecContext *avctx) -+{ -+ HEVCRpiContext *s = avctx->priv_data; -+ int ret; -+ -+ avctx->internal->allocate_progress = 1; -+ -+ { -+ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5)); -+ if (jbg == NULL) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__); -+ return -1; -+ } -+ -+ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__); -+ return -1; -+ } -+ } -+ -+ ret = hevc_init_context(avctx); -+ if (ret < 0) -+ return ret; -+ -+ hevc_init_worker(s); -+ -+ s->sei.picture_timing.picture_struct = 0; -+ s->eos = 1; -+ -+ atomic_init(&s->wpp_err, 0); -+ -+ if (avctx->extradata_size > 0 && avctx->extradata) { -+ ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1); -+ -+ if (ret == 0 && !all_sps_supported(s)) -+ ret = AVERROR_DECODER_NOT_FOUND; -+ -+ if (ret < 0) -+ { -+ hevc_decode_free(avctx); -+ return ret; -+ } -+ } -+ -+ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) -+ s->threads_type = FF_THREAD_FRAME; -+ else -+ s->threads_type = 0; -+ -+ return 0; -+} -+ -+static av_cold int hevc_init_thread_copy(AVCodecContext *avctx) -+{ -+ HEVCRpiContext *s = avctx->priv_data; -+ int ret; -+ -+ memset(s, 0, sizeof(*s)); -+ -+ ret = hevc_init_context(avctx); -+ if (ret < 0) -+ return ret; -+ -+ return 0; -+} -+ -+static void hevc_decode_flush(AVCodecContext *avctx) -+{ -+ HEVCRpiContext *s = avctx->priv_data; -+ ff_hevc_rpi_flush_dpb(s); -+ s->max_ra = INT_MAX; -+ s->eos = 1; -+} -+ -+#define OFFSET(x) offsetof(HEVCRpiContext, x) -+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) -+ -+ -+static const AVOption options[] = { -+ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin), -+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, -+ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin), -+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, -+ { NULL }, -+}; -+ -+static const AVClass hevc_rpi_decoder_class = { -+ .class_name = "HEVC RPI decoder", -+ .item_name = av_default_item_name, -+ .option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+}; -+ -+static const enum AVPixelFormat hevc_rpi_pix_fmts[] = { -+ AV_PIX_FMT_SAND128, -+ AV_PIX_FMT_SAND64_10, -+ AV_PIX_FMT_NONE -+}; -+ -+//static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = { -+// HW_CONFIG_INTERNAL(HEVC_RPI), -+// NULL -+//}; -+ -+ -+AVCodec ff_hevc_rpi_decoder = { -+ .name = "hevc_rpi", -+ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"), -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .priv_data_size = sizeof(HEVCRpiContext), -+ .priv_class = &hevc_rpi_decoder_class, -+ .init = hevc_decode_init, -+ .close = hevc_decode_free, -+ .decode = hevc_rpi_decode_frame, -+ .flush = hevc_decode_flush, -+ .update_thread_context = hevc_update_thread_context, -+ .init_thread_copy = hevc_init_thread_copy, -+ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | -+// AV_CODEC_CAP_HARDWARE | -+#if 0 -+ // Debugging is often easier without threads getting in the way -+ 0, -+#warning H265 threading turned off -+#else -+ // We only have decent optimisation for frame - so only admit to that -+ AV_CODEC_CAP_FRAME_THREADS, -+#endif -+ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_EXPORTS_CROPPING, -+ 
.pix_fmts = hevc_rpi_pix_fmts, -+ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), -+// .hw_configs = hevc_rpi_hw_configs, -+// .wrapper_name = "hevc_rpi", -+}; -+ -diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h -new file mode 100644 -index 0000000000..d324aa273c ---- /dev/null -+++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,1087 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVCDEC_H -+#define AVCODEC_RPI_HEVCDEC_H -+ -+#include "config.h" -+ -+#include -+ -+#include "libavutil/buffer.h" -+ -+#include "avcodec.h" -+#include "bswapdsp.h" -+#include "cabac.h" -+#include "get_bits.h" -+#include "rpi_hevcpred.h" -+#include "h2645_parse.h" -+#include "hevc.h" -+#include "rpi_hevc_mv.h" -+#include "rpi_hevc_ps.h" -+#include "rpi_hevc_sei.h" -+#include "rpi_hevcdsp.h" -+#include "internal.h" -+#include "thread.h" -+#include "videodsp.h" -+ -+#if ARCH_ARM -+#include "arm/rpi_hevc_misc_neon.h" -+#endif -+ -+#define MAX_NB_THREADS 16 -+#define SHIFT_CTB_WPP 2 -+ -+//TODO: check if this is really the maximum -+#define MAX_TRANSFORM_DEPTH 5 -+ -+#define MAX_TB_SIZE 32 -+#define MAX_QP 51 -+#define DEFAULT_INTRA_TC_OFFSET 2 -+ -+#define HEVC_CONTEXTS 199 -+ -+#define MRG_MAX_NUM_CANDS 5 -+ -+#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64 -+ -+// Size of DPB array -+#define HEVC_DPB_ELS 32 -+ -+#define L0 0 -+#define L1 1 -+ -+#define EPEL_EXTRA_BEFORE 1 -+#define EPEL_EXTRA_AFTER 2 -+#define EPEL_EXTRA 3 -+#define QPEL_EXTRA_BEFORE 3 -+#define QPEL_EXTRA_AFTER 4 -+#define QPEL_EXTRA 7 -+ -+#define EDGE_EMU_BUFFER_STRIDE 80 -+ -+#include -+#include "rpi_qpu.h" -+ -+// Max jobs per frame thread. Actual usage will be limited by the size -+// of the global job pool -+// ?? Limits -+#define RPI_MAX_JOBS 8 -+ -+// This is the number of _extra_ bit threads - we will have -+// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing -+// -+// 0 is legitimate and will disable our WPP processing -+//#define RPI_EXTRA_BIT_THREADS 0 -+#define RPI_EXTRA_BIT_THREADS 2 -+ -+// Number of separate threads/passes in worker -+// 2 and 3 are the currently valid numbers -+// At the moment 3 seems fractionally faster -+//#define RPI_PASSES 2 -+#define RPI_PASSES 3 -+ -+// Print out various usage stats -+#define RPI_TSTATS 0 -+ -+// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form -+#define RPI_COMPRESS_COEFFS 1 -+ -+// Wait for VPU/QPU to finish in worker pass 0 -+// If 0 then the wait is in pass 1 -+// -+// One might expect the better place to wait would be in pass 1 however -+// testing shows that pass 0 produces overall faster decode. 
-+// Interestingly it is QPU/VPU limited streams that seem to suffer -+// from pass 1 waits, CPU limited ones tend to show a very mild gain. -+// This define exists so it is easy to test this. -+#define RPI_WORKER_WAIT_PASS_0 1 -+ -+// Use ARM emulation of QPU pred -+// These are for debug only as the emulation makes only limited -+// effort to be fast -+#define RPI_QPU_EMU_Y 0 -+#define RPI_QPU_EMU_C 0 -+ -+// Max width & height we are prepared to consider -+// Sand frame shape calc becomes confused with large frames -+// Some buffer alloc also depends on this -+#define HEVC_RPI_MAX_WIDTH 2048 -+#define HEVC_RPI_MAX_HEIGHT 1088 -+ -+ -+// Min CTB size is 16 -+#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) -+ -+/** -+ * Value of the luma sample at position (x, y) in the 2D array tab. -+ */ -+#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)]) -+#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)]) -+ -+#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP) -+#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \ -+ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP) -+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23) -+ -+enum RPSType { -+ ST_CURR_BEF = 0, -+ ST_CURR_AFT, -+ ST_FOLL, -+ LT_CURR, -+ LT_FOLL, -+ NB_RPS_TYPE, -+}; -+ -+enum SyntaxElement { -+ SAO_MERGE_FLAG = 0, -+ SAO_TYPE_IDX, -+ SAO_EO_CLASS, -+ SAO_BAND_POSITION, -+ SAO_OFFSET_ABS, -+ SAO_OFFSET_SIGN, -+ END_OF_SLICE_FLAG, -+ SPLIT_CODING_UNIT_FLAG, -+ CU_TRANSQUANT_BYPASS_FLAG, -+ SKIP_FLAG, -+ CU_QP_DELTA, -+ PRED_MODE_FLAG, -+ PART_MODE, -+ PCM_FLAG, -+ PREV_INTRA_LUMA_PRED_FLAG, -+ MPM_IDX, -+ REM_INTRA_LUMA_PRED_MODE, -+ INTRA_CHROMA_PRED_MODE, -+ MERGE_FLAG, -+ MERGE_IDX, -+ INTER_PRED_IDC, -+ REF_IDX_L0, -+ REF_IDX_L1, -+ ABS_MVD_GREATER0_FLAG, -+ ABS_MVD_GREATER1_FLAG, -+ ABS_MVD_MINUS2, -+ MVD_SIGN_FLAG, -+ MVP_LX_FLAG, -+ NO_RESIDUAL_DATA_FLAG, -+ SPLIT_TRANSFORM_FLAG, -+ CBF_LUMA, -+ CBF_CB_CR, -+ TRANSFORM_SKIP_FLAG, -+ EXPLICIT_RDPCM_FLAG, -+ EXPLICIT_RDPCM_DIR_FLAG, -+ LAST_SIGNIFICANT_COEFF_X_PREFIX, -+ LAST_SIGNIFICANT_COEFF_Y_PREFIX, -+ LAST_SIGNIFICANT_COEFF_X_SUFFIX, -+ LAST_SIGNIFICANT_COEFF_Y_SUFFIX, -+ SIGNIFICANT_COEFF_GROUP_FLAG, -+ SIGNIFICANT_COEFF_FLAG, -+ COEFF_ABS_LEVEL_GREATER1_FLAG, -+ COEFF_ABS_LEVEL_GREATER2_FLAG, -+ COEFF_ABS_LEVEL_REMAINING, -+ COEFF_SIGN_FLAG, -+ LOG2_RES_SCALE_ABS, -+ RES_SCALE_SIGN_FLAG, -+ CU_CHROMA_QP_OFFSET_FLAG, -+ CU_CHROMA_QP_OFFSET_IDX, -+}; -+ -+enum PartMode { -+ PART_2Nx2N = 0, -+ PART_2NxN = 1, -+ PART_Nx2N = 2, -+ PART_NxN = 3, -+ PART_2NxnU = 4, -+ PART_2NxnD = 5, -+ PART_nLx2N = 6, -+ PART_nRx2N = 7, -+}; -+ -+enum PredMode { -+ MODE_INTER = 0, -+ MODE_INTRA, -+ MODE_SKIP, -+}; -+ -+enum InterPredIdc { -+ PRED_L0 = 0, -+ PRED_L1, -+ PRED_BI, -+}; -+ -+enum PredFlag { -+ PF_INTRA = 0, -+ PF_L0, -+ PF_L1, -+ PF_BI, -+}; -+ -+enum SAOType { -+ SAO_NOT_APPLIED = 0, -+ SAO_BAND, -+ SAO_EDGE, -+ SAO_APPLIED -+}; -+ -+enum SAOEOClass { -+ SAO_EO_HORIZ = 0, -+ SAO_EO_VERT, -+ SAO_EO_135D, -+ SAO_EO_45D, -+}; -+ -+enum ScanType { -+ SCAN_DIAG = 0, -+ SCAN_HORIZ, -+ SCAN_VERT, -+}; -+ -+typedef struct RefPicList { -+ struct HEVCRpiFrame *ref[HEVC_MAX_REFS]; -+ int list[HEVC_MAX_REFS]; -+ uint8_t isLongTerm[HEVC_MAX_REFS]; -+ int nb_refs; -+} RefPicList; -+ -+typedef struct RefPicListTab { -+ RefPicList refPicList[2]; -+} RefPicListTab; -+ -+typedef struct RpiCodingUnit { -+ unsigned 
int x; // Passed to deblock -+ unsigned int y; -+ unsigned int x_split; -+ unsigned int y_split; -+ -+ enum PredMode pred_mode; ///< PredMode -+ enum PartMode part_mode; ///< PartMode -+ -+ // Inferred parameters -+ uint8_t intra_split_flag; ///< IntraSplitFlag -+ uint8_t max_trafo_depth; ///< MaxTrafoDepth -+ uint8_t cu_transquant_bypass_flag; -+} RpiCodingUnit; -+ -+typedef struct RpiPredictionUnit { -+ uint8_t intra_pred_mode[4]; -+ uint8_t intra_pred_mode_c[4]; -+ uint8_t chroma_mode_c[4]; -+ uint8_t merge_flag; -+} RpiPredictionUnit; -+ -+typedef struct HEVCRpiTransformUnit { -+ int8_t cu_qp_delta; -+ -+ // Inferred parameters; -+ uint8_t intra_pred_mode; -+ uint8_t intra_pred_mode_c; -+ uint8_t chroma_mode_c; -+ uint8_t is_cu_qp_delta_wanted; -+ uint8_t cu_chroma_qp_offset_wanted; -+ const int8_t * qp_divmod6[3]; -+} HEVCRpiTransformUnit; -+ -+typedef struct DBParams { -+ int8_t beta_offset; // -12 to +12 -+ int8_t tc_offset; // -12 to +12 -+} DBParams; -+ -+#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) -+#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1) -+#define HEVC_FRAME_FLAG_LONG_REF (1 << 2) -+#define HEVC_FRAME_FLAG_BUMPING (1 << 3) -+ -+struct HEVCRpiJob; -+ -+typedef struct HEVCRpiFrame { -+ AVFrame *frame; -+ ThreadFrame tf; -+ ColMvField *col_mvf; -+ int poc; -+ struct HEVCRpiFrame *collocated_ref; -+ -+ AVBufferRef *col_mvf_buf; -+ -+ /** -+ * A sequence counter, so that old frames are output first -+ * after a POC reset -+ */ -+ uint16_t sequence; -+ -+ /** -+ * A combination of HEVC_FRAME_FLAG_* -+ */ -+ uint8_t flags; -+ -+ // Entry no in DPB - can be used as a small unique -+ // frame identifier (within the current thread) -+ uint8_t dpb_no; -+} HEVCRpiFrame; -+ -+typedef struct HEVCRpiLocalContext { -+ HEVCRpiTransformUnit tu; -+ -+ CABACContext cc; -+ -+ // Vars that allow us to locate everything from just an lc -+ struct HEVCRpiContext * context; // ??? make const ??? -+ unsigned int lc_n; // lc list el no -+ -+ // Job wait links -+ struct HEVCRpiLocalContext * jw_next; -+ struct HEVCRpiLocalContext * jw_prev; -+ struct HEVCRpiLocalContext * ljw_next; -+ struct HEVCRpiLocalContext * ljw_prev; -+ struct HEVCRpiJob * volatile jw_job; -+ sem_t jw_sem; -+ -+ // ?? Wrap in structure ?? -+ sem_t bt_sem_in; -+ sem_t * bt_psem_out; -+ volatile int bt_terminate; -+ unsigned int ts; -+ unsigned int bt_last_line; // Last line in this bit_thread chunk -+ unsigned int bt_line_no; -+ unsigned int bt_line_width; -+ unsigned int bt_line_inc; -+ -+ struct HEVCRpiJob * jb0; -+ char unit_done; // Set once we have dealt with this slice -+ char bt_is_tile; -+ char last_progress_good; -+ char cabac_init_req; -+ -+ uint8_t cabac_state[HEVC_CONTEXTS]; -+ uint8_t stat_coeff[4]; -+ GetBitContext gb; -+ -+ uint8_t ct_depth; -+ int8_t qp_y; -+ int8_t curr_qp_y; -+ int8_t qPy_pred; -+ -+// N.B. 
Used by asm (neon) - do not change -+#define AVAIL_S_UR 0 -+#define AVAIL_S_U 1 -+#define AVAIL_S_UL 2 -+#define AVAIL_S_L 3 -+#define AVAIL_S_DL 4 -+ -+#define AVAIL_U (1 << AVAIL_S_U) -+#define AVAIL_L (1 << AVAIL_S_L) -+#define AVAIL_UL (1 << AVAIL_S_UL) -+#define AVAIL_UR (1 << AVAIL_S_UR) -+#define AVAIL_DL (1 << AVAIL_S_DL) -+ -+ uint8_t ctb_avail; -+ int end_of_ctb_x; -+ int end_of_ctb_y; -+ -+ RpiCodingUnit cu; -+ RpiPredictionUnit pu; -+ -+#define BOUNDARY_LEFT_SLICE (1 << 0) -+#define BOUNDARY_LEFT_TILE (1 << 1) -+#define BOUNDARY_UPPER_SLICE (1 << 2) -+#define BOUNDARY_UPPER_TILE (1 << 3) -+ /* properties of the boundary of the current CTB for the purposes -+ * of the deblocking filter */ -+ unsigned int boundary_flags; -+ -+#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE) -+ uint8_t ipm_left[IPM_TAB_SIZE]; -+ uint8_t ipm_up[IPM_TAB_SIZE]; -+ -+//#define MVF_STASH_WIDTH 128 -+#define MVF_STASH_WIDTH 64 -+#define MVF_STASH_HEIGHT 64 -+#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE) -+#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE) -+ HEVCRpiMvField mvf_ul[1]; -+ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; -+ -+ /* +7 is for subpixel interpolation, *2 for high bit depths */ -+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; -+ /* The extended size between the new edge emu buffer is abused by SAO */ -+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; -+// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); -+ -+} HEVCRpiLocalContext; -+ -+// Each block can have an intra prediction and an add_residual command -+// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH -+ -+// Sand only has 2 planes (Y/C) -+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4)) -+ -+// Command for intra prediction and transform_add of predictions to coefficients -+enum rpi_pred_cmd_e -+{ -+ RPI_PRED_ADD_RESIDUAL, -+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx -+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx -+ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V -+ RPI_PRED_ADD_DC, -+ RPI_PRED_ADD_DC_U, // Both U & V are effectively C -+ RPI_PRED_ADD_DC_V, -+ RPI_PRED_INTRA, -+ RPI_PRED_INTRA_C, -+ RPI_PRED_I_PCM, -+ RPI_PRED_CMD_MAX -+}; -+ -+typedef struct HEVCPredCmd { -+ uint8_t type; -+ uint8_t size; // log2 "size" used by all variants -+ uint8_t avail; // i_pred - but left here as they pack well -+ uint8_t dummy; -+ union { -+ struct { // TRANSFORM_ADD -+ uint8_t * dst; -+ const int16_t * buf; -+ uint16_t stride; // Should be good enough for all pic fmts we use -+ int16_t dc; -+ } ta; -+ struct { -+ uint8_t * dst; -+ uint32_t stride; -+ int dc; -+ } dc; -+ struct { // INTRA -+ uint16_t x; -+ uint16_t y; -+ enum IntraPredMode mode; -+ } i_pred; -+ struct { // I_PCM -+ uint16_t x; -+ uint16_t y; -+ const void * src; -+ uint32_t src_len; -+ } i_pcm; -+ }; -+} HEVCPredCmd; -+ -+union qpu_mc_pred_cmd_s; -+struct qpu_mc_pred_y_p_s; -+struct qpu_mc_src_s; -+ -+typedef struct HEVCRpiInterPredQ -+{ -+ union qpu_mc_pred_cmd_u *qpu_mc_base; -+ union qpu_mc_pred_cmd_u *qpu_mc_curr; -+ struct qpu_mc_src_s *last_l0; -+ struct qpu_mc_src_s *last_l1; -+ unsigned int load; -+ uint32_t code_setup; -+ uint32_t code_sync; -+ uint32_t code_exit; -+} HEVCRpiInterPredQ; -+ -+typedef struct HEVCRpiInterPredEnv -+{ -+ HEVCRpiInterPredQ * q; -+ uint8_t n; // Number of Qs -+ uint8_t 
n_grp; // Number of Q in a group -+ uint8_t curr; // Current Q number (0..n-1) -+ uint8_t used; // 0 if nothing in any Q, 1 otherwise -+ uint8_t used_grp; // 0 if nothing in any Q in the current group -+ unsigned int max_fill; -+ unsigned int min_gap; -+ GPU_MEM_PTR_T gptr; -+} HEVCRpiInterPredEnv; -+ -+typedef struct HEVCRpiIntraPredEnv { -+ unsigned int n; // Number of commands -+ HEVCPredCmd * cmds; -+} HEVCRpiIntraPredEnv; -+ -+typedef struct HEVCRpiCoeffEnv { -+ unsigned int n; -+#if RPI_COMPRESS_COEFFS -+ unsigned int packed; // Equal to 1 if coefficients should be being packed -+ unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0 -+#endif -+ int16_t * buf; -+} HEVCRpiCoeffEnv; -+ -+typedef struct HEVCRpiCoeffsEnv { -+ HEVCRpiCoeffEnv s[4]; -+ GPU_MEM_PTR_T gptr; -+ void * mptr; -+} HEVCRpiCoeffsEnv; -+ -+typedef struct HEVCRpiFrameProgressWait { -+ int req; -+ struct HEVCRpiFrameProgressWait * next; -+ sem_t sem; -+} HEVCRpiFrameProgressWait; -+ -+typedef struct HEVCRpiFrameProgressState { -+ struct HEVCRpiFrameProgressWait * first; -+ struct HEVCRpiFrameProgressWait * last; -+ pthread_mutex_t lock; -+} HEVCRpiFrameProgressState; -+ -+typedef struct RpiBlk -+{ -+ unsigned int x; -+ unsigned int y; -+ unsigned int w; -+ unsigned int h; -+} RpiBlk; -+ -+typedef struct HEVCRpiJob { -+ struct HEVCRpiJob * next; // Free chain -+ struct HEVCRpiJobCtl * jbc_local; -+ const HEVCRpiSPS * sps; // sps used to set up this job -+ -+ int waited; -+ int ctu_ts_first; -+ int ctu_ts_last; -+ RpiBlk bounds; // Bounding box of job -+ -+ struct qpu_mc_pred_y_p_s * last_y8_p; -+ struct qpu_mc_src_s * last_y8_l1; -+ rpi_cache_flush_env_t * rfe; -+ -+ HEVCRpiInterPredEnv chroma_ip; -+ HEVCRpiInterPredEnv luma_ip; -+ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no -+ HEVCRpiIntraPredEnv intra; -+ HEVCRpiCoeffsEnv coeffs; -+ HEVCRpiFrameProgressWait progress_wait; -+ sem_t sem; -+ rpi_cache_buf_t flush_buf; -+} HEVCRpiJob; -+ -+struct HEVCRpiContext; -+ -+typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb); -+ -+typedef struct HEVCRpiPassQueue -+{ -+// int pending; -+ volatile int terminate; -+ sem_t sem_in; -+ sem_t * psem_out; -+ unsigned int job_n; -+ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread -+ HEVCRpiWorkerFn * worker; -+ pthread_t thread; -+ uint8_t pass_n; // Pass number - debug -+ uint8_t started; -+} HEVCRpiPassQueue; -+ -+ -+struct HEVCRpiJobGlobal; -+ -+typedef struct HEVCRpiJobCtl -+{ -+ sem_t sem_out; -+ -+ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated -+ struct HEVCRpiJobGlobal * jbg; -+ -+ HEVCRpiLocalContext * lcw_head; -+ HEVCRpiLocalContext * lcw_tail; -+ -+ pthread_mutex_t in_lock; -+ int offload_in; -+ -+ HEVCRpiJob *offloadq[RPI_MAX_JOBS]; -+} HEVCRpiJobCtl; -+ -+ -+typedef struct HEVCRpiJobGlobal -+{ -+ intptr_t ref_count; -+ pthread_mutex_t lock; -+ HEVCRpiJob * free1; // Singly linked list of free jobs -+ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job -+ HEVCRpiLocalContext * wait_good; // Last good tail -+ HEVCRpiLocalContext * wait_tail; -+ -+} HEVCRpiJobGlobal; -+ -+#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) -+ -+#if RPI_TSTATS -+typedef struct HEVCRpiStats { -+ int y_pred1_y8_merge; -+ int y_pred1_xy; -+ int y_pred1_x0; -+ int y_pred1_y0; -+ int y_pred1_x0y0; -+ int y_pred1_wle8; -+ int 
y_pred1_wgt8; -+ int y_pred1_hle16; -+ int y_pred1_hgt16; -+ int y_pred2_xy; -+ int y_pred2_x0; -+ int y_pred2_y0; -+ int y_pred2_x0y0; -+ int y_pred2_hle16; -+ int y_pred2_hgt16; -+} HEVCRpiStats; -+#endif -+ -+typedef struct HEVCRpiCabacState -+{ -+ uint8_t rice[4]; -+ uint8_t state[HEVC_CONTEXTS]; -+} HEVCRpiCabacState; -+ -+#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels -+#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) -+#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte -+#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el -+#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row -+#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+ -+typedef struct HEVCRpiContext { -+ const AVClass *c; // needed by private avoptions -+ AVCodecContext *avctx; -+ -+ uint8_t threads_type; -+ -+ /** 1 if the independent slice segment header was successfully parsed */ -+ uint8_t slice_initialized; -+ char used_for_ref; // rpi -+ char is_irap; -+ char offload_recon; -+ uint8_t eos; ///< current packet contains an EOS/EOB NAL -+ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL -+ uint8_t no_backward_pred_flag; -+ uint8_t is_decoded; -+ uint8_t no_rasl_output_flag; -+ -+ -+ /** -+ * Sequence counters for decoded and output frames, so that old -+ * frames are output first after a POC reset -+ */ -+ uint16_t seq_decode; -+ uint16_t seq_output; -+ -+ int width; -+ int height; -+ -+ HEVCRpiJobCtl * jbc; -+ // cabac stash -+ // b0 skip flag -+ // b1+ ct_depth -+ uint8_t * cabac_stash_left; -+ uint8_t * cabac_stash_up; -+ -+ // Function pointers -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ const uint8_t * qpu_dummy_frame_emu; -+#endif -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory -+#endif -+ HEVCRpiQpu qpu; -+ -+ HEVCRpiFrameProgressState progress_states[2]; -+ -+ HEVCRpiCabacState *cabac_save; -+ -+ AVFrame *frame; -+ AVFrame *output_frame; -+ uint8_t *sao_pixel_buffer_h[3]; -+ uint8_t *sao_pixel_buffer_v[3]; -+ -+ unsigned int col_mvf_stride; -+ AVBufferPool *col_mvf_pool; -+ -+ RpiSAOParams *sao; -+ DBParams *deblock; -+ enum HEVCNALUnitType nal_unit_type; -+ int temporal_id; ///< temporal_id_plus1 - 1 -+ HEVCRpiFrame *ref; -+ int poc; -+ int pocTid0; -+ int slice_idx; ///< number of the slice being currently decoded -+ int max_ra; -+ -+ int8_t *qp_y_tab; -+ -+ // Deblocking block strength bitmaps -+ unsigned int bs_stride2; -+ unsigned int bs_size; -+ uint8_t *bs_horizontal; -+ uint8_t *bs_vertical; -+ uint8_t *bsf_stash_up; -+ uint8_t *bsf_stash_left; -+ -+#if HEVC_RPI_MAX_CTBS >= 0xffff -+#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0 -+ uint32_t *tab_slice_address; -+#else -+#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0 -+ uint16_t *tab_slice_address; -+#endif -+ -+ // Bitfield 1 bit per 8 pels (min pcm size) -+ uint8_t *is_pcm; -+ // Bitfield 1 bit per 8 pels (min cb size) -+ // Only needed for CIP as CIP processing is async to the main thread -+ uint8_t *is_intra; -+ -+ // PU -+ HEVCRpiMvField *mvf_up; -+ HEVCRpiMvField *mvf_left; -+ -+ const RefPicList **rpl_up; -+ const RefPicList **rpl_left; -+ RefPicList * 
refPicList; -+ -+ // CTB-level flags affecting loop filter operation -+ uint8_t *filter_slice_edges; -+ -+ /** used on BE to byteswap the lines for checksumming */ -+ uint8_t *checksum_buf; -+ int checksum_buf_size; -+ -+ atomic_int wpp_err; -+ -+ const uint8_t *data; -+ -+ H2645Packet pkt; -+ // type of the first VCL NAL of the current frame -+ enum HEVCNALUnitType first_nal_type; -+ -+ uint8_t context_initialized; -+ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated -+ ///< as a format defined in 14496-15 -+ int apply_defdispwin; -+ -+ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) -+ int nuh_layer_id; -+ -+ struct AVMD5 *md5_ctx; -+ -+ RefPicListTab * rpl_tab; -+ unsigned int rpl_tab_size; -+ -+ uint8_t *is_intra_store; -+ -+ RpiSliceHeader sh; -+ -+ HEVCRpiParamSets ps; -+ -+ HEVCRpiLocalContext *HEVClc; -+ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; -+ -+ HEVCRpiFrame DPB[HEVC_DPB_ELS]; -+ -+ ///< candidate references for the current frame -+ RefPicList rps[5]; -+ -+ HEVCRpiPredContext hpc; -+ HEVCDSPContext hevcdsp; -+ -+ HEVCSEIContext sei; -+ -+ // Put structures that allocate non-trivial storage at the end -+ // These are mostly used indirectly so position in the structure doesn't matter -+ HEVCRpiPassQueue passq[RPI_PASSES]; -+#if RPI_EXTRA_BIT_THREADS > 0 -+ int bt_started; -+ // This simply contains thread descriptors - task setup is held elsewhere -+ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; -+#endif -+#if RPI_TSTATS -+ HEVCRpiStats tstats; -+#endif -+} HEVCRpiContext; -+ -+/** -+ * Mark all frames in DPB as unused for reference. -+ */ -+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s); -+ -+/** -+ * Drop all frames currently in DPB. -+ */ -+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); -+ -+/** -+ * Construct the reference picture sets for the current frame. -+ */ -+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s); -+ -+/** -+ * Construct the reference picture list(s) for the current slice. -+ */ -+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); -+ -+ -+/** -+ * Get the number of candidate references for the current frame. -+ */ -+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s); -+ -+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); -+ -+/** -+ * Find next frame in output order and put a reference to it in frame. 
-+ * @return 1 if a frame was output, 0 otherwise -+ */ -+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush); -+ -+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); -+ -+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); -+ -+unsigned int ff_hevc_rpi_tb_avail_flags( -+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); -+ -+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, HEVCRpiMvField * const mv); -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ HEVCRpiMvField * const mv, -+ const unsigned int mvp_lx_flag, const unsigned int LX); -+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); -+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_trafo_size, const int is_coded_block); -+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot); -+ -+extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; -+extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; -+extern const uint8_t ff_hevc_rpi_qpel_extra[4]; -+ -+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); -+ -+// arm/hevc_misc_neon.S -+// Neon coeff zap fn -+#if HAVE_NEON -+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); -+#endif -+ -+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int val, const int field); -+ -+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); -+ -+// All of these expect that s->threads_type == FF_THREAD_FRAME -+ -+static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int y) -+{ -+ if (s->threads_type != 0) -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); -+} -+ -+static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) -+{ -+ if (s->used_for_ref && s->threads_type != 0) -+ ff_hevc_rpi_progress_signal_field(s, y, 1); -+} -+ -+static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int y) -+{ -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); -+} -+ -+static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) -+{ -+ if (s->used_for_ref && s->threads_type != 0) -+ { -+ ff_hevc_rpi_progress_signal_field(s, y, 0); -+ } -+} -+ -+static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) -+{ -+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); -+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); -+} -+ -+ -+// Set all done - signal nothing (used in missing refs) -+// Works for both rpi & non-rpi -+static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) -+{ -+ if (ref->tf.progress != NULL) -+ { -+ int * const p = (int 
*)ref->tf.progress->data; -+ p[0] = INT_MAX; -+ p[1] = INT_MAX; -+ } -+} -+ -+#define HEVC_RPI_420_ONLY 1 -+#define HEVC_RPI_SAND128_ONLY 1 -+ -+static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) -+{ -+#if HEVC_RPI_420_ONLY -+ return cidx == 0 ? 0 : 1; -+#else -+ return s->ps.sps->hshift[cidx]; -+#endif -+} -+ -+static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) -+{ -+#if HEVC_RPI_420_ONLY -+ return cidx == 0 ? 0 : 1; -+#else -+ return s->ps.sps->vshift[cidx]; -+#endif -+} -+ -+static inline int ctx_cfmt(const HEVCRpiContext * const s) -+{ -+#if HEVC_RPI_420_ONLY -+ return 1; -+#else -+ return s->ps.sps->chroma_format_idc; -+#endif -+} -+ -+static inline int frame_stride1(const AVFrame * const frame, const int c_idx) -+{ -+#if HEVC_RPI_SAND128_ONLY -+ return 128; -+#else -+ return frame->linesize[c_idx]; -+#endif -+} -+ -+#if HEVC_RPI_SAND128_ONLY -+// Propagate this decision to later zc includes -+#define RPI_ZC_SAND128_ONLY 1 -+#endif -+ -+#ifndef ff_hevc_rpi_copy_vert -+static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ int i; -+ switch (pixel_shift) -+ { -+ case 2: -+ for (i = 0; i < height; i++) { -+ *(uint32_t *)dst = *(uint32_t *)src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ case 1: -+ for (i = 0; i < height; i++) { -+ *(uint16_t *)dst = *(uint16_t *)src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ default: -+ for (i = 0; i < height; i++) { -+ *dst = *src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ } -+} -+#endif -+ -+ -+#if MVF_STASH_WIDTH == 64 -+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)); -+} -+ -+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ const unsigned int y0_ctb = y0 & mask_cs_hi; -+ -+ return (HEVCRpiMvField *)((y < y0_ctb) ? -+ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) : -+ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) : -+ lc->mvf_stash + -+ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + -+ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE))); -+} -+ -+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, -+ const unsigned int x0, -+ const unsigned int x) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ return x < x0_ctb ? 
1 : MVF_STASH_WIDTH_PU; -+} -+ -+#else -+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))); -+} -+ -+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ const unsigned int y0_ctb = y0 & mask_cs_hi; -+ -+ // If not in the same CTB for Y assume up -+ if (y < y0_ctb) { -+ // If not in the same CTB for X too assume up-left -+ return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)); -+ } -+ return mvf_stash_ptr(s, lc, x, y); -+} -+ -+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, -+ const unsigned int x0, -+ const unsigned int x) -+{ -+ return MVF_STASH_WIDTH_PU; -+} -+#endif -+ -+#endif /* AVCODEC_RPI_HEVCDEC_H */ -diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c -new file mode 100644 -index 0000000000..ac29789e7f ---- /dev/null -+++ b/libavcodec/rpi_hevcdsp.c -@@ -0,0 +1,450 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere -+ * -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcdsp.h" -+#include "rpi_hevc_mv.h" -+ -+static const int8_t transform[32][32] = { -+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, -+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, -+ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -+ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, -+ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -+ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, -+ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, -+ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, -+ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, -+ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, -+ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -+ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, -+ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -+ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, -+ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, -+ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, -+ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, -+ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, -+ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -+ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, -+ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -+ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, -+ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, -+ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, -+ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, -+ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, -+ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -+ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, -+ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -+ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, -+ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, -+ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, -+ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, -+ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, -+ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -+ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, -+ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -+ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, -+ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, -+ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, -+ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, -+ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, -+ { 46, -90, 38, 54, -90, 31, 61, -88, 
22, 67, -85, 13, 73, -82, 4, 78, -+ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, -+ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -+ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, -+ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, -+ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, -+ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, -+ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, -+ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -+ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, -+ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -+ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, -+ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, -+ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, -+ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, -+ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, -+ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -+ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, -+ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -+ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, -+ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, -+ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }, -+}; -+ -+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { -+ { -2, 58, 10, -2}, -+ { -4, 54, 16, -2}, -+ { -6, 46, 28, -4}, -+ { -4, 36, 36, -4}, -+ { -4, 28, 46, -6}, -+ { -2, 16, 54, -4}, -+ { -2, 10, 58, -2}, -+}; -+ -+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { -+ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0}, -+ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1}, -+ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1} -+}; -+ -+#define BIT_DEPTH 8 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, -+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc0, int in_inc1) -+{ -+ int shift = 32; -+ uint32_t bs = 0; -+ for (; pus > 0; pus--) { -+ int strength, out; -+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; -+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; -+ int nr_idx0 = neigh->ref_idx[0]; -+ int nr_idx1 = neigh->ref_idx[1]; -+ int neigh_refL0 = neigh_rpl0[nr_idx0]; -+ int neigh_refL1 = neigh_rpl1[nr_idx1]; -+ -+ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31); -+ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31); -+ -+#if 1 // This more directly matches the original implementation -+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { -+ // same L0 and L1 -+ if (curr_refL0 == neigh_refL0 && -+ curr_refL0 == curr_refL1 && -+ neigh_refL0 == neigh_refL1) { -+ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || -+ 
FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) && -+ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)) -+ strength = 1; -+ else -+ strength = 0; -+ } else if (neigh_refL0 == curr_refL0 && -+ neigh_refL1 == curr_refL1) { -+ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else if (neigh_refL1 == curr_refL0 && -+ neigh_refL0 == curr_refL1) { -+ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else { -+ strength = 1; -+ } -+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV -+ MvXY curr_mv0, neigh_mv0; -+ -+ if (curr->pred_flag & 1) { -+ curr_mv0 = curr->xy[0]; -+ } else { -+ curr_mv0 = curr->xy[1]; -+ curr_refL0 = curr_refL1; -+ } -+ -+ if (neigh->pred_flag & 1) { -+ neigh_mv0 = neigh->xy[0]; -+ } else { -+ neigh_mv0 = neigh->xy[1]; -+ neigh_refL0 = neigh_refL1; -+ } -+ -+ if (curr_refL0 == neigh_refL0) { -+ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else -+ strength = 1; -+ } else -+ strength = 1; -+#else // This has exactly the same effect, but is more suitable for vectorisation -+ MvXY curr_mv[2]; -+ MvXY neigh_mv[2]; -+ memcpy(curr_mv, curr->xy, sizeof curr_mv); -+ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv); -+ -+ if (!(curr->pred_flag & 2)) { -+ curr_mv[1] = curr_mv[0]; -+ curr_refL1 = curr_refL0; -+ } -+ if (!(neigh->pred_flag & 2)) { -+ neigh_mv[1] = neigh_mv[0]; -+ neigh_refL1 = neigh_refL0; -+ } -+ if (!(curr->pred_flag & 1)) { -+ curr_mv[0] = curr_mv[1]; -+ curr_refL0 = curr_refL1; -+ } -+ if (!(neigh->pred_flag & 1)) { -+ neigh_mv[0] = neigh_mv[1]; -+ neigh_refL0 = neigh_refL1; -+ } -+ -+ strength = 1; -+ -+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | -+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) | -+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4); -+ -+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | -+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) | -+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4); -+ -+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); -+#endif -+ -+ curr += in_inc0 / sizeof (HEVCRpiMvField); -+ neigh += in_inc1 / sizeof (HEVCRpiMvField); -+ -+ for (out = dup; out > 0; out--) -+ { -+ bs = (bs >> 2) | (strength << 30); -+ shift -= 2; -+ } -+ } -+ return bs >> shift; -+} -+ -+ -+static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) -+{ -+ unsigned int i, j; -+ -+ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; 
j+=8) -+ AV_COPY64U(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } else { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; j+=16) -+ AV_COPY128(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } -+} -+ -+ -+ -+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) -+{ -+#undef FUNC -+#define FUNC(a, depth) a ## _ ## depth -+ -+#undef PEL_FUNC -+#define PEL_FUNC(dst1, idx1, idx2, a, depth) \ -+ for(i = 0 ; i < 10 ; i++) \ -+{ \ -+ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \ -+} -+ -+#undef EPEL_FUNCS -+#define EPEL_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \ -+ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \ -+ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth) -+ -+#undef EPEL_UNI_FUNCS -+#define EPEL_UNI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth) -+ -+#undef EPEL_BI_FUNCS -+#define EPEL_BI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth) -+ -+#undef QPEL_FUNCS -+#define QPEL_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \ -+ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \ -+ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth) -+ -+#undef QPEL_UNI_FUNCS -+#define QPEL_UNI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth) -+ -+#undef QPEL_BI_FUNCS -+#define QPEL_BI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ -+ 
PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) -+ -+#define SLICED_ADD_RESIDUAL(depth)\ -+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ -+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ -+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ -+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ -+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ -+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ -+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ -+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ -+ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ -+ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ -+ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ -+ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ -+ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) -+#define SLICED_LOOP_FILTERS(depth)\ -+ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ -+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#define SLICED_SAO(depth)\ -+ for (i = 0; i != SAO_FILTER_N; ++i) { \ -+ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ -+ } \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) -+ -+#define HEVC_DSP(depth) \ -+ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ -+ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ -+ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ -+ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ -+ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ -+ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ -+ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ -+ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ -+ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ -+ SLICED_ADD_RESIDUAL(depth); \ -+ hevcdsp->dequant = FUNC(dequant, depth); \ -+ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ -+ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ -+ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ -+ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ -+ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ -+ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \ -+ \ -+ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \ -+ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \ -+ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ -+ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ -+ \ -+ for (i = 0; i != SAO_FILTER_N; ++i) { \ -+ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ -+ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ -+ } \ -+ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ -+ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ -+ SLICED_SAO(depth); \ -+ \ -+ QPEL_FUNCS(depth); 
\ -+ QPEL_UNI_FUNCS(depth); \ -+ QPEL_BI_FUNCS(depth); \ -+ EPEL_FUNCS(depth); \ -+ EPEL_UNI_FUNCS(depth); \ -+ EPEL_BI_FUNCS(depth); \ -+ \ -+ SLICED_LOOP_FILTERS(depth); \ -+ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ -+ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \ -+ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth) -+int i = 0; -+ -+ switch (bit_depth) { -+ case 9: -+ HEVC_DSP(9); -+ break; -+ case 10: -+ HEVC_DSP(10); -+ break; -+ case 12: -+ HEVC_DSP(12); -+ break; -+ default: -+ HEVC_DSP(8); -+ break; -+ } -+ -+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; -+ hevcdsp->cpy_blk = cpy_blk; -+ -+ if (ARCH_PPC) -+ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); -+ if (ARCH_X86) -+ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth); -+ if (ARCH_ARM) -+ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth); -+ if (ARCH_MIPS) -+ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth); -+} -diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h -new file mode 100644 -index 0000000000..5a7cdeeb66 ---- /dev/null -+++ b/libavcodec/rpi_hevcdsp.h -@@ -0,0 +1,177 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere -+ * -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVCDSP_H -+#define AVCODEC_RPI_HEVCDSP_H -+ -+#include "hevc.h" -+#include "get_bits.h" -+ -+struct HEVCRpiMvField; -+ -+#define MAX_PB_SIZE 64 -+ -+#define RPI_HEVC_SAO_BUF_STRIDE 160 -+ -+ -+typedef struct RpiSAOParams { -+ uint8_t band_position[3]; ///< sao_band_position (Y,U,V) -+ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V) -+ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V) -+ -+ int16_t offset_val[3][5]; ///> 16; -+ const int dc_u = (dc << 16) >> 16; -+ -+ stride /= sizeof(pixel); -+ -+ for (y = 0; y < size; y++) { -+ for (x = 0; x < size * 2; x += 2) { -+ dst[x] = av_clip_pixel(dst[x] + dc_u); -+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); -+ } -+ dst += stride; -+ } -+} -+ -+ -+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual)(_dst, res, stride, 4); -+} -+ -+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual)(_dst, res, stride, 8); -+} -+ -+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual)(_dst, res, stride, 16); -+} -+ -+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual)(_dst, res, stride, 32); -+} -+ -+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) -+{ -+ FUNC(add_residual_dc)(_dst, stride, dc, 4); -+} -+ -+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) -+{ -+ FUNC(add_residual_dc)(_dst, stride, dc, 8); -+} -+ -+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) -+{ -+ FUNC(add_residual_dc)(_dst, stride, dc, 16); -+} -+ -+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) -+{ -+ FUNC(add_residual_dc)(_dst, stride, dc, 32); -+} -+ -+// -- U -- (plaited) -+ -+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_u) -+{ -+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); -+} -+ -+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_u) -+{ -+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); -+} -+ -+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_u) -+{ -+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); -+} -+ -+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_u) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+// -- V -- (plaited) -+ -+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); -+} -+ -+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); -+} -+ -+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); -+} -+ -+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); 
-+} -+ -+// -- C -- (plaited - both U & V) -+ -+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 4); -+} -+ -+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 8); -+} -+ -+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 16); -+} -+ -+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); -+} -+ -+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); -+} -+ -+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); -+} -+ -+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+ -+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) -+{ -+ int16_t *coeffs = (int16_t *) _coeffs; -+ int x, y; -+ int size = 1 << log2_size; -+ -+ if (mode) { -+ coeffs += size; -+ for (y = 0; y < size - 1; y++) { -+ for (x = 0; x < size; x++) -+ coeffs[x] += coeffs[x - size]; -+ coeffs += size; -+ } -+ } else { -+ for (y = 0; y < size; y++) { -+ for (x = 1; x < size; x++) -+ coeffs[x] += coeffs[x - 1]; -+ coeffs += size; -+ } -+ } -+} -+ -+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) -+{ -+ int shift = 15 - BIT_DEPTH - log2_size; -+ int x, y; -+ int size = 1 << log2_size; -+ -+ if (shift > 0) { -+ int offset = 1 << (shift - 1); -+ for (y = 0; y < size; y++) { -+ for (x = 0; x < size; x++) { -+ *coeffs = (*coeffs + offset) >> shift; -+ coeffs++; -+ } -+ } -+ } else { -+ for (y = 0; y < size; y++) { -+ for (x = 0; x < size; x++) { -+ *coeffs = *coeffs << -shift; -+ coeffs++; -+ } -+ } -+ } -+} -+ -+#define SET(dst, x) (dst) = (x) -+#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift) -+ -+#define TR_4x4_LUMA(dst, src, step, assign) \ -+ do { \ -+ int c0 = src[0 * step] + src[2 * step]; \ -+ int c1 = src[2 * step] + src[3 * step]; \ -+ int c2 = src[0 * step] - src[3 * step]; \ -+ int c3 = 74 * src[1 * step]; \ -+ \ -+ assign(dst[2 * step], 74 * (src[0 * step] - \ -+ src[2 * step] + \ -+ src[3 * step])); \ -+ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \ -+ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \ -+ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ -+ } while (0) -+ -+static void FUNC(transform_4x4_luma)(int16_t *coeffs) -+{ -+ int i; -+ int shift = 7; -+ int add = 1 << (shift - 1); -+ int16_t *src = coeffs; -+ -+ for (i = 0; i < 4; i++) { -+ TR_4x4_LUMA(src, src, 4, SCALE); -+ src++; -+ } -+ -+ shift = 20 - BIT_DEPTH; -+ add = 1 << (shift - 1); -+ for (i = 0; i < 4; i++) { -+ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); -+ coeffs += 4; -+ } -+} -+ -+#undef TR_4x4_LUMA -+ -+#define TR_4(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ -+ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ -+ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ -+ 
const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ -+ \ -+ assign(dst[0 * dstep], e0 + o0); \ -+ assign(dst[1 * dstep], e1 + o1); \ -+ assign(dst[2 * dstep], e1 - o1); \ -+ assign(dst[3 * dstep], e0 - o0); \ -+ } while (0) -+ -+#define TR_8(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_8[4]; \ -+ int o_8[4] = { 0 }; \ -+ for (i = 0; i < 4; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_8[i] += transform[4 * j][i] * src[j * sstep]; \ -+ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ -+ \ -+ for (i = 0; i < 4; i++) { \ -+ assign(dst[i * dstep], e_8[i] + o_8[i]); \ -+ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \ -+ } \ -+ } while (0) -+ -+#define TR_16(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_16[8]; \ -+ int o_16[8] = { 0 }; \ -+ for (i = 0; i < 8; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_16[i] += transform[2 * j][i] * src[j * sstep]; \ -+ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ -+ \ -+ for (i = 0; i < 8; i++) { \ -+ assign(dst[i * dstep], e_16[i] + o_16[i]); \ -+ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ -+ } \ -+ } while (0) -+ -+#define TR_32(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_32[16]; \ -+ int o_32[16] = { 0 }; \ -+ for (i = 0; i < 16; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_32[i] += transform[j][i] * src[j * sstep]; \ -+ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ -+ \ -+ for (i = 0; i < 16; i++) { \ -+ assign(dst[i * dstep], e_32[i] + o_32[i]); \ -+ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ -+ } \ -+ } while (0) -+ -+#define IDCT_VAR4(H) \ -+ int limit2 = FFMIN(col_limit + 4, H) -+#define IDCT_VAR8(H) \ -+ int limit = FFMIN(col_limit, H); \ -+ int limit2 = FFMIN(col_limit + 4, H) -+#define IDCT_VAR16(H) IDCT_VAR8(H) -+#define IDCT_VAR32(H) IDCT_VAR8(H) -+ -+#define IDCT(H) \ -+static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ -+ int col_limit) \ -+{ \ -+ int i; \ -+ int shift = 7; \ -+ int add = 1 << (shift - 1); \ -+ int16_t *src = coeffs; \ -+ IDCT_VAR ## H(H); \ -+ \ -+ for (i = 0; i < H; i++) { \ -+ TR_ ## H(src, src, H, H, SCALE, limit2); \ -+ if (limit2 < H && i%4 == 0 && !!i) \ -+ limit2 -= 4; \ -+ src++; \ -+ } \ -+ \ -+ shift = 20 - BIT_DEPTH; \ -+ add = 1 << (shift - 1); \ -+ for (i = 0; i < H; i++) { \ -+ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ -+ coeffs += H; \ -+ } \ -+} -+ -+#define IDCT_DC(H) \ -+static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \ -+{ \ -+ int i, j; \ -+ int shift = 14 - BIT_DEPTH; \ -+ int add = 1 << (shift - 1); \ -+ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \ -+ \ -+ for (j = 0; j < H; j++) { \ -+ for (i = 0; i < H; i++) { \ -+ coeffs[i + j * H] = coeff; \ -+ } \ -+ } \ -+} -+ -+IDCT( 4) -+IDCT( 8) -+IDCT(16) -+IDCT(32) -+ -+IDCT_DC( 4) -+IDCT_DC( 8) -+IDCT_DC(16) -+IDCT_DC(32) -+ -+#undef TR_4 -+#undef TR_8 -+#undef TR_16 -+#undef TR_32 -+ -+#undef SET -+#undef SCALE -+ -+static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, -+ int width, int height) -+{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int offset_table[32] = { 0 }; -+ int k, y, x; -+ int shift = BIT_DEPTH - 5; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] 
>> shift]); -+ dst += stride_dst; -+ src += stride_src; -+ } -+} -+ -+#define CMP(a, b) (((a) > (b)) - ((a) < (b))) -+ -+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, -+ int eo, int width, int height) { -+ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ int diff0 = CMP(src[x], src[x + a_stride]); -+ int diff1 = CMP(src[x], src[x + b_stride]); -+ int offset_val = edge_idx[2 + diff0 + diff1]; -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+} -+ -+ -+#if BIT_DEPTH == 10 -+// We need a 32 bit variation for the _c restores so hijack bit depth 10 -+#undef pixel -+#undef BIT_DEPTH -+#define pixel uint32_t -+#define BIT_DEPTH 32 -+// All 16 bit variations are the same -+#define sao_edge_restore_0_10 sao_edge_restore_0_9 -+#define sao_edge_restore_1_10 sao_edge_restore_1_9 -+#define sao_edge_restore_0_11 sao_edge_restore_0_9 -+#define sao_edge_restore_1_11 sao_edge_restore_1_9 -+#define sao_edge_restore_0_12 sao_edge_restore_0_9 -+#define sao_edge_restore_1_12 sao_edge_restore_1_9 -+#define sao_edge_restore_0_13 sao_edge_restore_0_9 -+#define sao_edge_restore_1_13 sao_edge_restore_1_9 -+#define sao_edge_restore_0_14 sao_edge_restore_0_9 -+#define sao_edge_restore_1_14 sao_edge_restore_1_9 -+#define sao_edge_restore_0_15 sao_edge_restore_0_9 -+#define sao_edge_restore_1_15 sao_edge_restore_1_9 -+#define sao_edge_restore_0_16 sao_edge_restore_0_9 -+#define sao_edge_restore_1_16 sao_edge_restore_1_9 -+#endif -+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 -+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ int x, y; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int sao_eo_class = sao->eo_class[c_idx]; -+ int init_x = 0, width = _width, height = _height; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ if (sao_eo_class != SAO_EO_VERT) { -+ if (borders[0]) { -+ for (y = 0; y < height; y++) { -+ dst[y * stride_dst] = src[y * stride_src]; -+ } -+ init_x = 1; -+ } -+ if (borders[2]) { -+ int offset = width - 1; -+ for (x = 0; x < height; x++) { -+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; -+ } -+ width--; -+ } -+ } -+ if (sao_eo_class != SAO_EO_HORIZ) { -+ if (borders[1]) { -+ for (x = init_x; x < width; x++) -+ dst[x] = src[x]; -+ } -+ if (borders[3]) { -+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); -+ ptrdiff_t y_stride_src = stride_src * (height - 1); -+ for (x = init_x; x < width; x++) -+ dst[x + y_stride_dst] = src[x + y_stride_src]; -+ height--; -+ } -+ } -+} -+ -+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, 
-+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ int x, y; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int sao_eo_class = sao->eo_class[c_idx]; -+ int init_x = 0, init_y = 0, width = _width, height = _height; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ if (sao_eo_class != SAO_EO_VERT) { -+ if (borders[0]) { -+ for (y = 0; y < height; y++) { -+ dst[y * stride_dst] = src[y * stride_src]; -+ } -+ init_x = 1; -+ } -+ if (borders[2]) { -+ int offset = width - 1; -+ for (x = 0; x < height; x++) { -+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; -+ } -+ width--; -+ } -+ } -+ if (sao_eo_class != SAO_EO_HORIZ) { -+ if (borders[1]) { -+ for (x = init_x; x < width; x++) -+ dst[x] = src[x]; -+ init_y = 1; -+ } -+ if (borders[3]) { -+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); -+ ptrdiff_t y_stride_src = stride_src * (height - 1); -+ for (x = init_x; x < width; x++) -+ dst[x + y_stride_dst] = src[x + y_stride_src]; -+ height--; -+ } -+ } -+ -+ { -+ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1]; -+ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2]; -+ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3]; -+ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3]; -+ -+ // Restore pixels that can't be modified -+ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) { -+ for(y = init_y+save_upper_left; y< height-save_lower_left; y++) -+ dst[y*stride_dst] = src[y*stride_src]; -+ } -+ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) { -+ for(y = init_y+save_upper_right; y< height-save_lower_right; y++) -+ dst[y*stride_dst+width-1] = src[y*stride_src+width-1]; -+ } -+ -+ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) { -+ for(x = init_x+save_upper_left; x < width-save_upper_right; x++) -+ dst[x] = src[x]; -+ } -+ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) { -+ for(x = init_x+save_lower_left; x < width-save_lower_right; x++) -+ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x]; -+ } -+ if(diag_edge[0] && sao_eo_class == SAO_EO_135D) -+ dst[0] = src[0]; -+ if(diag_edge[1] && sao_eo_class == SAO_EO_45D) -+ dst[width-1] = src[width-1]; -+ if(diag_edge[2] && sao_eo_class == SAO_EO_135D) -+ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1]; -+ if(diag_edge[3] && sao_eo_class == SAO_EO_45D) -+ dst[stride_dst*(height-1)] = src[stride_src*(height-1)]; -+ -+ } -+} -+#endif -+#if BIT_DEPTH == 32 -+#undef BIT_DEPTH -+#undef pixel -+#define BIT_DEPTH 10 -+#define pixel uint16_t -+#endif -+ -+// --- Plaited chroma versions -+ -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int offset_table_u[32] = { 0 }; -+ int offset_table_v[32] = { 0 }; -+ int k, y, x; -+ int shift = BIT_DEPTH - 5; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ width *= 2; -+ -+ for (k = 0; k < 4; k++) -+ { -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; -+ } -+ for (y = 0; y < height; y++) 
{ -+ for (x = 0; x < width; x += 2) -+ { -+// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); -+// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); -+ // *** & 31 shouldn't be wanted but just now we generate broken input that -+ // crashes us in 10-bit world -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ } -+} -+ -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); -+ -+ stride_dst /= sizeof(pixel); -+ width *= 2; -+ -+ av_assert0(width <= 64); -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int offset_valu = edge_idx[2 + diff0u + diff1u]; -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ int offset_valv = edge_idx[2 + diff0v + diff1v]; -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+} -+ -+// Do once -+#if BIT_DEPTH == 8 -+// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 -+// We need 32 bit for 9 bit+ -+#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 -+#endif -+ -+#undef CMP -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+static void FUNC(put_hevc_pel_pixels)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ -+ for (y = 0; y < height; y++) { -+ 
for (x = 0; x < width; x++) -+ dst[x] = src[x] << (14 - BIT_DEPTH); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ for (y = 0; y < height; y++) { -+ memcpy(dst, src, width * sizeof(pixel)); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); -+ } -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+#define QPEL_FILTER(src, stride) \ -+ (filter[0] * src[x - 3 * stride] + \ -+ filter[1] * src[x - 2 * stride] + \ -+ filter[2] * src[x - stride] + \ -+ filter[3] * src[x ] + \ -+ filter[4] * src[x + stride] + \ -+ filter[5] * src[x + 2 * stride] + \ -+ filter[6] * src[x + 3 * stride] + \ -+ filter[7] * src[x + 4 * stride]) -+ -+static void FUNC(put_hevc_qpel_h)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t 
_srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_v)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_hv)(int16_t *dst, -+ uint8_t *_src, -+ ptrdiff_t _srcstride, -+ int height, intptr_t mx, -+ intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; -+ tmp += MAX_PB_SIZE; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = 
(pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+ -+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ for (y = 0; y < 
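
// [Editorial sketch -- not part of the patch] Each *_hv variant below runs
// two separable 1-D passes: a horizontal pass into a 16-bit scratch buffer,
// executed for height + QPEL_EXTRA rows so the vertical taps have their
// 3-before/4-after context, then a vertical pass over the scratch with a
// fixed >> 6. This sketch assumes the upstream constants (QPEL_EXTRA_BEFORE
// 3, QPEL_EXTRA 7, MAX_PB_SIZE 64), 8-bit pixels and width/height <= 64.
#include <stddef.h>
#include <stdint.h>

static void qpel_uni_hv_sketch(uint8_t *dst, ptrdiff_t dststride,
                               const uint8_t *src, ptrdiff_t srcstride,
                               int width, int height)
{
    static const int8_t taps[8] = { -1, 4, -11, 40, 40, -11, 4, -1 };
    enum { B = 3, EXTRA = 7, TS = 64 };
    int16_t tmp[(TS + EXTRA) * TS];

    src -= B * srcstride;  // back up so the vertical taps see their context

    // pass 1: horizontal filter into the scratch, height + 7 rows
    for (int y = 0; y < height + EXTRA; y++)
        for (int x = 0; x < width; x++) {
            int s = 0;
            for (int t = 0; t < 8; t++)
                s += taps[t] * src[y * srcstride + x + t - 3];
            tmp[y * TS + x] = (int16_t)s;
        }

    // pass 2: vertical filter; >> 6 sheds the first-pass gain, then
    // (+ 32, >> 6) is the 8-bit uni-prediction rounding seen above
    for (int y = 0; y < height; y++)
        for (int x = 0; x < width; x++) {
            int s = 0;
            for (int t = 0; t < 8; t++)
                s += taps[t] * tmp[(y + t) * TS + x];
            int v = ((s >> 6) + 32) >> 6;
            dst[y * dststride + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
}
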
height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = 
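
// [Editorial sketch -- not part of the patch] The *_w_* functions in this
// stretch implement HEVC explicit weighted prediction. For the bi-directional
// case the per-sample combining step is the following (v0/v1 are the two
// 14-bit intermediates: src2[x] and the filtered source):
#include <stdint.h>

static uint8_t weighted_bi_sketch(int v0, int v1, int w0, int w1,
                                  int o0, int o1, int denom)
{
    // 8-bit: shift = 14 + 1 - 8 = 7, so log2Wd = denom + 6
    int log2Wd = denom + 6;
    int v = (v1 * w1 + v0 * w0 + ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1);
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}
// Consistency check: with w0 == w1 == 1, o0 == o1 == 0 and denom == 0 this
// collapses to (v0 + v1 + 64) >> 7, exactly the unweighted rounding used by
// put_hevc_pel_bi_pixels earlier in the file.
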
av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+#define EPEL_FILTER(src, stride) \ -+ (filter[0] * src[x - stride] + \ -+ filter[1] * src[x] + \ -+ filter[2] * src[x + stride] + \ -+ filter[3] * src[x + 2 * stride]) -+ -+static void FUNC(put_hevc_epel_h)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = 
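
// [Editorial sketch -- not part of the patch] EPEL_FILTER above is the 4-tap
// chroma counterpart of QPEL_FILTER, reading src[x - stride .. x + 2*stride].
// One 8-bit horizontal row with the half-sample taps hard-coded (the patch
// indexes ff_hevc_rpi_epel_filters[mx - 1] for fractions 1..7):
#include <stdint.h>

static void epel_h_row_sketch(int16_t *dst, const uint8_t *src, int width)
{
    static const int8_t taps[4] = { -4, 36, 36, -4 };  // chroma half-sample
    // caller must supply 1 pel of margin on the left and 2 on the right
    for (int x = 0; x < width; x++)
        dst[x] = (int16_t)(taps[0] * src[x - 1] + taps[1] * src[x] +
                           taps[2] * src[x + 1] + taps[3] * src[x + 2]);
}
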
EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_v)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_hv)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; -+ tmp += MAX_PB_SIZE; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ } -+ dst += dststride; -+ src += srcstride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 
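
// [Editorial note] Throughout this template the output stages round with
// offset = 1 << (shift - 1) under an "#if BIT_DEPTH < 14" guard: in the
// unweighted uni paths shift is 14 - BIT_DEPTH, which reaches 0 at 14-bit
// depth, where 1 << (shift - 1) would be undefined behaviour, so the bias is
// dropped entirely for depths >= 14. A helper making the rule explicit
// (an editorial restatement, not code from the patch):
static inline int round_shift_sketch(int v, int shift)
{
    // round-to-nearest right shift; a zero shift must add no bias
    return shift > 0 ? (v + (1 << (shift - 1))) >> shift : v;
}
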
0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ dst += dststride; -+ src += srcstride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, 
int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ } -+ dst += dststride; -+ src += srcstride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ } -+ dst += dststride; -+ src += srcstride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, 
intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+// line zero -+#define P3 pix[-4 * xstride] -+#define P2 pix[-3 * xstride] -+#define P1 pix[-2 * xstride] -+#define P0 pix[-1 * xstride] -+#define Q0 pix[0 * xstride] -+#define Q1 pix[1 * xstride] -+#define Q2 pix[2 * xstride] -+#define Q3 pix[3 * xstride] -+ -+// line three. 
used only for deblocking decision -+#define TP3 pix[-4 * xstride + 3 * ystride] -+#define TP2 pix[-3 * xstride + 3 * ystride] -+#define TP1 pix[-2 * xstride + 3 * ystride] -+#define TP0 pix[-1 * xstride + 3 * ystride] -+#define TQ0 pix[0 * xstride + 3 * ystride] -+#define TQ1 pix[1 * xstride + 3 * ystride] -+#define TQ2 pix[2 * xstride + 3 * ystride] -+#define TQ3 pix[3 * xstride + 3 * ystride] -+ -+static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, -+ ptrdiff_t _xstride, ptrdiff_t _ystride, -+ int beta, int *_tc, -+ uint8_t *_no_p, uint8_t *_no_q) -+{ -+ int d, j; -+ pixel *pix = (pixel *)_pix; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / sizeof(pixel); -+ -+ beta <<= BIT_DEPTH - 8; -+ -+ for (j = 0; j < 2; j++) { -+ const int dp0 = abs(P2 - 2 * P1 + P0); -+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); -+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); -+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); -+ const int d0 = dp0 + dq0; -+ const int d3 = dp3 + dq3; -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ const int no_p = _no_p[j]; -+ const int no_q = _no_q[j]; -+ -+ if (d0 + d3 >= beta) { -+ pix += 4 * ystride; -+ continue; -+ } else { -+ const int beta_3 = beta >> 3; -+ const int beta_2 = beta >> 2; -+ const int tc25 = ((tc * 5 + 1) >> 1); -+ -+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && -+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && -+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { -+ // strong filtering -+ const int tc2 = tc << 1; -+ for (d = 0; d < 4; d++) { -+ const int p3 = P3; -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ const int q3 = Q3; -+ if (!no_p) { -+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); -+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); -+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); -+ } -+ if (!no_q) { -+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); -+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); -+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); -+ } -+ pix += ystride; -+ } -+ } else { // normal filtering -+ int nd_p = 1; -+ int nd_q = 1; -+ const int tc_2 = tc >> 1; -+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) -+ nd_p = 2; -+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) -+ nd_q = 2; -+ -+ for (d = 0; d < 4; d++) { -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; -+ if (abs(delta0) < 10 * tc) { -+ delta0 = av_clip(delta0, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ if (!no_p && nd_p > 1) { -+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); -+ P1 = av_clip_pixel(p1 + deltap1); -+ } -+ if (!no_q && nd_q > 1) { -+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); -+ Q1 = av_clip_pixel(q1 + deltaq1); -+ } -+ } -+ pix += ystride; -+ } -+ } -+ } -+ } -+} -+ -+static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride, -+ ptrdiff_t _ystride, int *_tc, -+ uint8_t *_no_p, uint8_t *_no_q) -+{ -+ int d, j, no_p, no_q; -+ pixel *pix = (pixel *)_pix; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / 
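
// [Editorial sketch -- not part of the patch] hevc_loop_filter_luma above
// decides once per 4-row segment: skip when d0 + d3 >= beta, otherwise pick
// strong or normal filtering. The strong-filter gate for a single line,
// taking the absolute differences the code has already computed:
static int strong_gate_sketch(int dpq,             // dp + dq for this line
                              int p3p0, int q3q0,  // |P3-P0|, |Q3-Q0|
                              int p0q0,            // |P0-Q0|
                              int beta, int tc)
{
    int tc25 = (tc * 5 + 1) >> 1;
    return p3p0 + q3q0 < (beta >> 3) &&
           p0q0 < tc25 &&
           (dpq << 1) < (beta >> 2);
}
// Both line 0 and line 3 of the segment must pass this gate for the strong
// path to be taken; note beta and tc are pre-scaled by << (BIT_DEPTH - 8).
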
sizeof(pixel); -+ -+ for (j = 0; j < 2; j++) { -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ if (tc <= 0) { -+ pix += 4 * ystride; -+ continue; -+ } -+ no_p = _no_p[j]; -+ no_q = _no_q[j]; -+ -+ for (d = 0; d < 4; d++) { -+ int delta0; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ pix += ystride; -+ } -+ } -+} -+ -+static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, -+ int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, -+ int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, -+ int beta, int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), -+ beta, tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, -+ int beta, int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, -+ beta, tc, no_p, no_q); -+} -+ -+#undef P3 -+#undef P2 -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+#undef Q2 -+#undef Q3 -+ -+#undef TP3 -+#undef TP2 -+#undef TP1 -+#undef TP0 -+#undef TQ0 -+#undef TQ1 -+#undef TQ2 -+#undef TQ3 -+ -+// line zero -+#define P3 pix_l[0 * xstride] -+#define P2 pix_l[1 * xstride] -+#define P1 pix_l[2 * xstride] -+#define P0 pix_l[3 * xstride] -+#define Q0 pix_r[0 * xstride] -+#define Q1 pix_r[1 * xstride] -+#define Q2 pix_r[2 * xstride] -+#define Q3 pix_r[3 * xstride] -+ -+// line three. 
used only for deblocking decision -+#define TP3 pix_l[0 * xstride + 3 * ystride] -+#define TP2 pix_l[1 * xstride + 3 * ystride] -+#define TP1 pix_l[2 * xstride + 3 * ystride] -+#define TP0 pix_l[3 * xstride + 3 * ystride] -+#define TQ0 pix_r[0 * xstride + 3 * ystride] -+#define TQ1 pix_r[1 * xstride + 3 * ystride] -+#define TQ2 pix_r[2 * xstride + 3 * ystride] -+#define TQ3 pix_r[3 * xstride + 3 * ystride] -+ -+// This is identical to hevc_loop_filter_luma except that the P/Q -+// components are on separate pointers -+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, -+ uint8_t * _pix_l) -+{ -+ int d, j; -+ pixel *pix_l = (pixel *)_pix_l; -+ pixel *pix_r = (pixel *)_pix_r; -+ const ptrdiff_t xstride = 1; -+ const ptrdiff_t ystride = _stride / sizeof(pixel); -+ -+ beta <<= BIT_DEPTH - 8; -+ -+ for (j = 0; j < 2; j++) { -+ const int dp0 = abs(P2 - 2 * P1 + P0); -+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); -+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); -+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); -+ const int d0 = dp0 + dq0; -+ const int d3 = dp3 + dq3; -+ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); -+ const int no_p = no_f & 1; -+ const int no_q = no_f & 2; -+ -+ if (d0 + d3 >= beta) { -+ pix_l += 4 * ystride; -+ pix_r += 4 * ystride; -+ continue; -+ } else { -+ const int beta_3 = beta >> 3; -+ const int beta_2 = beta >> 2; -+ const int tc25 = ((tc * 5 + 1) >> 1); -+ -+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && -+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && -+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { -+ // strong filtering -+ const int tc2 = tc << 1; -+ for (d = 0; d < 4; d++) { -+ const int p3 = P3; -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ const int q3 = Q3; -+ if (!no_p) { -+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); -+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); -+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); -+ } -+ if (!no_q) { -+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); -+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); -+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); -+ } -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } else { // normal filtering -+ int nd_p = 1; -+ int nd_q = 1; -+ const int tc_2 = tc >> 1; -+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) -+ nd_p = 2; -+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) -+ nd_q = 2; -+ -+ for (d = 0; d < 4; d++) { -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; -+ if (abs(delta0) < 10 * tc) { -+ delta0 = av_clip(delta0, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ if (!no_p && nd_p > 1) { -+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); -+ P1 = av_clip_pixel(p1 + deltap1); -+ } -+ if (!no_q && nd_q > 1) { -+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); -+ Q1 = av_clip_pixel(q1 + deltaq1); -+ } -+ } -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } -+ } -+ } -+} -+ -+static void 
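
// [Editorial sketch -- not part of the patch] The _luma2/_uv2 entry points
// replace the tc/no_p/no_q arrays of the generic filters with values packed
// into plain integers: two 16-bit tc values in tc2 and the skip flags in
// no_f. That presumably keeps the rpi command-queue call interface simple,
// though the patch does not state the rationale. The packing convention the
// unpacking expression above implies:
#include <stdint.h>

static inline uint32_t pack_tc2_sketch(uint16_t tc0, uint16_t tc1)
{
    return (uint32_t)tc0 | ((uint32_t)tc1 << 16);
}

static inline int unpack_tc_sketch(uint32_t tc2, int j)  // j = 0 or 1
{
    return (tc2 >> (j << 4)) & 0xffff;  // j << 4 selects bit 0 or bit 16
}
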
FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f) -+{ -+ // Just call the non-2 function having massaged the parameters -+ int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16}; -+ uint8_t no_p[2] = {no_f & 1, no_f & 1}; -+ uint8_t no_q[2] = {no_f & 2, no_f & 2}; -+ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q); -+} -+ -+#undef TP3 -+#undef TP2 -+#undef TP1 -+#undef TP0 -+#undef TQ0 -+#undef TQ1 -+#undef TQ2 -+#undef TQ3 -+ -+#undef P3 -+#undef P2 -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+#undef Q2 -+#undef Q3 -+ -+#define P1 pix_l[0 * xstride] -+#define P0 pix_l[1 * xstride] -+#define Q0 pix_r[0 * xstride] -+#define Q1 pix_r[1 * xstride] -+ -+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, -+ ptrdiff_t _ystride, const int32_t *_tc, -+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) -+{ -+ int d, j, no_p, no_q; -+ pixel *pix_l = (pixel *)_pix_l; -+ pixel *pix_r = (pixel *)_pix_r; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / sizeof(pixel); -+ -+ for (j = 0; j < 2; j++) { -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ if (tc <= 0) { -+ pix_l += 4 * ystride; -+ pix_r += 4 * ystride; -+ continue; -+ } -+ no_p = _no_p[j]; -+ no_q = _no_q[j]; -+ -+ for (d = 0; d < 4; d++) { -+ int delta0; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } -+} -+ -+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, -+ unsigned int no_f) -+{ -+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; -+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; -+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; -+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); -+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f) -+{ -+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; -+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; -+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; -+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); -+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); -+} -+ -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+ -diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c -new file mode 100644 -index 0000000000..62135b83c2 ---- /dev/null -+++ b/libavcodec/rpi_hevcpred.c -@@ -0,0 +1,166 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcdec.h" -+ -+#include "rpi_hevcpred.h" -+#if (ARCH_ARM) -+#include "arm/rpi_hevcpred_arm.h" -+#endif -+ -+#define PRED_C 0 -+#define BIT_DEPTH 8 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+#undef PRED_C -+ -+#define PRED_C 1 -+#define BIT_DEPTH 8 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+#undef PRED_C -+ -+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) -+{ -+#undef FUNC -+#define FUNC(a, depth) a ## _ ## depth -+ -+#undef FUNCC -+#define FUNCC(a, depth) a ## _ ## depth ## _c -+ -+#define HEVC_PRED_Y(depth) \ -+ hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ -+ hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ -+ hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ -+ hpc->intra_pred[3] = FUNC(intra_pred_5, depth); \ -+ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \ -+ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \ -+ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \ -+ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \ -+ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ -+ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ -+ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ -+ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ -+ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \ -+ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \ -+ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \ -+ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \ -+ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \ -+ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \ -+ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \ -+ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth); -+ -+#define HEVC_PRED_C(depth) \ -+ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ -+ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ -+ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ -+ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ -+ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \ -+ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \ -+ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \ -+ 
hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \ -+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ -+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ -+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ -+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ -+ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \ -+ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \ -+ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \ -+ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \ -+ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \ -+ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \ -+ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ -+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \ -+ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \ -+ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \ -+ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth); -+ -+#define HEVC_PRED(depth) \ -+ HEVC_PRED_Y(depth); \ -+ HEVC_PRED_C(depth); -+ -+ switch (bit_depth) { -+ case 9: -+ HEVC_PRED(9); -+ break; -+ case 10: -+ HEVC_PRED(10); -+ break; -+ case 12: -+ HEVC_PRED(12); -+ break; -+ default: -+ HEVC_PRED(8); -+ break; -+ } -+ -+#if (ARCH_ARM) -+ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); -+#elif (ARCH_MIPS) -+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); -+#endif -+} -diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h -new file mode 100644 -index 0000000000..6e594277c0 ---- /dev/null -+++ b/libavcodec/rpi_hevcpred.h -@@ -0,0 +1,121 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCPRED_H
-+#define AVCODEC_RPI_HEVCPRED_H
-+
-+#include <stddef.h>
-+#include <stdint.h>
-+#include "config.h"
-+
-+struct HEVCRpiContext;
-+struct HEVCRpiLocalContext;
-+
-+enum IntraPredMode {
-+    INTRA_PLANAR = 0,
-+    INTRA_DC,
-+    INTRA_ANGULAR_2,
-+    INTRA_ANGULAR_3,
-+    INTRA_ANGULAR_4,
-+    INTRA_ANGULAR_5,
-+    INTRA_ANGULAR_6,
-+    INTRA_ANGULAR_7,
-+    INTRA_ANGULAR_8,
-+    INTRA_ANGULAR_9,
-+    INTRA_ANGULAR_10,
-+    INTRA_ANGULAR_11,
-+    INTRA_ANGULAR_12,
-+    INTRA_ANGULAR_13,
-+    INTRA_ANGULAR_14,
-+    INTRA_ANGULAR_15,
-+    INTRA_ANGULAR_16,
-+    INTRA_ANGULAR_17,
-+    INTRA_ANGULAR_18,
-+    INTRA_ANGULAR_19,
-+    INTRA_ANGULAR_20,
-+    INTRA_ANGULAR_21,
-+    INTRA_ANGULAR_22,
-+    INTRA_ANGULAR_23,
-+    INTRA_ANGULAR_24,
-+    INTRA_ANGULAR_25,
-+    INTRA_ANGULAR_26,
-+    INTRA_ANGULAR_27,
-+    INTRA_ANGULAR_28,
-+    INTRA_ANGULAR_29,
-+    INTRA_ANGULAR_30,
-+    INTRA_ANGULAR_31,
-+    INTRA_ANGULAR_32,
-+    INTRA_ANGULAR_33,
-+    INTRA_ANGULAR_34,
-+};
-+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10
-+#define INTRA_ANGULAR_VERTICAL   INTRA_ANGULAR_26
-+
-+typedef void intra_filter_fn_t(
-+    uint8_t * const left, uint8_t * const top,
-+    const unsigned int req, const unsigned int avail,
-+    const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur,
-+    const unsigned int stride,
-+    const unsigned int top_right_size, const unsigned int down_left_size);
-+
-+typedef struct HEVCRpiPredContext {
-+    void (*intra_pred[4])(const struct HEVCRpiContext * const s,
-+                          const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail);
-+
-+    intra_filter_fn_t *intra_filter[4];
-+    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
-+                           const uint8_t *left, ptrdiff_t stride);
-+    void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+                       ptrdiff_t stride);
-+    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
-+                            const uint8_t *left, ptrdiff_t stride,
-+                            int mode);
-+    void (*pred_vertical[4])(uint8_t *src, const uint8_t *top,
-+                             const uint8_t *left, ptrdiff_t stride,
-+                             int mode);
-+    void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top,
-+                               const uint8_t *left, ptrdiff_t stride,
-+                               int mode);
-+    void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride);
-+
-+    void (*intra_pred_c[4])(const struct HEVCRpiContext * const s,
-+                            const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail);
-+    intra_filter_fn_t *intra_filter_c[4];
-+    void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
-+                             const uint8_t *left, ptrdiff_t stride);
-+    void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+                         ptrdiff_t stride);
-+    void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
-+                              const uint8_t *left, ptrdiff_t stride,
-+                              int mode);
-+    void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
-+                               const uint8_t *left, ptrdiff_t stride,
-+                               int mode);
-+    void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
-+                                 const uint8_t *left, ptrdiff_t stride,
-+                                 int mode);
-+    void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
-+} HEVCRpiPredContext;
-+
-+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
-+
-+#endif /* AVCODEC_RPI_HEVCPRED_H */
-diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
-new file mode 100644
-index
0000000000..2f710626cf ---- /dev/null -+++ b/libavcodec/rpi_hevcpred_template.c -@@ -0,0 +1,1522 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "config.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "bit_depth_template.c" -+ -+#include "rpi_hevcdec.h" -+#include "rpi_hevcpred.h" -+ -+#define DUMP_PRED 0 -+ -+#define POS(x, y) src[(x) + stride * (y)] -+ -+// INCLUDED_ONCE defined at EOF -+#ifndef INCLUDED_ONCE -+typedef uint8_t (* c8_dst_ptr_t)[2]; -+typedef const uint8_t (* c8_src_ptr_t)[2]; -+typedef uint16_t (* c16_dst_ptr_t)[2]; -+typedef const uint16_t (* c16_src_ptr_t)[2]; -+ -+// *** On ARM make these NEON registers -+typedef struct pixel4_16 { -+ uint16_t x[4]; -+} pixel4_16; -+typedef struct pixel4_32 { -+ uint32_t x[4]; -+} pixel4_32; -+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) -+{ -+ pixel4_16 t = {{x, x, x, x}}; -+ return t; -+} -+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) -+{ -+ pixel4_32 t = {{x, x, x, x}}; -+ return t; -+} -+#endif -+ -+#if PRED_C -+// For chroma we double pixel size so we copy pairs -+#undef pixel -+#undef pixel2 -+#undef pixel4 -+#undef dctcoef -+#undef INIT_CLIP -+#undef no_rnd_avg_pixel4 -+#undef rnd_avg_pixel4 -+#undef AV_RN2P -+#undef AV_RN4P -+#undef AV_RN4PA -+#undef AV_WN2P -+#undef AV_WN4P -+#undef AV_WN4PA -+#undef CLIP -+#undef FUNC -+#undef FUNCC -+#undef av_clip_pixel -+#undef PIXEL_SPLAT_X4 -+ -+#if BIT_DEPTH == 8 -+#define pixel uint16_t -+#define pixel4 pixel4_16 -+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 -+#define cpel uint8_t -+#define c_src_ptr_t c8_src_ptr_t -+#define c_dst_ptr_t c8_dst_ptr_t -+#else -+#define pixel uint32_t -+#define pixel4 pixel4_32 -+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 -+#define cpel uint16_t -+#define c_src_ptr_t c16_dst_ptr_t -+#define c_dst_ptr_t c16_dst_ptr_t -+#endif -+#define AV_RN4P(p) (*(pixel4*)(p)) -+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) -+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) -+#endif -+ -+ -+// Get PW prior to horrid PRED_C trickery -+#if BIT_DEPTH == 8 -+#define PW 1 -+#else -+#define PW 2 -+#endif -+ -+ -+#if DUMP_PRED && !defined(INCLUDED_ONCE) -+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) -+{ -+ for (unsigned int y = 0; y != size; y++, data += stride * 2) { -+ for (unsigned int x = 0; x != size; x++) { -+ printf("%4d", data[x * 2]); -+ } -+ printf("\n"); -+ } -+ printf("\n"); -+} -+#endif -+ -+#ifndef INCLUDED_ONCE -+static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t v4 = v | (v << 8); -+ uint32_t * p = (uint32_t *)ptr; -+ v4 = v4 | (v4 << 16); -+ do { -+ *p++ = v4; 
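
// [Editorial sketch -- not part of the patch] extend_8 here (and extend_16 /
// extend_32 just below) splat one reference value across n bytes of border,
// n being a multiple of 4, using whole 32-bit stores. For the 8-bit case the
// observable effect is a word-rounded memset:
#include <string.h>

static void extend_8_plain_sketch(void *ptr, unsigned int v, unsigned int n)
{
    memset(ptr, (int)v, n & ~3u);  // writes the same bytes as extend_8()
}
// The hand-rolled versions exist because the 16- and 32-bit pixel variants
// have no memset equivalent; keeping all three word-based keeps them uniform.
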
-+ } while (--n != 0); -+ } -+} -+ -+static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t v2 = v | (v << 16); -+ uint32_t * p = (uint32_t *)ptr; -+ do { -+ *p++ = v2; -+ *p++ = v2; -+ } while (--n != 0); -+ } -+} -+ -+static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t * p = (uint32_t *)ptr; -+ do { -+ *p++ = v; -+ *p++ = v; -+ *p++ = v; -+ *p++ = v; -+ } while (--n != 0); -+ } -+} -+ -+// Beware that this inverts the avail ordering -+// For CIP it seems easier this way round -+static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask, -+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, -+ unsigned int s0, unsigned int odd_s) -+{ -+ const unsigned int n = 1 << log2_intra_bits; -+ unsigned int fa = 0; -+ unsigned int i; -+ -+ size >>= 2; // Now in 4-pel units -+ s0 >>= 2; -+ -+ if ((avail & AVAIL_DL) != 0) -+ fa |= ((1 << s0) - 1) << (size - s0); -+ if ((avail & AVAIL_L) != 0) -+ fa |= ((1 << size) - 1) << size; -+ if ((avail & AVAIL_UL) != 0) -+ fa |= 1 << (size << 1); -+ -+ if (odd_s) { -+ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) -+ fa &= ~1; -+ is_intra += i_stride; -+ } -+ -+ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { -+ const unsigned int m = ((1 << n) - 1) << i; -+ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) -+ fa &= ~m; -+ } -+ -+ return fa; -+} -+ -+static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, -+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, -+ unsigned int s1, unsigned int odd_s) -+{ -+ if ((avail & (AVAIL_U | AVAIL_UR)) == 0) -+ { -+ return 0; -+ } -+ else -+ { -+ const unsigned int n = 1 << log2_intra_bits; -+ unsigned int fa = 0; -+ unsigned int i; -+ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; -+ -+ size >>= 2; // Now in 4-pel units -+ s1 >>= 2; -+ -+ if ((avail & AVAIL_U) != 0) -+ fa |= ((1 << size) - 1); -+ if ((avail & AVAIL_UR) != 0) -+ fa |= ((1 << s1) - 1) << size; -+ -+ if (odd_s) { -+ fa &= im | ~1; -+ im >>= 1; -+ } -+ -+ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { -+ const unsigned int m = ((1 << n) - 1) << i; -+ if ((im & 1) == 0) -+ fa &= ~m; -+ } -+ return fa; -+ } -+} -+ -+ -+ -+static inline unsigned int rmbd(unsigned int x) -+{ -+#if 1 -+ return __builtin_ctz(x); -+#else -+ unsigned int n = 0; -+ if ((x & 0xffff) == 0) { -+ x >>= 16; -+ n += 16; -+ } -+ if ((x & 0xff) == 0) { -+ x >>= 8; -+ n += 8; -+ } -+ if ((x & 0xf) == 0) { -+ x >>= 4; -+ n += 4; -+ } -+ if ((x & 0x3) == 0) { -+ x >>= 2; -+ n += 2; -+ } -+ -+ return (x & 1) == 0 ? n + 1 : n; -+#endif -+} -+#endif -+ -+ -+static void FUNC(cip_fill)(pixel * const left, pixel * const top, -+ const unsigned int avail_l, const unsigned int avail_u, -+ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, -+ const unsigned int stride, -+ const unsigned int size) -+{ -+ pixel a; -+ unsigned int i; -+ -+ // 1st find DL value -+ if ((avail_l & 1) == 0) { -+ if (avail_l != 0) -+ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; -+ else -+ { -+ // (avail_l | avail_u) != 0 so this must be good -+ const unsigned int n = rmbd(avail_u)*4; -+ a = (n >= size) ? 
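
// [Editorial sketch -- not part of the patch] cip_fill, being assembled here,
// implements the constrained-intra-prediction substitution rule: scanning
// down-left -> left -> above-left -> up -> up-right, every 4-pel group whose
// bit is clear in the avail mask is replaced with the most recent available
// pel; rmbd() (a count-trailing-zeros) locates the first available group to
// seed the scan. The same rule reduced to one dimension:
#include <stdint.h>

static void cip_fill_1d_sketch(uint8_t *dst, const uint8_t *src,
                               unsigned int n_groups, unsigned int avail,
                               uint8_t seed)
{
    uint8_t a = seed;  // value carried across unavailable gaps
    for (unsigned int g = 0; g < n_groups; g++, avail >>= 1)
        for (unsigned int i = 0; i < 4; i++) {
            unsigned int k = g * 4 + i;
            dst[k] = (avail & 1) ? (a = src[k]) : a;
        }
}
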
src_ur[n - size] : src_u[n]; -+ } -+ } -+ -+ // L -+ { -+ pixel * d = left + size * 2 - 1; -+ const pixel * s = src_l + (size * 2 - 1) * stride; -+ unsigned int x = avail_l; -+ for (i = 0; i < size * 2; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d-- = *s; -+ s -= stride; -+ *d-- = *s; -+ s -= stride; -+ *d-- = *s; -+ s -= stride; -+ *d-- = a = *s; -+ s -= stride; -+ } -+ else -+ { -+ *d-- = a; -+ *d-- = a; -+ *d-- = a; -+ *d-- = a; -+ s -= stride * 4; -+ } -+ } -+ // UL -+ *d = a = (x & 1) != 0 ? *s : a; -+ } -+ -+ // U -+ { -+ pixel * d = top; -+ const pixel * s = src_u; -+ unsigned int x = avail_u; -+ -+ for (i = 0; i < size; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = a = *s++; -+ } -+ else -+ { -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ s += 4; -+ } -+ } -+ -+ // UR -+ s = src_ur; -+ for (i = 0; i < size; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = a = *s++; -+ } -+ else -+ { -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ s += 4; -+ } -+ } -+ } -+} -+ -+ -+#if !PRED_C && PW == 1 -+#define EXTEND(ptr, val, len) extend_8(ptr, val, len) -+#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) -+#define EXTEND(ptr, val, len) extend_16(ptr, val, len) -+#else -+#define EXTEND(ptr, val, len) extend_32(ptr, val, len) -+#endif -+ -+// Reqs: -+// -+// Planar: DL[0], L, ul, U, UR[0] -+// DC: dl, L, ul, U, ur -+// A2-9: DL, L, ul, u, ur -+// A10: dl, L, ul, u, ur -+// A11-17 dl, L, UL, U, ur -+// A18-25 dl, L, Ul, U, ur -+// A26 dl, l, ul, U, ur -+// A27-34 dl, l, ul, U, UR -+ -+#ifndef INCLUDED_ONCE -+ -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; -+ -+#define FILTER_LIGHT 0x40 -+#define FILTER_STRONG 0x80 -+#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) -+ -+static const uint8_t req_avail_c[35] = -+{ -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L, // 2 -+ AVAIL_DL | AVAIL_L, // 3 -+ AVAIL_DL | AVAIL_L, // 4 -+ AVAIL_DL | AVAIL_L, // 5 -+ AVAIL_DL | AVAIL_L, // 6 -+ AVAIL_DL | AVAIL_L, // 7 -+ AVAIL_DL | AVAIL_L, // 8 -+ AVAIL_DL | AVAIL_L, // 9 -+ AVAIL_L, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 -+ AVAIL_U, // 26 (V) -+ AVAIL_U | AVAIL_UR, // 27 -+ AVAIL_U | AVAIL_UR, // 28 -+ AVAIL_U | AVAIL_UR, // 29 -+ AVAIL_U | AVAIL_UR, // 30 -+ AVAIL_U | AVAIL_UR, // 31 -+ AVAIL_U | AVAIL_UR, // 32 -+ AVAIL_U | AVAIL_UR, // 33 -+ AVAIL_U | AVAIL_UR // 34 -+}; -+ -+static const uint8_t req_avail[4][35] = { -+{ -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L, // 2 -+ AVAIL_DL | AVAIL_L, // 3 -+ AVAIL_DL | AVAIL_L, // 4 -+ AVAIL_DL | AVAIL_L, // 5 -+ AVAIL_DL | AVAIL_L, // 6 -+ AVAIL_DL 
| AVAIL_L, // 7 -+ AVAIL_DL | AVAIL_L, // 8 -+ AVAIL_DL | AVAIL_L, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) -+ AVAIL_U | AVAIL_UR, // 27 -+ AVAIL_U | AVAIL_UR, // 28 -+ AVAIL_U | AVAIL_UR, // 29 -+ AVAIL_U | AVAIL_UR, // 30 -+ AVAIL_U | AVAIL_UR, // 31 -+ AVAIL_U | AVAIL_UR, // 32 -+ AVAIL_U | AVAIL_UR, // 33 -+ AVAIL_U | AVAIL_UR // 34 -+}, -+{ // 3 -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 -+ AVAIL_DL | AVAIL_L | 0, // 3 -+ AVAIL_DL | AVAIL_L | 0, // 4 -+ AVAIL_DL | AVAIL_L | 0, // 5 -+ AVAIL_DL | AVAIL_L | 0, // 6 -+ AVAIL_DL | AVAIL_L | 0, // 7 -+ AVAIL_DL | AVAIL_L | 0, // 8 -+ AVAIL_DL | AVAIL_L | 0, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) -+ AVAIL_U | AVAIL_UR | 0, // 27 -+ AVAIL_U | AVAIL_UR | 0, // 28 -+ AVAIL_U | AVAIL_UR | 0, // 29 -+ AVAIL_U | AVAIL_UR | 0, // 30 -+ AVAIL_U | AVAIL_UR | 0, // 31 -+ AVAIL_U | AVAIL_UR | 0, // 32 -+ AVAIL_U | AVAIL_UR | 0, // 33 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 -+}, -+{ // 4 -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 -+ AVAIL_DL | AVAIL_L | 0, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U 
| FILTER_LIGHT, // 22
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
-+ AVAIL_U | AVAIL_UR | 0, // 27
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
-+},
-+{ // 5
-+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
-+ AVAIL_L | 0 | AVAIL_U, // DC
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9
-+ AVAIL_L | 0, // 10 (H)
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25
-+ AVAIL_U | 0, // 26 (V)
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34
-+}
-+};
-+
-+
-+#endif
-+
-+#define filter_light1 FUNC(filter_light1)
-+static inline pixel filter_light1(pixel a, pixel b, pixel c)
-+{
-+ return (a + b*2 + c + 2) >> 2;
-+}
-+
-+#define filter_light FUNC(filter_light)
-+static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n)
-+{
-+ pixel p0;
-+ pixel p2 = *src;
-+ // Allow for final pel - it is just clearer to have the call take the actual number of output pels
-+ unsigned int n_minus_1 = n - 1;
-+
-+ do
-+ {
-+ src += sstride;
-+ p0 = p1;
-+ p1 = p2;
-+ p2 = *src;
-+ *dst++ = filter_light1(p0, p1, p2);
-+ } while (--n_minus_1 != 0);
-+ *dst = filter_light1(p1, p2, pn);
-+}
-+
-+#define filter_strong FUNC(filter_strong)
-+static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n)
-+{
-+ unsigned int a = 64 * p0 + 32;
-+ const int v = p1 - p0;
-+
-+ do
-+ {
-+ *dst++ = (a += v) >> 6;
-+ } while (--n != 0);
-+}
-+
-+#define intra_filter FUNC(intra_filter)
-+static av_always_inline void intra_filter(
-+ pixel * const left, pixel * const top,
-+ const unsigned int req, const unsigned int avail,
-+ const pixel * const src_l, const pixel * const src_u, const pixel *
const src_ur, -+ const unsigned int stride, -+ const unsigned int top_right_size, const unsigned int down_left_size, -+ const unsigned int log2_size) -+{ -+ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); -+ const unsigned int size = 1 << log2_size; -+ -+ // a_ is the first pel in a section working round dl -> ur -+ // b_ is the last -+ // Beware that top & left work out from UL so usage of a_ & b_ may -+ // swap between them. It is a bad naming scheme but I have found no -+ // better -+ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; -+ const pixel * b_dl = src_l + size * stride; -+ const pixel * a_l = src_l + (size - 1) * stride; -+ const pixel * b_l = src_l; -+ const pixel * ab_ul = src_l - stride; -+ const pixel * a_u = src_u; -+ const pixel * b_u = src_u + size - 1; -+ const pixel * a_ur = src_ur; -+ const pixel * b_ur = src_ur + top_right_size - 1; -+ -+ const unsigned int want = req & ~avail; -+ const unsigned int have = req & avail; -+ unsigned int i; -+ -+ if ((avail & AVAIL_DL) == 0) -+ { -+ a_dl = a_ur; -+ if ((avail & AVAIL_U) != 0) -+ a_dl = a_u; -+ if ((avail & AVAIL_UL) != 0) -+ a_dl = ab_ul; -+ if ((avail & AVAIL_L) != 0) -+ a_dl = a_l; -+ b_dl = a_dl; -+ } -+ -+ if ((avail & AVAIL_L) == 0) -+ { -+ a_l = b_dl; -+ b_l = b_dl; -+ } -+ if ((avail & AVAIL_UL) == 0) -+ { -+ ab_ul = b_l; -+ } -+ if ((avail & AVAIL_U) == 0) -+ { -+ a_u = ab_ul; -+ b_u = ab_ul; -+ } -+ if ((avail & AVAIL_UR) == 0) -+ { -+ a_ur = b_u; -+ b_ur = b_u; -+ } -+ -+ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints -+ { -+ if ((req & AVAIL_UL) != 0) -+ left[-1] = *ab_ul; -+ -+ if ((want & AVAIL_L) != 0) -+ EXTEND(left, *a_l, size); -+ if ((want & AVAIL_DL) != 0) -+ EXTEND(left + size, *a_dl, size); -+ if ((want & AVAIL_U) != 0) -+ EXTEND(top, *a_u, size); -+ if ((want & AVAIL_UR) != 0) -+ EXTEND(top + size, *a_ur, size); -+ -+ if ((have & AVAIL_U) != 0) -+ // Always good - even with sand -+ memcpy(top, a_u, size * sizeof(pixel)); -+ if ((have & AVAIL_UR) != 0) -+ { -+ memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); -+ EXTEND(top + size + top_right_size, *b_ur, -+ size - top_right_size); -+ } -+ if ((have & AVAIL_L) != 0) -+ { -+ for (i = 0; i < size; i++) -+ left[i] = b_l[stride * i]; -+ } -+ if ((have & AVAIL_DL) != 0) -+ { -+ for (i = 0; i < down_left_size; i++) -+ left[i + size] = b_dl[stride * i]; -+ EXTEND(left + size + down_left_size, *a_dl, -+ size - down_left_size); -+ } -+ } -+ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint -+ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && -+ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) -+ { -+ if ((req & (AVAIL_U | AVAIL_UR)) != 0) -+ filter_strong(top, *ab_ul, *b_ur, size * 2); -+ left[-1] = *ab_ul; -+ if ((req & (AVAIL_L | AVAIL_DL)) != 0) -+ filter_strong(left, *ab_ul, *a_dl, size*2); -+ } -+ else -+ { -+ // Same code for both have & want for UL -+ if ((req & AVAIL_UL) != 0) -+ { -+ left[-1] = filter_light1(*b_l, *ab_ul, *a_u); -+ } -+ -+ if ((want & AVAIL_L) != 0) -+ { -+ EXTEND(left, *a_l, size); -+ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; -+ } -+ if ((want & AVAIL_DL) != 0) -+ { -+ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding -+ EXTEND(left + size, *a_l, size); -+ } -+ if ((want & AVAIL_U) != 0) -+ { -+ EXTEND(top, *a_u, size); -+ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; -+ } -+ if ((want & AVAIL_UR) != 0) -+ { -+ // If we want UR then it cannot be avail so a_ur = 
b_u so no edge rounding
-+ EXTEND(top + size, *a_ur, size);
-+ }
-+
-+ if ((have & AVAIL_U) != 0)
-+ {
-+ filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
-+ }
-+ if ((have & AVAIL_UR) != 0) {
-+ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
-+ top[size*2 - 1] = *b_ur;
-+ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
-+ }
-+ if ((have & AVAIL_L) != 0)
-+ {
-+ filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
-+ }
-+ if ((have & AVAIL_DL) != 0)
-+ {
-+ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
-+ left[size*2 - 1] = *a_dl;
-+ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
-+ }
-+ }
-+}
-+
-+#define INTRA_FILTER(log2_size) \
-+static void FUNC(intra_filter_ ## log2_size)( \
-+ uint8_t * const left, uint8_t * const top, \
-+ const unsigned int req, const unsigned int avail, \
-+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
-+ const unsigned int stride, \
-+ const unsigned int top_right_size, const unsigned int down_left_size) \
-+{ \
-+ intra_filter((pixel *)left, (pixel *)top, req, avail, \
-+ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
-+}
-+
-+INTRA_FILTER(2)
-+INTRA_FILTER(3)
-+INTRA_FILTER(4)
-+INTRA_FILTER(5)
-+
-+#undef intra_filter
-+#undef INTRA_FILTER
-+
-+static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s,
-+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail,
-+ const unsigned int log2_size)
-+{
-+ // c_idx will always be 1 for _c versions and 0 for y
-+ const unsigned int c_idx = PRED_C;
-+ const unsigned int hshift = ctx_hshift(s, c_idx);
-+ const unsigned int vshift = ctx_vshift(s, c_idx);
-+ const unsigned int size = (1 << log2_size);
-+ const unsigned int x = x0 >> hshift;
-+ const unsigned int y = y0 >> vshift;
-+
-+ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
-+ pixel *const src = c_idx == 0 ?
-+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
-+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
-+
-+ // Align so we can do multiple loads in the asm
-+ // Padded to 16 byte boundary so as not to confuse anything
-+ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
-+ DECLARE_ALIGNED(16, pixel, top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
-+
-+ pixel * const left = left_array + 16 / sizeof(pixel);
-+ pixel * const top = top_array + 16 / sizeof(pixel);
-+ const pixel * top_pred = top;
-+
-+ const pixel * src_l = src - 1;
-+ const pixel * src_u = src - stride;
-+ const pixel * src_ur = src_u + size;
-+#if !PRED_C
-+ unsigned int req = req_avail[log2_size - 2][mode];
-+#else
-+ unsigned int req = req_avail_c[mode];
-+#endif
-+
-+ // If we have nothing to pred from then fill with grey
-+ // This isn't a common case but dealing with it here means we don't have to
-+ // test for it later
-+ if (avail == 0)
-+ {
-+dc_only:
-+#if !PRED_C
-+ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
-+#else
-+ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
-+#endif
-+ return;
-+ }
-+
-+ // There will be no filtering on C so no point worrying about disabling it
-+#if !PRED_C
-+ if (s->ps.sps->intra_smoothing_disabled_flag)
-+ req &= ~FILTER_EITHER;
-+ if (!s->ps.sps->sps_strong_intra_smoothing_enable_flag)
-+ req &= ~FILTER_STRONG;
-+#endif
-+
-+ {
-+ // N.B.
stride is in pixels (not bytes) or in the case of chroma pixel-pairs -+ const AVFrame * const frame = s->frame; -+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; -+ if ((x & mask) == 0) -+ src_l -= stripe_adj; -+ if (((x + size) & mask) == 0) -+ src_ur += stripe_adj; -+ } -+ -+ // Can deal with I-slices in 'normal' code even if CIP -+ // This also means that we don't need to generate (elsewhere) is_intra -+ // for IRAP frames -+ if (s->ps.pps->constrained_intra_pred_flag == 1 && -+ s->sh.slice_type != HEVC_SLICE_I) -+ { -+ // * If we ever actually care about CIP performance then we should -+ // special case out size 4 stuff (can be done by 'normal') and -+ // have 8-pel avail masks -+ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)), -+ -(int)(s->ps.sps->pcm_width), -+ 1 << (((x - 1) >> (3 - hshift)) & 7), -+ 1 - hshift, -+ avail, -+ size, -+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), -+ vshift != 0 ? 0 : (y >> 2) & 1); -+ -+ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)), -+ (x >> (3 - hshift)) & 7, -+ 1 - hshift, -+ avail, -+ size, -+ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size), -+ hshift != 0 ? 0 : (x >> 2) & 1); -+ -+ // Anything left? -+ if ((avail_l | avail_u) == 0) -+ goto dc_only; -+ -+ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); -+ -+#if !PRED_C -+ if ((req & FILTER_LIGHT) != 0) -+ { -+ const unsigned threshold = 1 << (BIT_DEPTH - 5); -+ if ((req & FILTER_STRONG) != 0 && -+ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && -+ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) -+ { -+ filter_strong(top, left[-1], top[63], 64); -+ filter_strong(left, left[-1], left[63], 64); -+ } else -+ { -+ // LHS writes UL too so copy for top -+ const pixel p_ul = left[-1]; -+ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); -+ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); -+ } -+ } -+#endif -+ } -+ else -+ { -+ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); -+ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && -+ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) -+ { -+ top_pred = src_u; -+ } -+ else -+ { -+#if !PRED_C -+ s->hpc.intra_filter[log2_size - 2] -+#else -+ s->hpc.intra_filter_c[log2_size - 2] -+#endif -+ ((uint8_t *)left, (uint8_t *)top, req, avail, -+ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), -+ ur_size, -+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); -+ } -+ } -+ -+ -+#if !PRED_C -+ switch (mode) { -+ case INTRA_PLANAR: -+ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_DC: -+ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ default: -+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ 
(uint8_t *)left, stride, -+ mode); -+ break; -+ } -+#else -+ switch (mode) { -+ case INTRA_PLANAR: -+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_DC: -+ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ default: -+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ } -+ -+#if DUMP_PRED -+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); -+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); -+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); -+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); -+#endif -+#endif -+} -+ -+#define INTRA_PRED(log2_size) \ -+static void FUNC(intra_pred_ ## log2_size)(const struct HEVCRpiContext * const s, \ -+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail) \ -+{ \ -+ FUNC(intra_pred)(s, mode, x0, y0, avail, log2_size); \ -+} -+ -+INTRA_PRED(2) -+INTRA_PRED(3) -+INTRA_PRED(4) -+INTRA_PRED(5) -+ -+#undef INTRA_PRED -+ -+#if !PRED_C -+static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, ptrdiff_t stride, -+ int trafo_size) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ int size = 1 << trafo_size; -+ for (y = 0; y < size; y++) -+ for (x = 0; x < size; x++) -+ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + -+ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); -+} -+#else -+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, -+ const uint8_t * _left, ptrdiff_t stride, -+ int trafo_size) -+{ -+ int x, y; -+ int size = 1 << trafo_size; -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const c_src_ptr_t top = (c_src_ptr_t)_top; -+ const c_src_ptr_t left = (c_src_ptr_t)_left; -+ -+ for (y = 0; y < size; y++, src += stride) -+ { -+ for (x = 0; x < size; x++) -+ { -+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + -+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); -+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + -+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); -+ } -+ } -+} -+#endif -+ -+#define PRED_PLANAR(size)\ -+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_planar)(src, top, left, stride, size + 2); \ -+} -+ -+PRED_PLANAR(0) -+PRED_PLANAR(1) -+PRED_PLANAR(2) -+PRED_PLANAR(3) -+ -+#undef PRED_PLANAR -+ -+#if !PRED_C -+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, int log2_size) -+{ -+ int i, j, x, y; -+ int size = (1 << log2_size); -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ int dc = size; -+ pixel4 a; -+ for (i = 0; i < size; i++) -+ dc += left[i] + top[i]; -+ -+ dc >>= log2_size + 1; -+ -+ a = PIXEL_SPLAT_X4(dc); -+ -+ for (i = 
0; i < size; i++) -+ for (j = 0; j < size; j+=4) -+ AV_WN4P(&POS(j, i), a); -+ -+// if (c_idx == 0 && size < 32) -+// As we now have separate fns for y & c - no need to test that -+ if (size < 32) -+ { -+ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; -+ for (x = 1; x < size; x++) -+ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; -+ for (y = 1; y < size; y++) -+ POS(0, y) = (left[y] + 3 * dc + 2) >> 2; -+ } -+} -+#else -+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, int log2_size) -+{ -+ unsigned int i, j; -+ const unsigned int size = (1 << log2_size); -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const c_src_ptr_t top = (c_src_ptr_t)_top; -+ const c_src_ptr_t left = (c_src_ptr_t)_left; -+ unsigned int dc0 = size; -+ unsigned int dc1 = size; -+ -+ for (i = 0; i < size; i++) -+ { -+ dc0 += left[i][0] + top[i][0]; -+ dc1 += left[i][1] + top[i][1]; -+ } -+ -+ dc0 >>= log2_size + 1; -+ dc1 >>= log2_size + 1; -+ -+ for (i = 0; i < size; i++, src += stride) -+ { -+ for (j = 0; j < size; ++j) -+ { -+ src[j][0] = dc0; -+ src[j][1] = dc1; -+ -+ } -+ } -+} -+#endif -+ -+#define PRED_DC(size)\ -+static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_dc)(src, top, left, stride, size + 2); \ -+} -+ -+PRED_DC(0) -+PRED_DC(1) -+PRED_DC(2) -+PRED_DC(3) -+ -+#undef PRED_DC -+ -+ -+ -+ -+#if !PRED_C -+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) -+{ -+ int i, j; -+ int size = (1 << log2_size); -+ pixel *src = (pixel *)_src; -+ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); -+ -+ for (i = 0; i < size; i++) -+ for (j = 0; j < size; j+=4) -+ AV_WN4P(&POS(j, i), a); -+} -+#else -+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) -+{ -+ unsigned int i, j; -+ const unsigned int size = (1 << log2_size); -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const pixel a = (1 << (BIT_DEPTH - 1)); -+ -+ for (i = 0; i < size; i++, src += stride) -+ { -+ for (j = 0; j < size; ++j) -+ { -+ src[j][0] = a; -+ src[j][1] = a; -+ } -+ } -+} -+#endif -+ -+#define PRED_DC0(size)\ -+static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_dc0)(src, stride, size + 2); \ -+} -+ -+PRED_DC0(0) -+PRED_DC0(1) -+PRED_DC0(2) -+PRED_DC0(3) -+ -+#undef PRED_DC0 -+ -+ -+ -+ -+#ifndef ANGLE_CONSTS -+#define ANGLE_CONSTS -+static const int intra_pred_angle[] = { -+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, -+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 -+}; -+static const int inv_angle[] = { -+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, -+ -630, -910, -1638, -4096 -+}; -+#endif -+ -+#if !PRED_C -+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, -+ const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, -+ int mode, int size) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ -+ int angle = intra_pred_angle[mode - 2]; -+ pixel ref_array[3 * MAX_TB_SIZE + 4]; -+ pixel *ref_tmp = ref_array + size; -+ const pixel *ref; -+ int last = (size * angle) >> 5; -+ -+ if (mode >= 18) { -+ ref = top - 1; -+ if (angle < 0 && last < -1) { -+ for (x = 0; x <= size; x += 4) -+ AV_WN4P(&ref_tmp[x], AV_RN4P(&top[x - 1])); -+ for (x = last; x <= -1; x++) -+ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; -+ ref = ref_tmp; -+ } -+ -+ for (y = 0; y < size; 
y++) { -+ int idx = ((y + 1) * angle) >> 5; -+ int fact = ((y + 1) * angle) & 31; -+ if (fact) { -+ for (x = 0; x < size; x += 4) { -+ POS(x , y) = ((32 - fact) * ref[x + idx + 1] + -+ fact * ref[x + idx + 2] + 16) >> 5; -+ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] + -+ fact * ref[x + 1 + idx + 2] + 16) >> 5; -+ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] + -+ fact * ref[x + 2 + idx + 2] + 16) >> 5; -+ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] + -+ fact * ref[x + 3 + idx + 2] + 16) >> 5; -+ } -+ } else { -+ for (x = 0; x < size; x += 4) -+ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); -+ } -+ } -+// if (mode == 26 && c_idx == 0 && size < 32) { -+ if (mode == 26 && size < 32) { -+ for (y = 0; y < size; y++) -+ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); -+ } -+ -+ } else { -+ ref = left - 1; -+ if (angle < 0 && last < -1) { -+ for (x = 0; x <= size; x += 4) -+ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1])); -+ for (x = last; x <= -1; x++) -+ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; -+ ref = ref_tmp; -+ } -+ -+ for (x = 0; x < size; x++) { -+ int idx = ((x + 1) * angle) >> 5; -+ int fact = ((x + 1) * angle) & 31; -+ if (fact) { -+ for (y = 0; y < size; y++) { -+ POS(x, y) = ((32 - fact) * ref[y + idx + 1] + -+ fact * ref[y + idx + 2] + 16) >> 5; -+ } -+ } else { -+ for (y = 0; y < size; y++) -+ POS(x, y) = ref[y + idx + 1]; -+ } -+ } -+// if (mode == 10 && c_idx == 0 && size < 32) { -+ if (mode == 10 && size < 32) { -+ for (x = 0; x < size; x += 4) { -+ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1)); -+ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1)); -+ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - top[-1]) >> 1)); -+ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - top[-1]) >> 1)); -+ } -+ } -+ } -+ -+ -+ -+#if BIT_DEPTH == 8 && 0 -+ if ((size == 16 || size == 32) && mode != 10 && mode != 26) { -+ DECLARE_ALIGNED(16, uint8_t, a[64*32]); -+ void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+// void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+#if 1 -+ src = (pixel *)_src; -+ printf("C: Mode=%d\n", mode); -+ for (y = 0; y < size; y++, src += stride) -+ { -+ printf("%2d: ", y); -+ for (x = 0; x < size; x++) -+ { -+ printf("%3x ", src[x]); -+ } -+ printf("\n"); -+ } -+#endif -+// ff_hevc_rpi_pred_vertical_16_neon_8(a, _top, _left, size); -+ memset(a, 0, sizeof(a)); -+// ff_hevc_rpi_pred_angular_32_neon_10(a, _top, _left, size, mode); -+ ff_hevc_rpi_pred_angular_16_neon_8(a, _top, _left, size, mode); -+#if 1 -+ src = (pixel *)a; -+ printf("A:\n"); -+ for (y = 0; y < size; y++, src += size) -+ { -+ printf("%2d: ", y); -+ for (x = 0; x < size; x++) -+ { -+ printf("%3x ", src[x]); -+ } -+ printf("\n"); -+ } -+#endif -+ src = (pixel *)_src; -+ for (y = 0; y < size; y++, src += stride) -+ { -+ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) { -+ printf("Fail at line %d\n", y); -+ av_assert0(0); -+ } -+ } -+ } -+#endif -+ -+} -+#else -+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, -+ const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, -+ int mode, int size) -+{ -+ int x, y; -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ c_src_ptr_t top = (c_src_ptr_t)_top; -+ c_src_ptr_t left = (c_src_ptr_t)_left; -+ -+ const int angle = intra_pred_angle[mode - 2]; -+ cpel ref_array[3 * 
MAX_TB_SIZE + 4][2];
-+ c_dst_ptr_t ref_tmp = ref_array + size;
-+ c_src_ptr_t ref;
-+ const int last = (size * angle) >> 5;
-+
-+ if (mode >= 18) {
-+ ref = top - 1;
-+ if (angle < 0 && last < -1) {
-+ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW);
-+ for (x = last; x <= -1; x++)
-+ {
-+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
-+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
-+ }
-+ ref = (c_src_ptr_t)ref_tmp;
-+ }
-+
-+ for (y = 0; y < size; y++, src += stride) {
-+ const int idx = ((y + 1) * angle) >> 5;
-+ const int fact = ((y + 1) * angle) & 31;
-+ if (fact) {
-+ for (x = 0; x < size; ++x) {
-+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
-+ fact * ref[x + idx + 2][0] + 16) >> 5;
-+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
-+ fact * ref[x + idx + 2][1] + 16) >> 5;
-+ }
-+ } else {
-+ memcpy(src, ref + idx + 1, size * 2 * PW);
-+ }
-+ }
-+ } else {
-+ ref = left - 1;
-+ if (angle < 0 && last < -1) {
-+ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
-+ for (x = last; x <= -1; x++)
-+ {
-+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
-+ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
-+ }
-+ ref = (c_src_ptr_t)ref_tmp;
-+ }
-+
-+ for (x = 0; x < size; x++, src++) {
-+ const int idx = ((x + 1) * angle) >> 5;
-+ const int fact = ((x + 1) * angle) & 31;
-+ if (fact) {
-+ for (y = 0; y < size; y++) {
-+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
-+ fact * ref[y + idx + 2][0] + 16) >> 5;
-+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
-+ fact * ref[y + idx + 2][1] + 16) >> 5;
-+ }
-+ } else {
-+ for (y = 0; y < size; y++)
-+ {
-+ src[y * stride][0] = ref[y + idx + 1][0];
-+ src[y * stride][1] = ref[y + idx + 1][1];
-+ }
-+ }
-+ }
-+ }
-+
-+#if BIT_DEPTH == 10 && 0
-+ if (size == 16 && mode != 10 && mode != 26) {
-+ DECLARE_ALIGNED(16, uint8_t, a[64*32]);
-+// void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+ void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+
-+ src = (c_dst_ptr_t)_src;
-+ printf("C: mode=%d\n", mode);
-+ for (y = 0; y < size; y++, src += stride)
-+ {
-+ for (x = 0; x < size; x++)
-+ {
-+ printf("%3x:%3x ", src[x][0], src[x][1]);
-+ }
-+ printf("\n");
-+ }
-+
-+ memset(a, 0, sizeof(a));
-+ ff_hevc_rpi_pred_angular_c_16_neon_10(a, _top, _left, size, mode);
-+
-+ src = (c_dst_ptr_t)a;
-+ printf("A:\n");
-+ for (y = 0; y < size; y++, src += size)
-+ {
-+ for (x = 0; x < size; x++)
-+ {
-+ printf("%3x:%3x ", src[x][0], src[x][1]);
-+ }
-+ printf("\n");
-+ }
-+
-+ src = (c_dst_ptr_t)_src;
-+ for (y = 0; y < size; y++, src += stride)
-+ {
-+ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) {
-+ printf("Fail at line %d\n", y);
-+ av_assert0(0);
-+ }
-+ }
-+
-+ }
-+#endif
-+}
-+#endif
-+
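All of the angular modes above reduce to the same projection: each output pel is interpolated, with 1/32-pel precision, between two neighbouring reference pels selected by the mode's angle. A minimal standalone sketch of that core step, extracted from the loops above (the function name is illustrative, not part of the patch):

    #include <stdint.h>

    /* One row of HEVC angular prediction for a vertical-ish mode (18..34).
     * ref points one pel before the top reference row, matching the
     * "ref = top - 1" convention above; angle = intra_pred_angle[mode - 2]. */
    static void angular_row(uint8_t *dst, const uint8_t *ref,
                            int angle, int y, int size)
    {
        const int pos  = (y + 1) * angle; /* fixed point, 5 fraction bits */
        const int idx  = pos >> 5;        /* whole-pel step along ref     */
        const int fact = pos & 31;        /* 1/32-pel blend weight        */

        for (int x = 0; x < size; x++)
            dst[x] = ((32 - fact) * ref[x + idx + 1] +
                      fact * ref[x + idx + 2] + 16) >> 5;
    }

When fact is zero this degenerates to a straight copy, which is why the patch special-cases that branch with AV_WN4P/memcpy.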
-+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left,
-+ ptrdiff_t stride, int mode)
-+{
-+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
-+}
-+
-+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left,
-+ ptrdiff_t stride, int mode)
-+{
-+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
-+}
-+
-+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left,
-+ ptrdiff_t stride, int mode)
-+{
-+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
-+}
-+
-+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left,
-+ ptrdiff_t stride, int mode)
-+{
-+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
-+}
-+
-+#undef cpel
-+#undef c_src_ptr_t
-+#undef c_dst_ptr_t
-+
-+#undef EXTEND
-+#undef POS
-+#undef PW
-+
-+#undef filter_light1
-+#undef filter_light
-+#undef filter_strong
-+#undef ref_gen
-+
-+#ifndef INCLUDED_ONCE
-+#define INCLUDED_ONCE
-+#endif
-+
-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-new file mode 100644
-index 0000000000..20f218f22c
---- /dev/null
-+++ b/libavcodec/rpi_mailbox.c
-@@ -0,0 +1,107 @@
-+/*
-+Copyright (c) 2012, Broadcom Europe Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+ * Redistributions of source code must retain the above copyright
-+ notice, this list of conditions and the following disclaimer.
-+ * Redistributions in binary form must reproduce the above copyright
-+ notice, this list of conditions and the following disclaimer in the
-+ documentation and/or other materials provided with the distribution.
-+ * Neither the name of the copyright holder nor the
-+ names of its contributors may be used to endorse or promote products
-+ derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
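The mbox_property() call in the file below hands a buffer to the VideoCore mailbox property interface via /dev/vcio. The buffer layout is the standard firmware property format: a 32-bit total size, a request/response code, a run of tags, and a zero terminator. A hedged sketch of a single-tag request (the tag value 0x00010002, "get board revision", is purely an example and not something this patch sends):

    #include <stdint.h>

    /* Shape of a one-tag VideoCore mailbox property request. */
    static void build_example_request(uint32_t msg[8])
    {
        msg[0] = 8 * sizeof(uint32_t); /* total buffer size in bytes        */
        msg[1] = 0;                    /* 0 = process request               */
        msg[2] = 0x00010002;           /* tag id (example: board revision)  */
        msg[3] = 4;                    /* value buffer size in bytes        */
        msg[4] = 0;                    /* request/response indicator        */
        msg[5] = 0;                    /* value buffer, filled by firmware  */
        msg[6] = 0;                    /* end tag                           */
        msg[7] = 0;                    /* padding                           */
    }

On return the firmware rewrites msg[1] with the response code and msg[5] with the requested value.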
-+
-+#include <stdio.h>
-+#include <string.h>
-+#include <stdlib.h>
-+#include <fcntl.h>
-+#include <unistd.h>
-+#include <assert.h>
-+#include <stdint.h>
-+#include <sys/mman.h>
-+
-+#include <sys/ioctl.h>
-+
-+#define MAJOR_NUM 100
-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+#define DEVICE_FILE_NAME "/dev/vcio"
-+
-+#include "rpi_mailbox.h"
-+//#include
-+
-+/*
-+ * use ioctl to send mbox property message
-+ */
-+
-+static int mbox_property(int file_desc, void *buf)
-+{
-+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-+
-+ if (ret_val < 0) {
-+ printf("ioctl_set_msg failed:%d\n", ret_val);
-+ }
-+
-+#ifdef DEBUG
-+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
-+ for (i=0; i<size/4; i++)
-+ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
-+#endif
-+ return ret_val;
-+}
-+
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <stddef.h>
-+#include <stdint.h>
-+#include "libavutil/avassert.h"
-+
-+#include "config.h"
-+
-+#include <pthread.h>
-+#include <time.h>
-+
-+#include <interface/vcsm/user-vcsm.h>
-+
-+#include "rpi_mailbox.h"
-+#include "rpi_qpu.h"
-+#include "rpi_hevc_shader.h"
-+#include "rpi_hevc_transform8.h"
-+#include "rpi_hevc_transform10.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
-+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
-+
-+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
-+// Beware this is expensive and will probably throw off all other timing by >10%
-+#define RPI_TRACE_QPU_PROFILE_ALL 0
-+
-+// QPU "noflush" flags
-+// a mixture of flushing & profiling
-+
-+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
-+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
-+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
-+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
-+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
-+
-+#define vcos_verify_ge0(x) ((x)>=0)
-+
-+// Size in 32bit words
-+#define QPU_CODE_SIZE 4098
-+#define VPU_CODE_SIZE 16384
-+
-+static const short rpi_transMatrix2even[32][16] = { // Even rows first
-+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
-+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
-+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
-+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
-+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
-+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
-+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
-+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
-+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
-+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
-+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
-+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
-+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
-+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
-+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
-+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
-+// Odd rows
-+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
-+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
-+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
-+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
-+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
-+{78, -4,
-82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46}, -+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54}, -+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61}, -+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67}, -+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73}, -+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78}, -+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82}, -+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85}, -+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88}, -+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90}, -+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} -+}; -+ -+// Code/constants on GPU -+struct GPU -+{ -+// unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code8[VPU_CODE_SIZE]; -+ unsigned int vpu_code10[VPU_CODE_SIZE]; -+ short transMatrix2even[16*16*2]; -+}; -+ -+struct rpi_cache_flush_env_s { -+ struct vcsm_user_clean_invalid2_s v; -+}; -+ -+#define WAIT_COUNT_MAX 16 -+ -+typedef struct trace_time_one_s -+{ -+ int count; -+ int64_t start[WAIT_COUNT_MAX]; -+ int64_t total[WAIT_COUNT_MAX]; -+} trace_time_one_t; -+ -+typedef struct trace_time_wait_s -+{ -+ unsigned int jcount; -+ int64_t start0; -+ int64_t last_update; -+ trace_time_one_t active; -+ trace_time_one_t wait; -+} trace_time_wait_t; -+ -+typedef struct vq_wait_s -+{ -+ sem_t sem; -+ struct vq_wait_s * next; -+} vq_wait_t; -+ -+#define VQ_WAIT_POOL_SIZE 16 -+typedef struct vq_wait_pool_s -+{ -+ vq_wait_t * head; -+ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; -+} vq_wait_pool_t; -+ -+static void vq_wait_pool_init(vq_wait_pool_t * const pool); -+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); -+ -+typedef struct gpu_env_s -+{ -+ int open_count; -+ int init_count; -+ int mb; -+ int vpu_i_cache_flushed; -+ GPU_MEM_PTR_T qpu_code_gm_ptr; -+ GPU_MEM_PTR_T code_gm_ptr; -+ GPU_MEM_PTR_T dummy_gm_ptr; -+ vq_wait_pool_t wait_pool; -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ trace_time_wait_t ttw; -+#endif -+} gpu_env_t; -+ -+// Stop more than one thread trying to allocate memory or use the processing resources at once -+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; -+static gpu_env_t * gpu = NULL; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ -+static int64_t ns_time(void) -+{ -+ struct timespec ts; -+ clock_gettime(CLOCK_MONOTONIC, &ts); -+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; -+} -+ -+ -+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 -+ -+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) -+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) -+#define T_ARG(t) T_SEC(t), T_MS(t) -+#define T_FMT "%u.%03u" -+ -+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) -+{ -+ // Update totals for levels that are still pending -+ for (int i = 0; i < tto->count; ++i) { -+ tto->total[i] += now - tto->start[i]; -+ tto->start[i] = now; -+ } -+ -+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", -+ prefix, -+ T_ARG(now - start0 - tto->total[0]), -+ T_ARG(tto->total[0]), -+ T_ARG(tto->total[1]), -+ T_ARG(tto->total[2]), -+ T_ARG(tto->total[3])); -+} -+ -+ -+static void tto_start(trace_time_one_t * const tto, const int64_t now) -+{ -+ av_assert0(tto->count < WAIT_COUNT_MAX); -+ tto->start[tto->count++] = now; -+} -+ -+static void 
tto_end(trace_time_one_t * const tto, const int64_t now)
-+{
-+ const int n = --tto->count;
-+ av_assert0(n >= 0);
-+ tto->total[n] += now - tto->start[n];
-+}
-+
-+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
-+{
-+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
-+ tto_print(&ttw->active, now, ttw->start0, "Active");
-+ tto_print(&ttw->wait, now, ttw->start0, " Wait");
-+}
-+
-+#endif
-+
-+// GPU memory alloc fns (internal)
-+
-+static void gpu_free_internal(GPU_MEM_PTR_T * const p)
-+{
-+ if (p->arm != NULL)
-+ vcsm_unlock_ptr(p->arm);
-+ if (p->vcsm_handle != 0)
-+ vcsm_free(p->vcsm_handle);
-+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
-+}
-+
-+
-+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
-+ const int numbytes, const unsigned int cache_type, const char * const name)
-+{
-+ memset(p, 0, sizeof(*p));
-+ p->numbytes = (numbytes + 255) & ~255; // Round up
-+
-+ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
-+ (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
-+ (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
-+ (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
-+ {
-+ gpu_free_internal(p);
-+ return AVERROR(ENOMEM);
-+ }
-+ return 0;
-+}
-+
-+
-+// GPU init, free, lock, unlock
-+
-+static void gpu_term(void)
-+{
-+ gpu_env_t * const ge = gpu;
-+
-+ // We have to hope that everything has terminated...
-+ gpu = NULL;
-+
-+ vc_gpuserv_deinit();
-+
-+ gpu_free_internal(&ge->code_gm_ptr);
-+ gpu_free_internal(&ge->qpu_code_gm_ptr);
-+ gpu_free_internal(&ge->dummy_gm_ptr);
-+
-+ vcsm_exit();
-+
-+ mbox_close(ge->mb);
-+
-+ vq_wait_pool_deinit(&ge->wait_pool);
-+
-+ free(ge);
-+}
-+
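gpu_malloc_internal() above chains four vcsm calls - allocate, resolve the VideoCore handle, map into ARM address space, resolve the bus address - and any single failure unwinds through gpu_free_internal(). From a caller's point of view the pattern is simply the following (a sketch using the public wrappers declared in rpi_qpu.h; it assumes the GPU env has already been referenced, e.g. via gpu_ref()):

    #include "rpi_qpu.h"

    static int example_alloc(void)
    {
        GPU_MEM_PTR_T gm;

        if (gpu_malloc_cached(4096, &gm) != 0) /* ARM-cached, VPU-uncached */
            return -1;

        gm.arm[0] = 0x80;  /* CPU writes through the ARM mapping...        */
                           /* ...GPU code would address the same bytes via */
                           /* the bus address in gm.vc                     */
        gpu_free(&gm);
        return 0;
    }

The cached variant trades explicit cache maintenance (see the flush API further down) for fast CPU access; gpu_malloc_uncached() avoids the maintenance entirely at the cost of slow CPU reads.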
-+// Connect to QPU, returns 0 on success.
-+static int gpu_init(gpu_env_t ** const gpu) {
-+ volatile struct GPU* ptr;
-+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
-+ int rv;
-+ *gpu = NULL;
-+
-+ if (ge == NULL)
-+ return -1;
-+
-+ if ((ge->mb = mbox_open()) < 0)
-+ return -1;
-+
-+ vq_wait_pool_init(&ge->wait_pool);
-+
-+ vcsm_init();
-+
-+ // Now copy over the QPU code into GPU memory
-+ if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
-+ return rv;
-+
-+ {
-+ int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
-+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
-+ memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
-+ }
-+
-+ // And the VPU code
-+ if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
-+ return rv;
-+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
-+
-+ // Zero everything so we have zeros between the code bits
-+ memset((void *)ptr, 0, sizeof(*ptr));
-+ {
-+ int num_bytes = sizeof(rpi_hevc_transform8);
-+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
-+ }
-+ {
-+ int num_bytes = sizeof(rpi_hevc_transform10);
-+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
-+ }
-+ // And the transform coefficients
-+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+
-+ // Generate a dummy "frame" & fill with 0x80
-+ // * Could reset to 1 << bit_depth - 1 but not worth the effort
-+ if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
-+ return rv;
-+ memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
-+
-+ *gpu = ge;
-+ return 0;
-+}
-+
-+
-+
-+static void gpu_unlock(void) {
-+ pthread_mutex_unlock(&gpu_mutex);
-+}
-+
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static gpu_env_t * gpu_lock(void) {
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ av_assert1(gpu != NULL);
-+ return gpu;
-+}
-+
-+static gpu_env_t * gpu_lock_ref(void)
-+{
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ if (gpu == NULL) {
-+ int rv = gpu_init(&gpu);
-+ if (rv != 0) {
-+ gpu_unlock();
-+ return NULL;
-+ }
-+ }
-+
-+ ++gpu->open_count;
-+ return gpu;
-+}
-+
-+static void gpu_unlock_unref(gpu_env_t * const ge)
-+{
-+ if (--ge->open_count == 0)
-+ gpu_term();
-+
-+ gpu_unlock();
-+}
-+
-+static inline gpu_env_t * gpu_ptr(void)
-+{
-+ av_assert1(gpu != NULL);
-+ return gpu;
-+}
-+
-+// Public gpu fns
-+
-+// Allocate memory on GPU
-+// Fills in structure containing ARM pointer, videocore handle, videocore memory address, numbytes
-+// Returns 0 on success.
-+// This allocates memory that will not be cached in ARM's data cache.
-+// Therefore safe to use without data cache flushing.
-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached");
-+}
-+
-+// This allocates data that will be
-+// Cached in ARM L2
-+// Uncached in VPU L2
-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached");
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T * const p) {
-+ gpu_free_internal(p);
-+}
-+
-+unsigned int vpu_get_fn(const unsigned int bit_depth) {
-+ uint32_t a = 0;
-+
-+ // Make sure that the gpu is initialized
-+ av_assert1(gpu != NULL);
-+ switch (bit_depth){
-+ case 8:
-+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
-+ break;
-+ case 10:
-+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
-+ break;
-+ default:
-+ av_assert0(0);
-+ }
-+ return a;
-+}
-+
-+unsigned int vpu_get_constants(void) {
-+ av_assert1(gpu != NULL);
-+ return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even));
-+}
-+
-+int gpu_get_mailbox(void)
-+{
-+ av_assert1(gpu);
-+ return gpu->mb;
-+}
-+
-+void gpu_ref(void)
-+{
-+ gpu_lock_ref();
-+ gpu_unlock();
-+}
-+
-+void gpu_unref(void)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+ gpu_unlock_unref(ge);
-+}
-+
-+// ----------------------------------------------------------------------------
-+//
-+// Cache flush functions
-+
-+#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
-+{
-+ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf;
-+ rfe->v.op_count = 0;
-+ return rfe;
-+}
-+
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
-+{
-+ // Nothing needed
-+}
-+
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
-+{
-+ int rc = 0;
-+ if (rfe->v.op_count != 0) {
-+ if (vcsm_clean_invalid2(&rfe->v) != 0)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno);
-+ rc = -1;
-+ }
-+ rfe->v.op_count = 0;
-+ }
-+ return rc;
-+}
-+
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
-+{
-+ int rc = rpi_cache_flush_execute(rfe);
-+
-+ return rc;
-+}
-+
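The flush API accumulates block descriptors into a single vcsm_user_clean_invalid2_s and issues them in one ioctl, so several ranges can be cleaned or invalidated per kernel round trip. The add_* functions that follow feed that accumulator; a typical caller sequence, mirroring rpi_cache_flush_one_gm_ptr() further down in this file, is:

    /* Writeback + invalidate one whole allocation in a single batched call. */
    static void example_flush(const GPU_MEM_PTR_T * const gm)
    {
        rpi_cache_buf_t buf;
        rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&buf);

        rpi_cache_flush_add_gm_ptr(rfe, gm, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
        rpi_cache_flush_finish(rfe); /* executes the accumulated ops */
    }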
-+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
-+{
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+
-+ av_assert1(rfe->v.op_count <= CACHE_EL_MAX);
-+
-+ b->invalidate_mode = mode;
-+ b->block_count = blocks;
-+ b->start_address = gm->arm + offset0;
-+ b->block_size = block_size;
-+ b->inter_block_stride = block_stride;
-+}
-+
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset, const unsigned int size)
-+{
-+ // Deal with empty pointer trivially
-+ if (gm == NULL || size == 0)
-+ return;
-+
-+ av_assert1(offset <= gm->numbytes);
-+ av_assert1(size <= gm->numbytes);
-+ av_assert1(offset + size <= gm->numbytes);
-+
-+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
-+}
-+
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
-+{
-+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
-+}
-+
-+
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
-+{
-+#if !RPI_ONE_BUF
-+#error Fixme! (NIF)
-+#endif
-+ if (gpu_is_buf1(frame)) {
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
-+ }
-+ else
-+ {
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
-+ }
-+}
-+
-+// Flush an area of a frame
-+// Width, height, x0, y0 in luma pels
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
-+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+ const unsigned int uv_shift, const int do_luma, const int do_chroma)
-+{
-+ const unsigned int y_offset = frame->linesize[0] * y0;
-+ const unsigned int y_size = frame->linesize[0] * height;
-+ // Round UV up/down to get everything
-+ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
-+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
-+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
-+
-+#if 0
-+ // *** frame->height is cropped height so not good
-+ // As all unsigned they will also reject -ve
-+ // Test individually as well as added to reject overflow
-+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
-+ av_assert0(n <= (unsigned int)frame->height);
-+ av_assert0(start_line + n <= (unsigned int)frame->height);
-+#endif
-+
-+ if (!gpu_is_buf1(frame))
-+ {
-+ if (do_luma) {
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
-+ }
-+ if (do_chroma) {
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
-+ }
-+ }
-+ else if (!av_rpi_is_sand_frame(frame))
-+ {
-+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
-+ if (do_luma) {
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
-+ }
-+ if (do_chroma) {
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
-+ }
-+ }
-+ else
-+ {
-+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
-+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
-+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
-+ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
-+
-+ if (do_chroma)
-+ {
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+ b->invalidate_mode = mode;
-+ b->block_count = block_count;
-+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
-+ b->block_size = uv_size;
-+ b->inter_block_stride = stride1 * stride2;
-+ }
-+ if (do_luma)
-+ {
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+ b->invalidate_mode = mode;
-+ b->block_count = block_count;
-+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
-+ b->block_size = y_size;
-+ b->inter_block_stride = stride1 * stride2;
-+ }
-+ }
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
-+{
-+ rpi_cache_buf_t cbuf;
-+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
-+ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
-+ rpi_cache_flush_finish(rfe);
-+}
-+
-+
-+// ----------------------------------------------------------------------------
-+
-+
-+// Wait abstractions - mostly so we can easily add profile code
-+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
-+{
-+ unsigned int i;
-+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+ sem_init(&wp->pool[i].sem, 0, 0);
-+ wp->pool[i].next = wp->pool + i + 1;
-+ }
-+ wp->head = wp->pool + 0;
-+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
-+}
-+
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
-+{
-+ unsigned int i;
-+ wp->head = NULL;
-+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+ sem_destroy(&wp->pool[i].sem);
-+ wp->pool[i].next = NULL;
-+ }
-+}
-+
-+
-+// If sem_init actually takes time then maybe we want a pool...
-+static vq_wait_t * vq_wait_new(void)
-+{
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ vq_wait_t * const wait = ge->wait_pool.head;
-+ ge->wait_pool.head = wait->next;
-+ wait->next = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ tto_start(&ge->ttw.active, ns_time());
-+#endif
-+
-+ gpu_unlock();
-+ return wait;
-+}
-+
-+static void vq_wait_delete(vq_wait_t * const wait)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+ wait->next = ge->wait_pool.head;
-+ ge->wait_pool.head = wait;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ trace_time_wait_t * const ttw = &ge->ttw;
-+ const int64_t now = ns_time();
-+ ++ttw->jcount;
-+ tto_end(&ttw->wait, now);
-+
-+ if (ttw->start0 == 0)
-+ {
-+ ttw->start0 = ttw->active.start[0];
-+ ttw->last_update = ttw->start0;
-+ }
-+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
-+ {
-+ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
-+ ttw_print(ttw, now);
-+ }
-+ }
-+#endif
-+ gpu_unlock_unref(ge);
-+}
-+
-+static void vq_wait_wait(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ const int64_t now = ns_time();
-+ gpu_env_t * const ge = gpu_lock();
-+ tto_start(&ge->ttw.wait, now);
-+ gpu_unlock();
-+ }
-+#endif
-+
-+ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
-+ /* loop */;
-+}
-+
-+static void vq_wait_post(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ gpu_env_t *const ge = gpu_lock();
-+ tto_end(&ge->ttw.active, ns_time());
-+ gpu_unlock();
-+ }
-+#endif
-+
-+ sem_post(&wait->sem);
-+}
-+
-+
-+
-+// Header comments were wrong for these two
-+#define VPU_QPU_MASK_QPU 1
-+#define VPU_QPU_MASK_VPU 2
-+
-+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
-+
-+vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf)
-+{
-+// vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
-+ vpu_qpu_job_env_t * vqj = buf;
-+// memset(vqj, 0, sizeof(*vqj));
-+ vqj->n = 0;
-+ vqj->mask = 0;
-+ return vqj;
-+}
-+
-+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
-+{
-+// memset(vqj, 0, sizeof(*vqj));
-+// free(vqj);
-+}
-+
-+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
-+{
-+ struct gpu_job_s * const j = vqj->j + vqj->n++;
-+ av_assert1(vqj->n <= VPU_QPU_JOB_MAX);
-+ return j;
-+}
-+
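The job API that follows batches VPU and QPU work into one gpu_job_s array and can append a sync entry whose callback posts a semaphore. A typical submission, per the functions below, looks roughly like this (a sketch; it assumes vpu_qpu_init() has already been called and that vpu_code/r0 are valid VPU arguments):

    static int example_submit(uint32_t vpu_code, unsigned int r0)
    {
        vpu_qpu_job_env_t qvbuf;
        vpu_qpu_wait_h sync;
        const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);

        vpu_qpu_job_add_vpu(vqj, vpu_code, r0, 0, 0, 0, 0, 0);
        vpu_qpu_job_add_sync_this(vqj, &sync); /* callback -> sem post */

        if (vpu_qpu_job_finish(vqj) != 0)      /* start + delete       */
            return -1;

        vpu_qpu_wait(&sync);                   /* block until complete */
        return 0;
    }

Note the collapse optimisation in add_sync_this/add_sync_sem below: when only one job (or only QPU jobs) is queued, the sync callback is attached directly to the last job instead of spending a fourth slot on an EXECUTE_SYNC entry.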
-+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
-+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
-+{
-+ if (vpu_code != 0) {
-+ struct gpu_job_s *const j = new_job(vqj);
-+ vqj->mask |= VPU_QPU_MASK_VPU;
-+
-+ j->command = EXECUTE_VPU;
-+ j->callback.func = 0;
-+ j->callback.cookie = NULL;
-+ // The bottom two bits of the execute address contain no-flush flags
-+ // b0 will flush the VPU I-cache if unset so we nearly always want that set
-+ // as we never reload code
-+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
-+ j->u.v.q[1] = r0;
-+ j->u.v.q[2] = r1;
-+ j->u.v.q[3] = r2;
-+ j->u.v.q[4] = r3;
-+ j->u.v.q[5] = r4;
-+ j->u.v.q[6] = r5;
-+ gpu->vpu_i_cache_flushed = 1;
-+ }
-+}
-+
-+// flags are QPU_FLAGS_xxx
-+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
-+{
-+ if (n != 0) {
-+ struct gpu_job_s *const j = new_job(vqj);
-+ vqj->mask |= VPU_QPU_MASK_QPU;
-+
-+ j->command = EXECUTE_QPU;
-+ j->callback.func = 0;
-+ j->callback.cookie = NULL;
-+
-+ j->u.q.jobs = n;
-+#if RPI_TRACE_QPU_PROFILE_ALL
-+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
-+#else
-+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
-+#endif
-+ j->u.q.timeout = 5000;
-+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+ }
-+}
-+
-+// Convert callback to sem post
-+static void vpu_qpu_job_callback_wait(void * v)
-+{
-+ vq_wait_post(v);
-+}
-+
-+// Poke a user-supplied sem
-+static void vpu_qpu_job_callback_sem(void * v)
-+{
-+ sem_post((sem_t *)v);
-+}
-+
-+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
-+{
-+ vq_wait_t * wait;
-+
-+ if (vqj->mask == 0) {
-+ *wait_h = NULL;
-+ return;
-+ }
-+
-+ // We are going to want a sync object
-+ wait = vq_wait_new();
-+
-+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
-+ // If we only posted one thing or only QPU jobs
-+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
-+ {
-+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
-+ av_assert1(j->callback.func == 0);
-+
-+ j->callback.func = vpu_qpu_job_callback_wait;
-+ j->callback.cookie = wait;
-+ }
-+ else
-+ {
-+ struct gpu_job_s *const j = new_job(vqj);
-+
-+ j->command = EXECUTE_SYNC;
-+ j->u.s.mask = vqj->mask;
-+ j->callback.func = vpu_qpu_job_callback_wait;
-+ j->callback.cookie = wait;
-+ }
-+
-+ vqj->mask = 0;
-+ *wait_h = wait;
-+}
-+
-+// Returns 0 if no sync added ('cos Q empty), 1 if sync added
-+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem)
-+{
-+ // If nothing on q then just return
-+ if (vqj->mask == 0)
-+ return 0;
-+
-+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
-+ // If we only posted one thing or only QPU jobs
-+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
-+ {
-+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
-+ av_assert1(j->callback.func == 0);
-+
-+ j->callback.func = vpu_qpu_job_callback_sem;
-+ j->callback.cookie = sem;
-+ }
-+ else
-+ {
-+ struct gpu_job_s *const j = new_job(vqj);
-+
-+ j->command = EXECUTE_SYNC;
-+ j->u.s.mask = vqj->mask;
-+ j->callback.func = vpu_qpu_job_callback_sem;
-+ j->callback.cookie = sem;
-+ }
-+
-+ vqj->mask = 0;
-+ return 1;
-+}
-+
-+
-+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
-+{
-+ if (vqj->n == 0)
-+ return 0;
-+
-+ return vc_gpuserv_execute_code(vqj->n, vqj->j);
-+}
-+
-+// Simple wrapper of start + delete
-+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
-+{
-+ int rv;
-+ rv = vpu_qpu_job_start(vqj);
-+ vpu_qpu_job_delete(vqj);
-+ return rv;
-+}
-+
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
-+{
-+ if (wait_h != NULL)
-+ {
-+ vq_wait_t * const wait = *wait_h;
-+ if (wait != NULL) {
-+ *wait_h = NULL;
-+ vq_wait_wait(wait);
-+ vq_wait_delete(wait);
-+ }
-+ }
-+}
-+
-+int vpu_qpu_init()
-+{
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ if (ge == NULL)
-+ return -1;
-+
-+ if (ge->init_count++ == 0)
-+ {
-+ vc_gpuserv_init();
-+ }
-+
-+ gpu_unlock();
-+ return 0;
-+}
-+
-+void vpu_qpu_term()
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+
-+ if (--ge->init_count == 0) {
-+ vc_gpuserv_deinit();
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ ttw_print(&ge->ttw, ns_time());
-+#endif
-+ }
-+
-+ gpu_unlock_unref(ge);
-+}
-+
-+uint32_t qpu_fn(const int * const mc_fn)
-+{
-+ return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader);
-+}
-+
-+uint32_t qpu_dummy(void)
-+{
-+ return gpu->dummy_gm_ptr.vc;
-+}
-+
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
-+{
-+ // Dummy values we can catch with emulation
-+ qf->y_pxx = ~1U;
-+ qf->y_bxx = ~2U;
-+ qf->y_p00 = ~3U;
-+ qf->y_b00 = ~4U;
-+ qf->c_pxx = ~5U;
-+ qf->c_bxx = ~6U;
-+
-+ switch (bit_depth) {
-+ case 8:
-+ qf->y_pxx = qpu_fn(mc_filter_y_pxx);
-+ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
-+ qf->y_p00 = qpu_fn(mc_filter_y_p00);
-+ qf->y_b00 = qpu_fn(mc_filter_y_b00);
-+ qf->c_pxx = qpu_fn(mc_filter_c_p);
-+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
-+ qf->c_bxx = qpu_fn(mc_filter_c_b);
-+ break;
-+ case 10:
-+ qf->c_pxx = qpu_fn(mc_filter_c10_p);
-+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
-+ qf->c_bxx = qpu_fn(mc_filter_c10_b);
-+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
-+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
-+ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
-+ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
-+ break;
-+ default:
-+ return -1;
-+ }
-+ return 0;
-+}
-+
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-new file mode 100644
-index 0000000000..e1b4d9c39e
---- /dev/null
-+++ b/libavcodec/rpi_qpu.h
-@@ -0,0 +1,229 @@
-+#ifndef RPI_QPU_H
-+#define RPI_QPU_H
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
-+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
-+#pragma GCC diagnostic pop
-+
-+
-+#define RPI_ONE_BUF 1
-+
-+typedef struct gpu_mem_ptr_s {
-+ unsigned char *arm; // Pointer to memory mapped on ARM side
-+ int vc_handle; // Videocore handle of relocatable memory
-+ int vcsm_handle; // Handle for use by VCSM
-+ int vc; // Address for use in GPU code
-+ int numbytes; // Size of memory block
-+} GPU_MEM_PTR_T;
-+
-+// General GPU functions
-+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T * const p);
-+
-+#include "libavutil/frame.h"
-+#if !RPI_ONE_BUF
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
-+ return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+ return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = 
av_buffer_pool_opaque(frame->buf[2]); -+ return p->vc; -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]); -+} -+ -+#else -+ -+static inline int gpu_is_buf1(const AVFrame * const frame) -+{ -+ return frame->buf[1] == NULL; -+} -+ -+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame) -+{ -+ return av_buffer_get_opaque(frame->buf[0]); -+} -+ -+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) -+{ -+ return av_buffer_pool_opaque(frame->buf[n]); -+} -+ -+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) -+{ -+ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); -+ return gm->vc + (frame->data[n] - gm->arm); -+} -+ -+ -+static inline uint32_t get_vc_address_y(const AVFrame * const frame) { -+ return get_vc_address3(frame, 0); -+} -+ -+static inline uint32_t get_vc_address_u(const AVFrame * const frame) { -+ return get_vc_address3(frame, 1); -+} -+ -+static inline uint32_t get_vc_address_v(const AVFrame * const frame) { -+ return get_vc_address3(frame, 2); -+} -+ -+#if 0 -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.numbytes = frame->data[1] - frame->data[0]; -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 0); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.arm += frame->data[1] - frame->data[0]; -+ g.vc += frame->data[1] - frame->data[0]; -+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 1); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.arm += frame->data[2] - frame->data[0]; -+ g.vc += frame->data[2] - frame->data[0]; -+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 2); -+} -+#endif -+#endif -+ -+// Cache flush stuff -+ -+struct rpi_cache_flush_env_s; -+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; -+ -+typedef struct {uint32_t t[33];} rpi_cache_buf_t; -+ -+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf); -+// Free env without flushing -+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); -+// Do the accumulated flush & clear but do not free the env -+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe); -+// Do the accumulated flush & free the env -+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); -+ -+typedef enum -+{ -+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, -+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, -+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 -+} rpi_cache_flush_mode_t; -+ -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const 
rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, -+ const unsigned int offset, const unsigned int size); -+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); -+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, -+ const unsigned int uv_shift, const int do_luma, const int do_chroma); -+ -+// init, add, finish for one gm ptr -+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); -+ -+ -+// QPU specific functions -+ -+typedef struct HEVCRpiQpu { -+ uint32_t c_pxx; -+ uint32_t c_pxx_l1; -+ uint32_t c_bxx; -+ uint32_t y_pxx; -+ uint32_t y_bxx; -+ uint32_t y_p00; -+ uint32_t y_b00; -+} HEVCRpiQpu; -+ -+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); -+ -+uint32_t qpu_fn(const int * const mc_fn); -+uint32_t qpu_dummy(void); -+ -+#define QPU_N_GRP 4 -+#define QPU_N_MAX 12 -+ -+#define QPU_MAIL_EL_VALS 2 -+ -+struct vpu_qpu_wait_s; -+typedef struct vq_wait_s * vpu_qpu_wait_h; -+ -+// VPU specific functions -+ -+struct vpu_qpu_job_env_s; -+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; -+ -+#define VPU_QPU_JOB_MAX 4 -+struct vpu_qpu_job_env_s -+{ -+ unsigned int n; -+ unsigned int mask; -+ struct gpu_job_s j[VPU_QPU_JOB_MAX]; -+}; -+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; -+ -+vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf); -+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); -+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, -+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); -+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); -+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem); -+int vpu_qpu_job_start(const vpu_qpu_job_h vqj); -+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); -+ -+extern unsigned int vpu_get_fn(const unsigned int bit_depth); -+extern unsigned int vpu_get_constants(void); -+ -+// Waits for previously posted code to complete and will null out *wait_h after use -+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+int vpu_qpu_init(void); -+void vpu_qpu_term(void); -+ -+extern int gpu_get_mailbox(void); -+void gpu_ref(void); -+void gpu_unref(void); -+ -+#endif -diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c -new file mode 100644 -index 0000000000..185288da5a ---- /dev/null -+++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,741 @@ -+#include "libavcodec/avcodec.h" -+#include "rpi_qpu.h" -+#include "rpi_mailbox.h" -+#include "rpi_zc.h" -+#include "libavutil/avassert.h" -+#include "libavutil/rpi_sand_fns.h" -+#include <pthread.h> -+ -+#include "libavutil/buffer_internal.h" -+#include <interface/vctypes/vc_image_types.h> // VC_IMAGE_T -+ -+#define TRACE_ALLOC 0 -+ -+struct ZcPoolEnt; -+ -+typedef struct ZcPool -+{ -+ int numbytes; -+ unsigned int n; -+ struct ZcPoolEnt * head; -+ pthread_mutex_t lock; -+} ZcPool; -+ -+typedef struct ZcPoolEnt -+{ -+ // It is
important that we start with gmem as other bits of code will expect to see that -+ GPU_MEM_PTR_T gmem; -+ unsigned int n; -+ struct ZcPoolEnt * next; -+ struct ZcPool * pool; -+} ZcPoolEnt; -+ -+#define ALLOC_PAD 0 -+#define ALLOC_ROUND 0x1000 -+#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 64 -+#define STRIDE_OR 0 -+ -+#define DEBUG_ZAP0_BUFFERS 0 -+ -+ -+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size) -+{ -+ ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt)); -+ -+ // Round up to 4k & add 4k -+ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); -+ -+ if (zp == NULL) { -+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); -+ goto fail0; -+ } -+ -+ if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); -+ goto fail1; -+ } -+ -+#if TRACE_ALLOC -+ printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); -+#endif -+ -+ pool->numbytes = zp->gmem.numbytes; -+ zp->next = NULL; -+ zp->pool = pool; -+ zp->n = pool->n++; -+ return zp; -+ -+fail1: -+ av_free(zp); -+fail0: -+ return NULL; -+} -+ -+static void zc_pool_ent_free(ZcPoolEnt * const zp) -+{ -+#if TRACE_ALLOC -+ printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); -+#endif -+ -+ gpu_free(&zp->gmem); -+ av_free(zp); -+} -+ -+static void zc_pool_flush(ZcPool * const pool) -+{ -+ ZcPoolEnt * p = pool->head; -+ pool->head = NULL; -+ pool->numbytes = -1; -+ -+ while (p != NULL) -+ { -+ ZcPoolEnt * const zp = p; -+ p = p->next; -+ zc_pool_ent_free(zp); -+ } -+} -+ -+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes) -+{ -+ ZcPoolEnt * zp; -+ int numbytes; -+ -+ pthread_mutex_lock(&pool->lock); -+ -+ numbytes = pool->numbytes; -+ -+ // If size isn't close then dump the pool -+ // Close in this context means within 128k -+ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) -+ { -+ zc_pool_flush(pool); -+ numbytes = req_bytes; -+ } -+ -+ if (pool->head != NULL) -+ { -+ zp = pool->head; -+ pool->head = zp->next; -+ } -+ else -+ { -+ zp = zc_pool_ent_alloc(pool, numbytes); -+ } -+ -+ pthread_mutex_unlock(&pool->lock); -+ -+ // Start with our buffer empty of preconceptions -+// rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ -+ return zp; -+} -+ -+static void zc_pool_free(ZcPoolEnt * const zp) -+{ -+ ZcPool * const pool = zp == NULL ? 
NULL : zp->pool; -+ if (zp != NULL) -+ { -+ pthread_mutex_lock(&pool->lock); -+#if TRACE_ALLOC -+ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes); -+#endif -+ -+ if (pool->numbytes == zp->gmem.numbytes) -+ { -+ zp->next = pool->head; -+ pool->head = zp; -+ pthread_mutex_unlock(&pool->lock); -+ } -+ else -+ { -+ pthread_mutex_unlock(&pool->lock); -+ zc_pool_ent_free(zp); -+ } -+ } -+} -+ -+static void -+zc_pool_init(ZcPool * const pool) -+{ -+ pool->numbytes = -1; -+ pool->head = NULL; -+ pthread_mutex_init(&pool->lock, NULL); -+} -+ -+static void -+zc_pool_destroy(ZcPool * const pool) -+{ -+ pool->numbytes = -1; -+ zc_pool_flush(pool); -+ pthread_mutex_destroy(&pool->lock); -+} -+ -+typedef struct ZcOldCtxVals -+{ -+ int thread_safe_callbacks; -+ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); -+ void * get_buffer_context; -+} ZcOldCtxVals; -+ -+typedef struct AVZcEnv -+{ -+ unsigned int refcount; -+ ZcPool pool; -+ ZcOldCtxVals old; -+} ZcEnv; -+ -+// Callback when buffer unrefed to zero -+static void rpi_free_display_buffer(void *opaque, uint8_t *data) -+{ -+ ZcPoolEnt *const zp = opaque; -+// printf("%s: data=%p\n", __func__, data); -+ zc_pool_free(zp); -+} -+ -+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf) -+{ -+ // Kludge where we check the free fn to check this is really -+ // one of our buffers - can't think of a better way -+ return buf == NULL || buf->buffer->free != rpi_free_display_buffer ? NULL : -+ av_buffer_get_opaque(buf); -+} -+ -+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const int format, const unsigned int video_width, const unsigned int video_height) -+{ -+ AVRpiZcFrameGeometry geo; -+ -+ switch (format) -+ { -+ case AV_PIX_FMT_YUV420P: -+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ geo.stride_c = geo.stride_y / 2; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; -+ geo.planes_c = 2; -+ geo.stripes = 1; -+ geo.bytes_per_pel = 1; -+ break; -+ -+ case AV_PIX_FMT_YUV420P10: -+ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ geo.stride_c = geo.stride_y / 2; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; -+ geo.planes_c = 2; -+ geo.stripes = 1; -+ geo.bytes_per_pel = 2; -+ break; -+ -+ case AV_PIX_FMT_SAND128: -+ { -+ const unsigned int stripe_w = 128; -+ -+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; -+ static VC_IMAGE_T img = {0}; -+ -+ // Given the overhead of calling the mailbox keep a stashed -+ // copy as we will almost certainly just want the same numbers again -+ // but that means we need a lock -+ pthread_mutex_lock(&sand_lock); -+ -+ if (img.width != video_width || img.height != video_height) -+ { -+ VC_IMAGE_T new_img = { -+ .type = VC_IMAGE_YUV_UV, -+ .width = video_width, -+ .height = video_height -+ }; -+ -+ gpu_ref(); -+ mbox_get_image_params(gpu_get_mailbox(), &new_img); -+ gpu_unref(); -+ img = new_img; -+ } -+ -+ geo.stride_y = stripe_w; -+ geo.stride_c = stripe_w; -+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; -+ geo.height_c = img.pitch / stripe_w - geo.height_y; -+ geo.planes_c = 1; -+ geo.stripes = (video_width + stripe_w - 1) / stripe_w; -+ geo.bytes_per_pel = 1; -+ -+ pthread_mutex_unlock(&sand_lock); -+ -+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); -+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); 
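-+ // Worked example (illustrative only - the mailbox reply above is authoritative): -+ // for a 1920x1080 SAND128 frame, stride_y = stride_c = 128 bytes and -+ // stripes = (1920 + 127) / 128 = 15, while height_y / height_c are the -+ // VPU-padded luma / chroma heights recovered from the U-plane offset and -+ // the overall pitch - hence the sanity asserts above.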
-+ break; -+ } -+ -+ case AV_PIX_FMT_SAND64_16: -+ case AV_PIX_FMT_SAND64_10: -+ { -+ const unsigned int stripe_w = 128; // bytes -+ -+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; -+ static VC_IMAGE_T img = {0}; -+ -+ // Given the overhead of calling the mailbox keep a stashed -+ // copy as we will almost certainly just want the same numbers again -+ // but that means we need a lock -+ pthread_mutex_lock(&sand_lock); -+ -+ if (img.width != video_width || img.height != video_height) -+ { -+ VC_IMAGE_T new_img = { -+ .type = VC_IMAGE_YUV_UV_16, -+ .width = video_width, -+ .height = video_height -+ }; -+ -+ gpu_ref(); -+ mbox_get_image_params(gpu_get_mailbox(), &new_img); -+ gpu_unref(); -+ img = new_img; -+ } -+ -+ geo.stride_y = stripe_w; -+ geo.stride_c = stripe_w; -+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; -+ geo.height_c = img.pitch / stripe_w - geo.height_y; -+ geo.planes_c = 1; -+ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; -+ geo.bytes_per_pel = 2; -+ -+ pthread_mutex_unlock(&sand_lock); -+ break; -+ } -+ -+ default: -+ memset(&geo, 0, sizeof(geo)); -+ break; -+ } -+ return geo; -+} -+ -+ -+static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size) -+{ -+ ZcPoolEnt *const zp = zc_pool_alloc(pool, size); -+ AVBufferRef * buf; -+ intptr_t idata; -+#if ALLOC_N_OFFSET != 0 -+ intptr_t noff; -+#endif -+ -+ // Check the pool alloc result before touching zp->gmem -+ if (zp == NULL) { -+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); -+ goto fail0; -+ } -+ -+ idata = (intptr_t)zp->gmem.arm; -+#if ALLOC_N_OFFSET != 0 -+ noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1); -+ idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0); -+#endif -+ -+#if DEBUG_ZAP0_BUFFERS -+ memset((void*)idata, 0, size); -+#endif -+ -+ if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n"); -+ goto fail2; -+ } -+ -+ return buf; -+ -+fail2: -+ zc_pool_free(zp); -+fail0: -+ return NULL; -+} -+ -+static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame) -+{ -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); -+ const unsigned int size_y = geo.stride_y * geo.height_y; -+ const unsigned int size_c = geo.stride_c * geo.height_c; -+ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; -+ AVBufferRef * buf; -+ unsigned int i; -+ -+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic); -+ -+ if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); -+ return AVERROR(ENOMEM); -+ } -+ -+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) { -+ frame->buf[i] = NULL; -+ frame->data[i] = NULL; -+ frame->linesize[i] = 0; -+ } -+ -+ frame->buf[0] = buf; -+ -+ frame->linesize[0] = geo.stride_y; -+ frame->linesize[1] = geo.stride_c; -+ frame->linesize[2] = geo.stride_c; -+ // abuse: linesize[3] = "stripe stride" -+ // stripe_stride is NOT the stride between slices; it is (that / geo.stride_y).
-+ // In a general case this makes the calculation an xor and multiply rather -+ // than a divide and multiply -+ if (geo.stripes > 1) -+ frame->linesize[3] = geo.height_y + geo.height_c; -+ -+ frame->data[0] = buf->data; -+ frame->data[1] = frame->data[0] + size_y; -+ if (geo.planes_c > 1) -+ frame->data[2] = frame->data[1] + size_c; -+ -+ frame->extended_data = frame->data; -+ // Leave extended buf alone -+ -+#if RPI_ZC_SAND_8_IN_10_BUF != 0 -+ // *** If we intend to use this for real we will want a 2nd buffer pool -+ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge -+#endif -+ -+ return 0; -+} -+ -+#define RPI_GET_BUFFER2 1 -+ -+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) -+{ -+#if !RPI_GET_BUFFER2 -+ return avcodec_default_get_buffer2(s, frame, flags); -+#else -+ int rv; -+ -+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) -+ { -+// printf("Do default alloc: format=%#x\n", frame->format); -+ rv = avcodec_default_get_buffer2(s, frame, flags); -+ } -+ else if (frame->format == AV_PIX_FMT_YUV420P || -+ av_rpi_is_sand_frame(frame)) -+ { -+ rv = rpi_get_display_buffer(s->get_buffer_context, frame); -+ } -+ else -+ { -+ rv = avcodec_default_get_buffer2(s, frame, flags); -+ } -+ -+#if 0 -+ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, -+ frame->format, frame->width, frame->height, -+ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], -+ frame->data[0], frame->data[1], frame->data[2], -+ frame->buf[0], frame->buf[1], frame->buf[2], -+ av_buffer_get_opaque(frame->buf[0])); -+#endif -+ return rv; -+#endif -+} -+ -+ -+static AVBufferRef * zc_copy(struct AVCodecContext * const s, -+ const AVFrame * const src) -+{ -+ AVFrame dest_frame; -+ AVFrame * const dest = &dest_frame; -+ unsigned int i; -+ uint8_t * psrc, * pdest; -+ -+ dest->format = src->format; -+ dest->width = src->width; -+ dest->height = src->height; -+ -+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) -+ { -+ return NULL; -+ } -+ -+ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; -+ i != dest->height; -+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) -+ { -+ memcpy(pdest, psrc, dest->width); -+ } -+ for (i = 0, psrc = src->data[1], pdest = dest->data[1]; -+ i != dest->height / 2; -+ ++i, psrc += src->linesize[1], pdest += dest->linesize[1]) -+ { -+ memcpy(pdest, psrc, dest->width / 2); -+ } -+ for (i = 0, psrc = src->data[2], pdest = dest->data[2]; -+ i != dest->height / 2; -+ ++i, psrc += src->linesize[2], pdest += dest->linesize[2]) -+ { -+ memcpy(pdest, psrc, dest->width / 2); -+ } -+ -+ return dest->buf[0]; -+} -+ -+ -+static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s, -+ const AVFrame * const src) -+{ -+ AVFrame dest_frame; -+ AVFrame * const dest = &dest_frame; -+ unsigned int i; -+ uint8_t * psrc, * psrc2, * pdest; -+ -+ memset(dest, 0, sizeof(*dest)); -+ dest->format = AV_PIX_FMT_SAND128; -+ dest->width = src->width; -+ dest->height = src->height; -+ -+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) -+ { -+ return NULL; -+ } -+ -+ // Y -+ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; -+ i != dest->height; -+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) -+ { -+ uint16_t * s = (uint16_t*)psrc; -+ uint8_t * d = pdest; -+ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0]) -+ { -+ const unsigned int n = FFMIN(dest->linesize[0], 
dest->width - k); -+ for (unsigned int j = 0; j != n; ++j) -+ *d++ = (uint8_t)(*s++ >> 2); -+ d += (dest->linesize[3] - 1) * dest->linesize[0]; -+ } -+ } -+ -+ // C -+ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1]; -+ i != dest->height / 2; -+ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1]) -+ { -+ const uint16_t * su = (uint16_t*)psrc; -+ const uint16_t * sv = (uint16_t*)psrc2; -+ uint8_t * d = pdest; -+ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1]) -+ { -+ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2; -+ for (unsigned int j = 0; j != n; ++j) -+ { -+ *d++ = (uint8_t)(*su++ >> 2); -+ *d++ = (uint8_t)(*sv++ >> 2); -+ } -+ d += (dest->linesize[3] - 1) * dest->linesize[1]; -+ } -+ } -+ -+ return dest->buf[0]; -+} -+ -+ -+static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s, -+ const AVFrame * const src, const unsigned int src_bits) -+{ -+ AVFrame dest_frame = { -+ .format = AV_PIX_FMT_SAND128, -+ .width = src->width, -+ .height = src->height -+ }; -+ AVFrame * const dest = &dest_frame; -+ const unsigned int shr = src_bits - 8; -+ -+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) -+ { -+ return NULL; -+ } -+ -+ // Y -+ av_rpi_sand16_to_sand8(dest->data[0], dest->linesize[0], av_rpi_sand_frame_stride2(dest), -+ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest), -+ src->width, src->height, shr); -+ // C -+ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest), -+ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest), -+ src->width, src->height / 2, shr); -+ -+ return dest->buf[0]; -+} -+ -+ -+ -+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) -+{ -+ assert(s != NULL); -+ -+ if (frame->format != AV_PIX_FMT_YUV420P && -+ frame->format != AV_PIX_FMT_YUV420P10 && -+ !av_rpi_is_sand_frame(frame)) -+ { -+ av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); -+ return NULL; -+ } -+ -+ if (frame->buf[1] != NULL || frame->format != expected_format) -+ { -+#if RPI_ZC_SAND_8_IN_10_BUF -+ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) -+ { -+// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); -+ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); -+ } -+#endif -+ -+ if (maycopy) -+ { -+ if (frame->buf[1] != NULL) -+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); -+ else -+ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); -+ -+ switch (frame->format) -+ { -+ case AV_PIX_FMT_YUV420P10: -+ return zc_420p10_to_sand128(s, frame); -+ -+ case AV_PIX_FMT_SAND64_10: -+ return zc_sand64_16_to_sand128(s, frame, 10); -+ -+ default: -+ return zc_copy(s, frame); -+ } -+ } -+ else -+ { -+ if (frame->buf[1] != NULL) -+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); -+ else -+ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); -+ return NULL; -+ } -+ } -+ -+ if (pic_gm_ptr(frame->buf[0]) == NULL) -+ { -+ if (maycopy) -+ { -+ av_log(s, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__); -+ return zc_copy(s, frame); -+ } -+ else -+ { -+ av_log(s, 
AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__); -+ return NULL; -+ } -+ } -+ -+ return av_buffer_ref(frame->buf[0]); -+} -+ -+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref) -+{ -+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); -+ return p == NULL ? -1 : p->vc_handle; -+} -+ -+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) -+{ -+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); -+ return p == NULL ? 0 : fr_ref->data - p->arm; -+} -+ -+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) -+{ -+ return fr_ref == NULL ? 0 : fr_ref->size; -+} -+ -+ -+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) -+{ -+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); -+ return p == NULL ? 0 : p->numbytes; -+} -+ -+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref) -+{ -+ if (fr_ref != NULL) -+ { -+ av_buffer_unref(&fr_ref); -+ } -+} -+ -+AVZcEnvPtr av_rpi_zc_env_alloc(void) -+{ -+ ZcEnv * const zc = av_mallocz(sizeof(ZcEnv)); -+ if (zc == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n"); -+ return NULL; -+ } -+ -+ zc_pool_init(&zc->pool); -+ return zc; -+} -+ -+void av_rpi_zc_env_free(AVZcEnvPtr zc) -+{ -+ if (zc != NULL) -+ { -+ zc_pool_destroy(&zc->pool); -+ av_free(zc); -+ } -+} -+ -+int av_rpi_zc_in_use(const struct AVCodecContext * const s) -+{ -+ return s->get_buffer2 == av_rpi_zc_get_buffer2; -+} -+ -+int av_rpi_zc_init(struct AVCodecContext * const s) -+{ -+ if (av_rpi_zc_in_use(s)) -+ { -+ ZcEnv * const zc = s->get_buffer_context; -+ ++zc->refcount; -+ } -+ else -+ { -+ ZcEnv *const zc = av_rpi_zc_env_alloc(); -+ if (zc == NULL) -+ { -+ return AVERROR(ENOMEM); -+ } -+ -+ zc->refcount = 1; -+ zc->old.get_buffer_context = s->get_buffer_context; -+ zc->old.get_buffer2 = s->get_buffer2; -+ zc->old.thread_safe_callbacks = s->thread_safe_callbacks; -+ -+ s->get_buffer_context = zc; -+ s->get_buffer2 = av_rpi_zc_get_buffer2; -+ s->thread_safe_callbacks = 1; -+ } -+ return 0; -+} -+ -+void av_rpi_zc_uninit(struct AVCodecContext * const s) -+{ -+ if (av_rpi_zc_in_use(s)) -+ { -+ ZcEnv * const zc = s->get_buffer_context; -+ if (--zc->refcount == 0) -+ { -+ s->get_buffer2 = zc->old.get_buffer2; -+ s->get_buffer_context = zc->old.get_buffer_context; -+ s->thread_safe_callbacks = zc->old.thread_safe_callbacks; -+ av_rpi_zc_env_free(zc); -+ } -+ } -+} -+ -diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h -new file mode 100644 -index 0000000000..26fb3be999 ---- /dev/null -+++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,105 @@ -+#ifndef LIBAVCODEC_RPI_ZC_H -+#define LIBAVCODEC_RPI_ZC_H -+ -+// Zero-Copy frame code for RPi -+// RPi needs Y/U/V planes to be contiguous for display. By default -+// ffmpeg will allocate separated planes so a memcpy is needed before -+// display. This code provides a method of making ffmpeg allocate a single -+// block of memory for the frame, which can then be reference counted until -+// display has finished with it.
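-+// -+// Minimal usage sketch (illustrative only - error handling, the decode loop -+// and the display side are elided; "display" stands for whatever consumes -+// the VideoCore handle): -+// -+// av_rpi_zc_init(avctx); // hook get_buffer2 -+// ... decode a frame f ... -+// AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, f, f->format, 1); -+// if (ref != NULL) { -+// int vc = av_rpi_zc_vc_handle(ref); // pass to display -+// ... display from vc + av_rpi_zc_offset(ref) ... -+// av_rpi_zc_unref(ref); // once display is done -+// } -+// av_rpi_zc_uninit(avctx);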
-+ -+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame -+// 0 disables -+// *** This option still in development -+// Only works if SAO active -+// Allocates buffers that are twice the required size -+#define RPI_ZC_SAND_8_IN_10_BUF 0 -+ -+struct AVBufferRef; -+struct AVFrame; -+struct AVCodecContext; -+enum AVPixelFormat; -+ -+// "Opaque" pointer to whatever we are using as a buffer reference -+typedef struct AVBufferRef * AVRpiZcRefPtr; -+ -+struct AVZcEnv; -+typedef struct AVZcEnv * AVZcEnvPtr; -+ -+typedef struct AVRpiZcFrameGeometry -+{ -+ unsigned int stride_y; // Luma stride (bytes) -+ unsigned int height_y; // Luma height (lines) -+ unsigned int stride_c; // Chroma stride (bytes) -+ unsigned int height_c; // Chroma height (lines) -+ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) -+ unsigned int stripes; // Number of stripes (sand) -+ unsigned int bytes_per_pel; -+} AVRpiZcFrameGeometry; -+ -+ -+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const int format, -+ const unsigned int video_width, const unsigned int video_height); -+ -+// Replacement fn for avctx->get_buffer2 -+// Should be set before calling avcodec_open2 -+// -+// N.B. in addition to setting avctx->get_buffer2, avctx->refcounted_frames -+// must be set to 1 as otherwise the buffer info is killed before being returned -+// by avcodec_decode_video2. Note also that this means that the AVFrame that is -+// returned must be manually derefed with av_frame_unref. This should be done -+// after av_rpi_zc_ref has been called. -+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags); -+ -+// Generate a ZC reference to the buffer(s) in this frame -+// If the buffer doesn't appear to be one allocated by _get_buffer_2 -+// then the behaviour depends on maycopy: -+// If maycopy=0 then return NULL -+// If maycopy=1 && the src frame is in a form where we can easily copy -+// the data, then allocate a new buffer and copy the data into it -+// Otherwise return NULL -+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); -+ -+// Get the vc_handle from the frame ref -+// Returns -1 if ref doesn't look valid -+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref); -+// Get offset from the start of the memory referenced -+// by the vc_handle to valid data -+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref); -+// Length of buffer data -+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref); -+// Get the number of bytes allocated from the frame ref -+// Returns 0 if ref doesn't look valid -+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref); -+ -+// Unreference the buffer refed/allocated by _zc_ref -+// If fr_ref is NULL then this will NOP -+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref); -+ -+// Allocate an environment for the buffer pool used by the ZC code -+// This should be put in avctx->get_buffer_context so it can be found by -+// av_rpi_zc_get_buffer2 when it is called from ffmpeg -+AVZcEnvPtr av_rpi_zc_env_alloc(void); -+ -+// Free the environment allocated by av_rpi_zc_env_alloc -+void av_rpi_zc_env_free(AVZcEnvPtr); -+ -+// Test to see if the context is using zc (checks get_buffer2) -+int av_rpi_zc_in_use(const struct AVCodecContext * const s); -+ -+// Init ZC into a context -+// There is nothing magic in this fn - it just packages setting -+// get_buffer2 & get_buffer_context -+int av_rpi_zc_init(struct AVCodecContext * const s); -+ -+// Free ZC from a context
-+// There is nothing magic in this fn - it just packages unsetting -+// get_buffer2 & get_buffer_context -+void av_rpi_zc_uninit(struct AVCodecContext * const s); -+ -+ -+ -+#endif -+ -diff --git a/libavfilter/Makefile b/libavfilter/Makefile -index 455c809b15..087cab98ee 100644 ---- a/libavfilter/Makefile -+++ b/libavfilter/Makefile -@@ -406,6 +406,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) += vf_transpose_opencl.o opencl.o o - OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o - OBJS-$(CONFIG_TRIM_FILTER) += trim.o - OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o -+OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o - OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o - OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ - opencl/unsharp.o -diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c -index 04a3df7d56..8d1470dd34 100644 ---- a/libavfilter/allfilters.c -+++ b/libavfilter/allfilters.c -@@ -387,6 +387,7 @@ extern AVFilter ff_vf_transpose_vaapi; - extern AVFilter ff_vf_trim; - extern AVFilter ff_vf_unpremultiply; - extern AVFilter ff_vf_unsharp; -+extern AVFilter ff_vf_unsand; - extern AVFilter ff_vf_unsharp_opencl; - extern AVFilter ff_vf_uspp; - extern AVFilter ff_vf_vaguedenoiser; -diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c -index a149f8fb6d..776e3bb9ab 100644 ---- a/libavfilter/avfiltergraph.c -+++ b/libavfilter/avfiltergraph.c -@@ -32,6 +32,9 @@ - #include "libavutil/internal.h" - #include "libavutil/opt.h" - #include "libavutil/pixdesc.h" -+#if CONFIG_UNSAND_FILTER -+#include "libavutil/rpi_sand_fns.h" -+#endif - - #define FF_INTERNAL_FIELDS 1 - #include "framequeue.h" -@@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFormats *a_arg, - } - } - -+#if CONFIG_UNSAND_FILTER -+static int has_sand_format(const AVFilterFormats * const ff) -+{ -+ int i; -+ for (i = 0; i != ff->nb_formats; ++i) { -+ if (av_rpi_is_sand_format(ff->formats[i])) { -+ return 1; -+ } -+ } -+ return 0; -+} -+#endif -+ - /** - * Perform one round of query_formats() and merging formats lists on the - * filter graph. -@@ -467,6 +483,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - for (j = 0; j < filter->nb_inputs; j++) { - AVFilterLink *link = filter->inputs[j]; - int convert_needed = 0; -+ unsigned int extra_convert_tried = 0; - - if (!link) - continue; -@@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - ) - #undef MERGE_DISPATCH - -- if (convert_needed) { -+ while (convert_needed) { - AVFilterContext *convert; - const AVFilter *filter; - AVFilterLink *inlink, *outlink; - char inst_name[30]; -+ int can_retry = 0; -+ -+ convert_needed = 0; - - if (graph->disable_auto_convert) { - av_log(log_ctx, AV_LOG_ERROR, -@@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - /* couldn't merge format lists. 
auto-insert conversion filter */ - switch (link->type) { - case AVMEDIA_TYPE_VIDEO: -- if (!(filter = avfilter_get_by_name("scale"))) { -- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " -- "not present, cannot convert pixel formats.\n"); -- return AVERROR(EINVAL); -- } -- -- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", -- scaler_count++); -+#if CONFIG_UNSAND_FILTER -+ // Only try each extra conversion once -+ // The unsand output pad should never trigger has_sand_format -+ // but it is better to be safe -+ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) { -+ if (!(filter = avfilter_get_by_name("unsand"))) { -+ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter " -+ "not present, cannot convert pixel formats.\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d", -+ scaler_count++); -+ -+ if ((ret = avfilter_graph_create_filter(&convert, filter, -+ inst_name, "", NULL, -+ graph)) < 0) -+ return ret; - -- if ((ret = avfilter_graph_create_filter(&convert, filter, -- inst_name, graph->scale_sws_opts, NULL, -- graph)) < 0) -- return ret; -+ extra_convert_tried |= 1; -+ can_retry = 1; -+ } -+ else -+#endif -+ { -+ if (!(filter = avfilter_get_by_name("scale"))) { -+ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " -+ "not present, cannot convert pixel formats.\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", -+ scaler_count++); -+ -+ if ((ret = avfilter_graph_create_filter(&convert, filter, -+ inst_name, graph->scale_sws_opts, NULL, -+ graph)) < 0) -+ return ret; -+ } - break; - case AVMEDIA_TYPE_AUDIO: - if (!(filter = avfilter_get_by_name("aresample"))) { -@@ -585,9 +631,19 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - av_assert0(outlink-> in_channel_layouts->refcount > 0); - av_assert0(outlink->out_channel_layouts->refcount > 0); - } -- if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type) || -- !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) -+ // If we have added an extra filter we must merge the input -+ // side but we can have another go at the output -+ if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type)) -+ ret = AVERROR(ENOSYS); -+ else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) -+ { -+ if (can_retry) { -+ link = outlink; -+ convert_needed = 1; -+ continue; -+ } - ret = AVERROR(ENOSYS); -+ } - if (inlink->type == AVMEDIA_TYPE_AUDIO && - (!ff_merge_samplerates(inlink->in_samplerates, - inlink->out_samplerates) || -diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c -index e0ff7e4dd8..77bc3d83fe 100644 ---- a/libavfilter/buffersrc.c -+++ b/libavfilter/buffersrc.c -@@ -213,7 +213,7 @@ static int av_buffersrc_add_frame_internal(AVFilterContext *ctx, - - switch (ctx->outputs[0]->type) { - case AVMEDIA_TYPE_VIDEO: -- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, -+ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), - frame->format, frame->pts); - break; - case AVMEDIA_TYPE_AUDIO: -diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c -new file mode 100644 -index 0000000000..64578b7ac4 ---- /dev/null -+++ b/libavfilter/vf_unsand.c -@@ -0,0 +1,232 @@ -+/* -+ * Copyright (c) 2007 Bobby Bingham -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * unsand video filter - convert SAND layout frames to planar YUV -+ */ -+ -+#include <string.h> -+ -+#include "libavutil/internal.h" -+#include "libavutil/mem.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/opt.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" -+ -+typedef struct UnsandContext { -+ const AVClass *class; -+} UnsandContext; -+ -+static av_cold void uninit(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+} -+ -+static av_cold int init(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+ -+ return 0; -+} -+ -+ -+static int filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterLink * const outlink = link->dst->outputs[0]; -+ AVFrame *out = NULL; -+ int rv = 0; -+ -+ if (outlink->format == in->format) { -+ // If nothing to do then do nothing -+ out = in; -+ } -+ else -+ { -+ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) -+ { -+ rv = AVERROR(ENOMEM); -+ goto fail; -+ } -+ if (av_rpi_sand_to_planar_frame(out, in) != 0) -+ { -+ rv = -1; -+ goto fail; -+ } -+ -+ av_frame_free(&in); -+ } -+ -+ return ff_filter_frame(outlink, out); -+ -+fail: -+ av_frame_free(&out); -+ av_frame_free(&in); -+ return rv; -+} -+ -+#if 0 -+static void dump_fmts(const AVFilterFormats * fmts) -+{ -+ int i; -+ if (fmts == NULL) { -+ printf("NULL\n"); -+ return; -+ } -+ for (i = 0; i < fmts->nb_formats; ++i) { -+ printf(" %d", fmts->formats[i]); -+ } -+ printf("\n"); -+} -+#endif -+ -+static int query_formats(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+ int ret; -+ -+ // If we aren't connected at both ends then just do nothing -+ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) -+ return 0; -+ -+// printf("Unsand: %s in: ", __func__); -+// dump_fmts(ctx->inputs[0]->in_formats); -+// printf("Unsand: %s out: ", __func__); -+// dump_fmts(ctx->outputs[0]->out_formats); -+ -+ // Our output formats depend on our input formats and we can't/don't -+ // want to convert between bit depths so we need to wait for the source -+ // to have an opinion before we do -+ if (ctx->inputs[0]->in_formats == NULL) -+ return AVERROR(EAGAIN); -+ -+ // Accept anything -+ if (ctx->inputs[0]->out_formats == NULL && -+ (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0) -+ return ret; -+ -+ // Filter out sand formats -+ -+ // Generate a container if we don't already have one -+ if (ctx->outputs[0]->in_formats == NULL) -+ { -+ // Somewhat rubbish way of ensuring we have a good structure -+ static const enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; -+ AVFilterFormats *formats = ff_make_format_list(out_fmts); -+ -+ if (formats
== NULL) -+ return AVERROR(ENOMEM); -+ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0) -+ return ret; -+ } -+ -+ // Replace old format list with new filtered list derived from what our -+ // input says it can do -+ { -+ const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats; -+ AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats; -+ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); -+ int i; -+ int n = 0; -+ int seen_420p = 0; -+ int seen_420p10 = 0; -+ -+ for (i = 0; i < src_ff->nb_formats; ++i) { -+ const enum AVPixelFormat f = src_ff->formats[i]; -+ -+ switch (f) { -+ case AV_PIX_FMT_YUV420P: -+ case AV_PIX_FMT_SAND128: -+ if (!seen_420p) { -+ seen_420p = 1; -+ dst_fmts[n++] = AV_PIX_FMT_YUV420P; -+ } -+ break; -+ case AV_PIX_FMT_SAND64_10: -+ case AV_PIX_FMT_YUV420P10: -+ if (!seen_420p10) { -+ seen_420p10 = 1; -+ dst_fmts[n++] = AV_PIX_FMT_YUV420P10; -+ } -+ break; -+ default: -+ dst_fmts[n++] = f; -+ break; -+ } -+ } -+ -+ av_freep(&dst_ff->formats); -+ dst_ff->formats = dst_fmts; -+ dst_ff->nb_formats = n; -+ } -+ -+// printf("Unsand: %s calc: ", __func__); -+// dump_fmts(ctx->outputs[0]->in_formats); -+ -+ return 0; -+} -+ -+ -+#define OFFSET(x) offsetof(UnsandContext, x) -+static const AVOption unsand_options[] = { -+ { NULL } -+}; -+ -+ -+AVFILTER_DEFINE_CLASS(unsand); -+ -+static const AVFilterPad avfilter_vf_unsand_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = filter_frame, -+ }, -+ { NULL } -+}; -+ -+static const AVFilterPad avfilter_vf_unsand_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO -+ }, -+ { NULL } -+}; -+ -+AVFilter ff_vf_unsand = { -+ .name = "unsand", -+ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), -+ -+ .init = init, -+ .uninit = uninit, -+ -+ .query_formats = query_formats, -+ -+ .priv_size = sizeof(UnsandContext), -+ .priv_class = &unsand_class, -+ -+ .inputs = avfilter_vf_unsand_inputs, -+ .outputs = avfilter_vf_unsand_outputs, -+}; -+ -diff --git a/libavformat/utils.c b/libavformat/utils.c -index 6c6f4e1bd1..c6332d3e46 100644 ---- a/libavformat/utils.c -+++ b/libavformat/utils.c -@@ -3013,6 +3013,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) - return 1; - } - -+#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER -+// This should be quite general purpose but avoid possible conflicts -+// by limiting usage to cases where we know it works. -+static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts) -+{ -+ // Only try fallback if we know it is supported (HEVC only) -+ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL : -+ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE); -+ int err; -+ -+ // Failed to find fallback or we are already at the fallback -+ if (new_codec == NULL || new_codec == old_codec) -+ { -+ return AVERROR_DECODER_NOT_FOUND; -+ } -+ -+ // * This may be dodgy - header says to not use this fn, -+ // especially if we are going to reopen the context...
-+ // (but it does seem to work for our cases) -+ if (avcodec_is_open(avctx)) { -+ avcodec_close(avctx); -+ } -+ -+ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0) -+ { -+ return err; -+ } -+ -+ return 0; -+} -+#else -+#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND) -+#endif -+ - /* returns 1 or 0 if or if not decoded data was returned, or a negative error */ - static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, - AVDictionary **options) -@@ -3047,7 +3081,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, - av_dict_set(options ? options : &thread_opt, "threads", "1", 0); - if (s->codec_whitelist) - av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0); -- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt); -+ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND) -+ { -+ // Try fallback if it looks worth a try -+ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt); -+ } - if (!options) - av_dict_free(&thread_opt); - if (ret < 0) { -@@ -3078,6 +3116,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, - if (avctx->codec_type == AVMEDIA_TYPE_VIDEO || - avctx->codec_type == AVMEDIA_TYPE_AUDIO) { - ret = avcodec_send_packet(avctx, &pkt); -+ -+ // If we are going to want to fall back we should know here -+ if (ret == AVERROR_DECODER_NOT_FOUND) { -+ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0) -+ break; -+ continue; -+ } -+ - if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) - break; - if (ret >= 0) -@@ -3671,9 +3717,20 @@ FF_ENABLE_DEPRECATION_WARNINGS - // Try to just open decoders, in case this is enough to get parameters. - if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) { - if (codec && !avctx->codec) -- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0) -- av_log(ic, AV_LOG_WARNING, -- "Failed to open codec in %s\n",__FUNCTION__); -+ { -+ int err; -+ -+ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0) -+ { -+ if (err == AVERROR_DECODER_NOT_FOUND) { -+ err = try_fallback_decoder(avctx, codec, options ?
&options[i] : &thread_opt); -+ } -+ if (err < 0) { -+ av_log(ic, AV_LOG_WARNING, -+ "Failed to open codec in %s\n",__FUNCTION__); -+ } -+ } -+ } - } - if (!options) - av_dict_free(&thread_opt); -diff --git a/libavutil/Makefile b/libavutil/Makefile -index 8a7a44e4b5..6bfc885796 100644 ---- a/libavutil/Makefile -+++ b/libavutil/Makefile -@@ -170,6 +170,7 @@ OBJS-$(CONFIG_LZO) += lzo.o - OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o - OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o - OBJS-$(CONFIG_QSV) += hwcontext_qsv.o -+OBJS-$(CONFIG_SAND) += rpi_sand_fns.o - OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o - OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o - OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o -diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile -index 5da44b0542..b74b7c4e2f 100644 ---- a/libavutil/arm/Makefile -+++ b/libavutil/arm/Makefile -@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ - - NEON-OBJS += arm/float_dsp_init_neon.o \ - arm/float_dsp_neon.o \ -+ arm/rpi_sand_neon.o \ -diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S -new file mode 100644 -index 0000000000..dbffdaefa4 ---- /dev/null -+++ b/libavutil/arm/rpi_sand_neon.S -@@ -0,0 +1,40 @@ -+#include "libavutil/arm/asm.S" -+ -+@ void rpi_sand128b_stripe_to_8_10( -+@ uint8_t * dest, [r0] -+@ const uint8_t * src1, [r1] -+@ const uint8_t * src2, [r2] -+@ unsigned int lines); [r3] -+ -+.macro stripe2_to_8, bit_depth -+ vpush {q4-q7} -+1: -+ vldm r1!, {q0-q7} -+ subs r3, #1 -+ vldm r2!, {q8-q15} -+ vqrshrn.u16 d0, q0, #\bit_depth - 8 -+ vqrshrn.u16 d1, q1, #\bit_depth - 8 -+ vqrshrn.u16 d2, q2, #\bit_depth - 8 -+ vqrshrn.u16 d3, q3, #\bit_depth - 8 -+ vqrshrn.u16 d4, q4, #\bit_depth - 8 -+ vqrshrn.u16 d5, q5, #\bit_depth - 8 -+ vqrshrn.u16 d6, q6, #\bit_depth - 8 -+ vqrshrn.u16 d7, q7, #\bit_depth - 8 -+ vqrshrn.u16 d8, q8, #\bit_depth - 8 -+ vqrshrn.u16 d9, q9, #\bit_depth - 8 -+ vqrshrn.u16 d10, q10, #\bit_depth - 8 -+ vqrshrn.u16 d11, q11, #\bit_depth - 8 -+ vqrshrn.u16 d12, q12, #\bit_depth - 8 -+ vqrshrn.u16 d13, q13, #\bit_depth - 8 -+ vqrshrn.u16 d14, q14, #\bit_depth - 8 -+ vqrshrn.u16 d15, q15, #\bit_depth - 8 -+ vstm r0!, {q0-q7} -+ bne 1b -+ vpop {q4-q7} -+ bx lr -+.endm -+ -+function rpi_sand128b_stripe_to_8_10, export=1 -+ stripe2_to_8 10 -+endfunc -+ -diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 8d1aa5fa84..649876db77 100644 ---- a/libavutil/buffer.c -+++ b/libavutil/buffer.c -@@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) - - return ret; - } -+ -+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T) -+void *av_buffer_pool_opaque(AVBufferRef *ref) { -+ BufferPoolEntry *buf = av_buffer_get_opaque(ref); -+ return buf->opaque; -+} -diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 73b6bd0b14..d907de3f1c 100644 ---- a/libavutil/buffer.h -+++ b/libavutil/buffer.h -@@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); - */ - AVBufferRef *av_buffer_pool_get(AVBufferPool *pool); - -+// Return the opaque for the underlying frame -+void *av_buffer_pool_opaque(AVBufferRef *ref); -+ - /** - * @} - */ -diff --git a/libavutil/frame.c b/libavutil/frame.c -index dcf1fc3d17..dd0876f5a9 100644 ---- a/libavutil/frame.c -+++ b/libavutil/frame.c -@@ -16,6 +16,8 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include "config.h" -+ - #include "channel_layout.h" - #include "avassert.h" - #include "buffer.h" -@@ -25,6 +27,9 @@ - #include "imgutils.h" - #include 
"mem.h" - #include "samplefmt.h" -+#if CONFIG_SAND -+#include "rpi_sand_fns.h" -+#endif - - #if FF_API_FRAME_GET_SET - MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) -@@ -893,6 +898,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) - (frame->crop_top + frame->crop_bottom) >= frame->height) - return AVERROR(ERANGE); - -+#if CONFIG_SAND -+ // Sand cannot be cropped - do not try -+ if (av_rpi_is_sand_format(frame->format)) -+ return 0; -+#endif -+ - desc = av_pix_fmt_desc_get(frame->format); - if (!desc) - return AVERROR_BUG; -diff --git a/libavutil/frame.h b/libavutil/frame.h -index 5d3231e7bb..e250f420a2 100644 ---- a/libavutil/frame.h -+++ b/libavutil/frame.h -@@ -964,6 +964,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); - */ - const char *av_frame_side_data_name(enum AVFrameSideDataType type); - -+ -+static inline int av_frame_cropped_width(const AVFrame * const frame) -+{ -+ return frame->width - (frame->crop_left + frame->crop_right); -+} -+static inline int av_frame_cropped_height(const AVFrame * const frame) -+{ -+ return frame->height - (frame->crop_top + frame->crop_bottom); -+} -+ - /** - * @} - */ -diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index b97b0665b0..e3f21b1137 100644 ---- a/libavutil/pixdesc.c -+++ b/libavutil/pixdesc.c -@@ -2344,6 +2344,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { - }, - .flags = AV_PIX_FMT_FLAG_PLANAR, - }, -+ [AV_PIX_FMT_SAND128] = { -+ .name = "sand128", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ -+ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ -+ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ -+ }, -+ .flags = 0, -+ }, -+ [AV_PIX_FMT_SAND64_10] = { -+ .name = "sand64_10", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ -+ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ -+ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ -+ }, -+ .flags = 0, -+ }, - }; - #if FF_API_PLUS1_MINUS1 - FF_ENABLE_DEPRECATION_WARNINGS -diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 8b54c9415b..9f74f7b335 100644 ---- a/libavutil/pixfmt.h -+++ b/libavutil/pixfmt.h -@@ -347,6 +347,10 @@ enum AVPixelFormat { - - AV_PIX_FMT_NV24, ///< planar YUV 4:4:4, 24bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (first byte U and the following byte V) - AV_PIX_FMT_NV42, ///< as above, but U and V bytes are swapped -+ // RPI - not on ifdef so can be got at by calling progs -+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding - - AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions - }; -diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h -new file mode 100644 -index 0000000000..52d52a2a83 ---- /dev/null -+++ b/libavutil/rpi_sand_fn_pw.h -@@ -0,0 +1,182 @@ -+// * Included twice from rpi_sand_fn with different PW -+ -+#define STRCAT(x,y) x##y -+ -+#if PW == 1 -+#define pixel uint8_t -+#define FUNC(f) STRCAT(f, 8) -+#elif PW == 2 -+#define pixel uint16_t -+#define FUNC(f) STRCAT(f, 16) -+#else -+#error Unexpected PW -+#endif -+ -+// Fetches a single patch - offscreen 
fixup not done here -+// w <= stride1 -+// unclipped -+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x; -+ const unsigned int w = _w; -+ const unsigned int mask = stride1 - 1; -+ -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { -+ memcpy(dst, p, w); -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const uint8_t * p = p2; -+ uint8_t * d = dst; -+ memcpy(d, p1, w1); -+ d += w1; -+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { -+ memcpy(d, p, stride1); -+ } -+ memcpy(d, p, w3); -+ } -+ } -+} -+ -+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) -+ -+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x * 2; -+ const unsigned int w = _w * 2; -+ const unsigned int mask = stride1 - 1; -+ -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { -+ pixel * du = (pixel *)dst_u; -+ pixel * dv = (pixel *)dst_v; -+ const pixel * p = (const pixel *)p1; -+ for (unsigned int k = 0; k < w; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const unsigned int sstride_p = (sstride - stride1) / PW; -+ -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const pixel * p = (const pixel *)p1; -+ pixel * du = (pixel *)dst_u; -+ pixel * dv = (pixel *)dst_v; -+ for (unsigned int k = 0; k < w1; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { -+ for (unsigned int k = 0; k < stride1; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ for (unsigned int k = 0; k < w3; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ } -+} -+ -+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ 
unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x * 2; -+ const unsigned int w = _w * 2; -+ const unsigned int mask = stride1 - 1; -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { -+ const pixel * su = (const pixel *)src_u; -+ const pixel * sv = (const pixel *)src_v; -+ pixel * p = (pixel *)p1; -+ for (unsigned int k = 0; k < w; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const unsigned int sstride_p = (sstride - stride1) / PW; -+ -+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const pixel * su = (const pixel *)src_u; -+ const pixel * sv = (const pixel *)src_v; -+ pixel * p = (pixel *)p1; -+ for (unsigned int k = 0; k < w1; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { -+ for (unsigned int k = 0; k < stride1; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ for (unsigned int k = 0; k < w3; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ } -+} -+ -+ -+#undef pixel -+#undef STRCAT -+#undef FUNC -+ -diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c -new file mode 100644 -index 0000000000..3e31ef77ec ---- /dev/null -+++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,151 @@ -+#include "config.h" -+#include -+#include -+#include "rpi_sand_fns.h" -+#include "avassert.h" -+#include "frame.h" -+ -+#define PW 1 -+#include "rpi_sand_fn_pw.h" -+#undef PW -+ -+#define PW 2 -+#include "rpi_sand_fn_pw.h" -+#undef PW -+ -+#if HAVE_NEON -+void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines); -+#endif -+ -+#if 1 -+// Simple round -+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) -+{ -+ const unsigned int rnd = (1 << shr) >> 1; -+ const uint16_t * src = (const uint16_t *)_src; -+ -+ for (; n != 0; --n) { -+ *dst++ = (*src++ + rnd) >> shr; -+ } -+} -+#else -+// Dithered variation -+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) -+{ -+ unsigned int rnd = (1 << shr) >> 1; -+ const unsigned int mask = ((1 << shr) - 1); -+ const uint16_t * src = (const uint16_t *)_src; -+ -+ for (; n != 0; --n) { -+ rnd = *src++ + (rnd & mask); -+ *dst++ = rnd >> shr; -+ } -+} -+#endif -+ -+// w/h in pixels -+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, -+ unsigned int w, unsigned int h, const unsigned int shr) -+{ -+ const unsigned int n = dst_stride1 / 2; -+ unsigned int j; -+ -+ // This is true for our current layouts -+ av_assert0(dst_stride1 == src_stride1); -+ -+ // As we have the same stride1 for src & dest and src is wider than dest -+ // then if we loop on src we can always write contiguously to dest -+ // We make no effort to 
copy an exact width - round up to nearest src stripe -+ // as we will always have storage in dest for that -+ -+#if HAVE_NEON -+ if (shr == 3 && src_stride1 == 128) { -+ for (j = 0; j + n < w; j += dst_stride1) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ const uint8_t * s2 = s1 + src_stride1 * src_stride2; -+ -+ rpi_sand128b_stripe_to_8_10(d, s1, s2, h); -+ } -+ } -+ else -+#endif -+ { -+ for (j = 0; j + n < w; j += dst_stride1) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ const uint8_t * s2 = s1 + src_stride1 * src_stride2; -+ -+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { -+ cpy16_to_8(d, s1, n, shr); -+ cpy16_to_8(d + n, s2, n, shr); -+ } -+ } -+ } -+ -+ // Fix up a trailing dest half stripe -+ if (j < w) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ -+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { -+ cpy16_to_8(d, s1, n, shr); -+ } -+ } -+} -+ -+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) -+{ -+ const int w = av_frame_cropped_width(src); -+ const int h = av_frame_cropped_height(src); -+ const int x = src->crop_left; -+ const int y = src->crop_top; -+ -+ // We will crop as part of the conversion -+ dst->crop_top = 0; -+ dst->crop_left = 0; -+ dst->crop_bottom = 0; -+ dst->crop_right = 0; -+ -+ switch (src->format){ -+ case AV_PIX_FMT_SAND128: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P: -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w/2, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ case AV_PIX_FMT_SAND64_10: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P10: -+ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x*2, y, w*2, h); -+ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y/2, w, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ default: -+ return -1; -+ } -+ -+ return av_frame_copy_props(dst, src); -+} -diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h -new file mode 100644 -index 0000000000..1f50b68ea8 ---- /dev/null -+++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,136 @@ -+#ifndef AVUTIL_RPI_SAND_FNS -+#define AVUTIL_RPI_SAND_FNS -+ -+#include "libavutil/frame.h" -+ -+// For all these fns _x & _w are measured as coord * PW -+// For the C fns coords are in chroma pels (so luma / 2) -+// Strides are in bytes -+ -+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * 
dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_planar_to_sand_c8(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_planar_to_sand_c16(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+// w/h in pixels -+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, -+ unsigned int w, unsigned int h, const unsigned int shr); -+ -+ -+// dst must contain required pixel format & allocated data buffers -+// Cropping on the src buffer will be honoured and dst crop will be set to zero -+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); -+ -+ -+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) -+{ -+#ifdef RPI_ZC_SAND128_ONLY -+ // If we are sure we only support 128 byte sand formats replace the -+ // var with a constant which should allow for better optimisation -+ return 128; -+#else -+ return frame->linesize[0]; -+#endif -+} -+ -+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+ -+static inline int av_rpi_is_sand_format(const int format) -+{ -+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16); -+} -+ -+static inline int av_rpi_is_sand_frame(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand_format(frame->format); -+} -+ -+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) -+{ -+ return (frame->format == AV_PIX_FMT_SAND128); -+} -+ -+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) -+{ -+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); -+} -+ -+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand8_frame(frame) ?
0 : 1; -+} -+ -+// If x is measured in bytes (not pixels) then this works for sand64_16 as -+// well as sand128 - but in the general case we work that out -+ -+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) -+{ -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); -+} -+ -+#endif -+ -diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt -new file mode 100644 -index 0000000000..b1e99a6a89 ---- /dev/null -+++ b/pi-util/BUILD.txt -@@ -0,0 +1,25 @@ -+Building Pi FFmpeg -+================== -+ -+Configuration: -+============= -+ -+pi-util/conf_pi2.sh -+ -+contains suitable options to build the code for Pi2/3. It expects to find -+git clones of -+ -+https://github.com/raspberrypi/tools -+https://github.com/raspberrypi/firmware -+ -+in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a -+lot of history you don't want. -+ -+If you have a copy of qasm.py in ../local/bin then the .qasm sources will be -+rebuilt. Otherwise the prebuilt .c & .h files will be used. -+Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild. -+ -+pi-util/conf_pi1.sh should configure for Pi1. Beware that as of this time -+H265 QPU acceleration is broken on Pi1 and so it is disabled.
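The sand address helpers in rpi_sand_fns.h above encode a column-striped layout: stride1 is the stripe width in bytes (128 for SAND128), stride2 the stripe height in rows, so each stripe occupies stride1 * stride2 bytes. A minimal sketch of the same arithmetic as av_rpi_sand_frame_off_y, in Python with illustrative names (not part of the patch; it assumes stride1 is a power of two, as the C code does):

    def sand_off_y(x, y, stride1, stride2):
        x1 = x & (stride1 - 1)     # byte offset within the stripe
        x2 = x & ~(stride1 - 1)    # byte column at which the stripe starts
        return x1 + stride1 * y + stride2 * x2

    # SAND128 with 64-row stripes: byte (200, 10) sits 72 bytes into the
    # second stripe: 72 + 128 * 10 + 64 * 128 == 9544
    assert sand_off_y(200, 10, 128, 64) == 9544

In the C code x2 is computed as x ^ x1, which equals x & ~(stride1 - 1); scaling it by stride2 advances one whole stripe (stride1 * stride2 bytes) for every stride1 bytes of x.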
-+ -+ -diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv -new file mode 100644 -index 0000000000..17fb09be58 ---- /dev/null -+++ b/pi-util/conf_h265.2016.csv -@@ -0,0 +1,195 @@ -+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 -+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 -+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 -+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 -+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 -+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 -+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 -+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 -+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 -+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 -+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 -+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 -+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 -+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 -+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 -+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 -+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 -+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 -+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 -+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 -+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 -+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 -+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 -+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 -+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 -+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 -+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 -+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 -+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 -+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 -+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 -+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 -+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 -+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 -+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 -+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 -+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 -+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 -+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 -+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 -+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 
-+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 -+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 -+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 -+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 -+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 -+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 -+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 -+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 -+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 -+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 -+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 -+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 -+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 -+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 -+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 -+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 -+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 -+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 -+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 -+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 -+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 -+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 -+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 -+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 -+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 -+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 -+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 -+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 -+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 -+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 -+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 -+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 -+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 -+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 -+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 -+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 -+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 -+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 -+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 -+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 -+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 -+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 -+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 -+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 -+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 -+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 -+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 
-+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 -+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 -+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 -+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 -+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 -+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 -+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 -+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 -+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 -+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 -+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 -+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 -+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt -+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt -+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 -+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 -+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 -+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 -+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 -+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 -+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 -+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 -+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 -+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 -+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 -+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 -+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 -+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth -+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 -+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
-+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 -+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 -+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 -+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 -+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 -+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt -+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt -+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt -+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt -+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt -+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt -+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 -+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 
-+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 -+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 -+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 -+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 -+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 -+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 -+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 -+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 -+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 -+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 -+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 -+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 -+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 -+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 -+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt -+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt -+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 -+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 -+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed -+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 -+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 -+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 -+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 -+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 -+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 -+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 -+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 -+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 -+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 -+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5 -+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5 -diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv -new file mode 100644 -index 0000000000..b482907fcb ---- /dev/null -+++ b/pi-util/conf_h265.2016_HEVC_v1.csv -@@ -0,0 +1,147 @@ -+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 -+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 -+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 -+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 -+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 -+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 
-+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 -+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 -+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 -+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 -+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 -+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 -+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 -+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 -+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 -+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 -+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 -+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 -+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 -+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 -+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 -+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 -+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 -+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 -+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 -+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 -+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 -+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 -+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 -+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 -+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 -+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 -+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 -+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 -+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 -+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 -+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 -+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 -+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 -+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 -+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 -+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 -+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 -+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 -+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 -+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 -+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 -+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 -+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 -+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 -+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 -+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 -+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 -+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 -+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 -+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 -+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 -+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 -+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 -+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 -+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 -+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 
-+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 -+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 -+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 -+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 -+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 -+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 -+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 -+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 -+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 -+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 -+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 -+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 -+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 -+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 -+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 -+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 -+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 -+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 -+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 -+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 -+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 -+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 -+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 -+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 -+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 -+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 -+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 -+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 -+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 -+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 -+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 -+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 -+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 -+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 -+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 -+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 -+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 -+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 -+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 -+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 -+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 -+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 -+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 -+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt -+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt -+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 -+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 -+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 -+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 -+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 -+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 -+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 
-+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 -+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 -+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 -+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 -+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 -+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 -+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth -+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 -+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? -+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 -+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 -+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 -+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 -+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 -+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 -+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 -+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 -+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 -+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 -+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 -+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 -+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 -+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 -+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 -+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv -new file mode 100644 -index 0000000000..113528cfb0 ---- /dev/null -+++ b/pi-util/conf_h265.csv -@@ -0,0 +1,144 @@ -+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 -+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 -+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 -+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 -+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 -+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 -+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 -+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 -+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 -+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 -+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 -+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 -+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 -+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 -+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 -+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 -+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 -+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 
-+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 -+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 -+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 -+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5 -+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 -+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 -+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 -+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 -+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 -+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 -+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 -+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 -+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 -+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 -+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 -+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 -+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 -+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 -+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 -+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 -+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 -+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 -+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 -+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 -+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 -+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 -+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 -+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 -+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 -+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 -+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 -+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 -+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 -+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 -+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 -+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5 -+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5 -+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5 -+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 -+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 -+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 -+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 -+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 -+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 -+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 -+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 -+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 -+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 -+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 -+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 -+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 -+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 -+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 -+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 
-+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 -+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 -+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 -+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 -+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 -+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 -+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 -+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 -+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 -+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 -+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 -+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 -+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 -+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 -+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 -+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 -+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 -+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 -+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 -+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 -+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 -+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 -+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 -+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 -+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 -+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 -+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 -+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 -+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 -+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 -+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 -+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 -+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 -+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 -+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 -+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5 -+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 -+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 -+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 -+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 -+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 -+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 -+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 -+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 -+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 -+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 -+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched -+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 -+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 -+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 -+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 -+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 -+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 
-+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 -+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 -+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 -+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 -+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 -+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 -+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 -+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 -+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 -+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 -+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh -new file mode 100755 -index 0000000000..59c0d3959e ---- /dev/null -+++ b/pi-util/conf_pi1.sh -@@ -0,0 +1,30 @@ -+echo "Configure for Pi1" -+ -+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=`pwd`/../firmware/opt/vc -+ -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+./configure --enable-cross-compile\ -+ --cpu=arm1176jzf-s\ -+ --arch=arm\ -+ --disable-neon\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh -new file mode 100755 -index 0000000000..40549a35e5 ---- /dev/null -+++ b/pi-util/conf_pi2.sh -@@ -0,0 +1,32 @@ -+echo "Configure for Pi2/3" -+ -+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc -+ -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+./configure --enable-cross-compile\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --enable-rpi\ -+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+# --enable-decoder=hevc_rpi\ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ 
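The conf_h265*.csv lists above are consumed by pi-util/ffconf.py (added just below); each row is a disposition code, a stream directory under the test root, the elementary-stream file, the reference MD5 file, and an optional trailing comment. From the way doconf() treats the first field, the codes read as 0 = skip, 1 = expect an MD5 match, 2 = a known mismatch, 3 = a known decode abort. A short sketch of that interpretation (a hypothetical helper, not part of the patch):

    import csv

    # Disposition meanings inferred from doconf() in pi-util/ffconf.py
    EXPECT = {0: "skip", 1: "match", 2: "mismatch", 3: "abort"}

    with open("pi-util/conf_h265.2016.csv") as f:
        for row in csv.reader(f, skipinitialspace=True):
            disp = int(row[0])
            if disp:  # rows marked 0 are never run
                name, es_file, md5_file = row[1:4]
                print(name, es_file, "->", EXPECT[disp])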
-+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py -new file mode 100755 -index 0000000000..e9556f0837 ---- /dev/null -+++ b/pi-util/ffconf.py -@@ -0,0 +1,175 @@ -+#!/usr/bin/env python -+ -+import string -+import os -+import subprocess -+import re -+import argparse -+import sys -+import csv -+from stat import * -+ -+ffmpeg_exec = "./ffmpeg" -+ -+def testone(fileroot, srcname, es_file, md5_file, vcodec): -+ tmp_root = "/tmp" -+ -+ names = srcname.split('/') -+ while len(names) > 1: -+ tmp_root = os.path.join(tmp_root, names[0]) -+ del names[0] -+ name = names[0] -+ -+ if not os.path.exists(tmp_root): -+ os.makedirs(tmp_root) -+ -+ dec_file = os.path.join(tmp_root, name + ".dec.md5") -+ try: -+ os.remove(dec_file) -+ except: -+ pass -+ -+ flog = open(os.path.join(tmp_root, name + ".log"), "wt") -+ -+ # Unaligned needed for cropping conformance -+ rstr = subprocess.call( -+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file], -+ stdout=flog, stderr=subprocess.STDOUT) -+ -+ try: -+ m1 = None -+ m2 = None -+ with open(os.path.join(fileroot, md5_file)) as f: -+ for line in f: -+ m1 = re.search("[0-9a-f]{32}", line.lower()) -+ if m1: -+ break -+ -+ with open(dec_file) as f: -+ m2 = re.search("[0-9a-f]{32}", f.readline()) -+ except: -+ pass -+ -+ if m1 and m2 and m1.group() == m2.group(): -+ print >> flog, "Match: " + m1.group() -+ rv = 0 -+ elif not m1: -+ print >> flog, "****** Cannot find m1" -+ rv = 3 -+ elif not m2: -+ print >> flog, "****** Cannot find m2" -+ rv = 2 -+ else: -+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group() -+ rv = 1 -+ flog.close() -+ return rv -+ -+def scandir(root): -+ aconf = [] -+ ents = os.listdir(root) -+ ents.sort(key=str.lower) -+ for name in ents: -+ test_path = os.path.join(root, name) -+ if S_ISDIR(os.stat(test_path).st_mode): -+ files = os.listdir(test_path) -+ es_file = "?" -+ md5_file = "?" 
-+ for f in files: -+ (base, ext) = os.path.splitext(f) -+ if base[0] == '.': -+ pass -+ elif ext == ".bit" or ext == ".bin": -+ es_file = f -+ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): -+ if md5_file == "?": -+ md5_file = f -+ elif base[-3:] == "yuv": -+ md5_file = f -+ aconf.append((1, name, es_file, md5_file)) -+ return aconf -+ -+def runtest(name, tests): -+ if not tests: -+ return True -+ for t in tests: -+ if name[0:len(t)] == t or name.find("/" + t) != -1: -+ return True -+ return False -+ -+def doconf(csva, tests, test_root, vcodec): -+ unx_failures = [] -+ unx_success = [] -+ failures = 0 -+ successes = 0 -+ for a in csva: -+ exp_test = int(a[0]) -+ if (exp_test and runtest(a[1], tests)): -+ name = a[1] -+ print "==== ", name, -+ sys.stdout.flush() -+ -+ rv = testone(os.path.join(test_root, name), name, a[2], a[3], vcodec=vcodec) -+ if (rv == 0): -+ successes += 1 -+ else: -+ failures += 1 -+ -+ if (rv == 0): -+ if exp_test == 2: -+ print ": * OK *" -+ unx_success.append(name) -+ else: -+ print ": ok" -+ elif exp_test == 2 and rv == 1: -+ print ": fail" -+ elif exp_test == 3 and rv == 2: -+ # Call an expected "crash" an abort -+ print ": abort" -+ else: -+ unx_failures.append(name) -+ if rv == 1: -+ print ": * FAIL *" -+ elif (rv == 2) : -+ print ": * CRASH *" -+ elif (rv == 3) : -+ print ": * MD5 MISSING *" -+ else : -+ print ": * BANG *" -+ -+ if unx_failures or unx_success: -+ print "Unexpected Failures:", unx_failures -+ print "Unexpected Success: ", unx_success -+ else: -+ print "All tests normal:", successes, "ok,", failures, "failed" -+ -+ -+class ConfCSVDialect(csv.Dialect): -+ delimiter = ',' -+ doublequote = True -+ lineterminator = '\n' -+ quotechar='"' -+ quoting = csv.QUOTE_MINIMAL -+ skipinitialspace = True -+ strict = True -+ -+if __name__ == '__main__': -+ -+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") -+ argp.add_argument("tests", nargs='*') -+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") -+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") -+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") -+ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use") -+ args = argp.parse_args() -+ -+ if args.csvgen: -+ csv.writer(sys.stdout).writerows(scandir(args.test_root)) -+ exit(0) -+ -+ with open(args.csv, 'rt') as csvfile: -+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] -+ -+ -+ doconf(csva, args.tests, args.test_root, args.vcodec) -+ -diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py -new file mode 100755 -index 0000000000..8bb326943f ---- /dev/null -+++ b/pi-util/ffperf.py -@@ -0,0 +1,125 @@ -+#!/usr/bin/env python3 -+ -+import time -+import string -+import os -+import tempfile -+import subprocess -+import re -+import argparse -+import sys -+import csv -+from stat import * -+ -+class tstats: -+ close_threshold = 0.01 -+ -+ def __init__(self, stats_dict=None): -+ if stats_dict != None: -+ self.name = stats_dict["name"] -+ self.elapsed = float(stats_dict["elapsed"]) -+ self.user = float(stats_dict["user"]) -+ self.sys = float(stats_dict["sys"]) -+ -+ def times_str(self): -+ ctime = self.sys + self.user -+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) -+ -+ def dict(self): -+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} -+ -+ def is_close(self, other): -+ 
return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold -+ -+ def __lt__(self, other): -+ return self.elapsed < other.elapsed -+ def __gt__(self, other): -+ return self.elapsed > other.elapsed -+ -+ def time_file(name, prefix): -+ stats = tstats() -+ stats.name = name -+ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); -+ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, -+ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); -+ pinfo = os.wait4(cproc.pid, 0) -+ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); -+ stats.elapsed = end_time - start_time -+ stats.user = pinfo[2].ru_utime -+ stats.sys = pinfo[2].ru_stime -+ return stats -+ -+ -+def common_prefix(s1, s2): -+ for i in range(min(len(s1),len(s2))): -+ if s1[i] != s2[i]: -+ return s1[:i] -+ return s1[:i+1] -+ -+def main(): -+ global flog -+ -+ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog=""" -+To blank the screen before starting use "xdg-screensaver activate" -+(For some reason this doesn't seem to work from within python). -+""") -+ -+ argp.add_argument("streams", nargs='*') -+ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") -+ argp.add_argument("--csv_in", help="CSV input filename") -+ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") -+ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count") -+ -+ args = argp.parse_args() -+ -+ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) -+ csv_out.writeheader() -+ -+ stats_in = {} -+ if args.csv_in != None: -+ with open(args.csv_in, 'r', newline='') as f_in: -+ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} -+ -+ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") -+ -+ streams = args.streams -+ if not streams: -+ if not stats_in: -+ print ("No source streams specified") -+ return 1 -+ prefix = "" if args.prefix == None else args.prefix -+ streams = [k for k in stats_in] -+ elif args.prefix != None: -+ prefix = args.prefix -+ else: -+ prefix = streams[0] -+ for f in streams[1:]: -+ prefix = common_prefix(prefix, f) -+ pp = prefix.rpartition(os.sep) -+ prefix = pp[0] + pp[1] -+ streams = [s[len(prefix):] for s in streams] -+ -+ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): -+ print ("====", f) -+ -+ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) -+ for i in range(args.repeat): -+ t = tstats.time_file(f, prefix) -+ print ("...", t.times_str()) -+ if t0 > t: -+ t0 = t -+ -+ if t0.name in stats_in: -+ pstat = stats_in[t0.name] -+ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) -+ -+ csv_out.writerow(t0.dict()) -+ -+ print () -+ -+ return 0 -+ -+ -+if __name__ == '__main__': -+ exit(main()) -+ -diff --git a/pi-util/make_array.py b/pi-util/make_array.py -new file mode 100755 -index 0000000000..67b22d2d51 ---- /dev/null -+++ b/pi-util/make_array.py -@@ -0,0 +1,23 @@ -+#!/usr/bin/env python -+ -+# Usage -+# make_array file.bin -+# Produces file.h with array of bytes. 
-+# -+import sys -+for file in sys.argv[1:]: -+ prefix,suffix = file.split('.') -+ assert suffix=='bin' -+ name=prefix.split('/')[-1] -+ print 'Converting',file -+ with open(prefix+'.h','wb') as out: -+ print >>out, 'static const unsigned char',name,'[] = {' -+ with open(file,'rb') as fd: -+ i = 0 -+ for byte in fd.read(): -+ print >>out, '0x%02x, ' % ord(byte), -+ i = i + 1 -+ if i % 8 == 0: -+ print >>out, ' // %04x' % (i - 8) -+ print >>out,'};' -+ -diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py -new file mode 100644 -index 0000000000..e44cfa0c3c ---- /dev/null -+++ b/pi-util/perfcmp.py -@@ -0,0 +1,101 @@ -+#!/usr/bin/env python3 -+ -+import time -+import string -+import os -+import tempfile -+import subprocess -+import re -+import argparse -+import sys -+import csv -+from stat import * -+ -+class tstats: -+ close_threshold = 0.01 -+ -+ def __init__(self, stats_dict=None): -+ if stats_dict != None: -+ self.name = stats_dict["name"] -+ self.elapsed = float(stats_dict["elapsed"]) -+ self.user = float(stats_dict["user"]) -+ self.sys = float(stats_dict["sys"]) -+ -+ def times_str(self): -+ ctime = self.sys + self.user -+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) -+ -+ def dict(self): -+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} -+ -+ def is_close(self, other): -+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold -+ -+ def __lt__(self, other): -+ return self.elapsed < other.elapsed -+ def __gt__(self, other): -+ return self.elapsed > other.elapsed -+ -+ def time_file(name, prefix): -+ stats = tstats() -+ stats.name = name -+ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); -+ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, -+ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); -+ pinfo = os.wait4(cproc.pid, 0) -+ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); -+ stats.elapsed = end_time - start_time -+ stats.user = pinfo[2].ru_utime -+ stats.sys = pinfo[2].ru_stime -+ return stats -+ -+ -+def common_prefix(s1, s2): -+ for i in range(min(len(s1),len(s2))): -+ if s1[i] != s2[i]: -+ return s1[:i] -+ return s1[:i+1] -+ -+def main(): -+ argp = argparse.ArgumentParser(description="FFmpeg performance compare") -+ -+ argp.add_argument("stream0", help="CSV to compare") -+ argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare") -+ -+ args = argp.parse_args() -+ -+ with open(args.stream0, 'r', newline='') as f_in: -+ stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} -+ with open(args.stream1, 'r', newline='') as f_in: -+ stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} -+ -+ print (args.stream0, "<<-->>", args.stream1) -+ print () -+ -+ for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()): -+ if not (f in stats0) : -+ print (" XX :", f) -+ continue -+ if not (f in stats1) : -+ print (" XX :", f) -+ continue -+ -+ s0 = stats0[f] -+ s1 = stats1[f] -+ -+ pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0 -+ thresh = 0.3 -+ tc = 6 -+ -+ nchar = min(tc - 1, int(abs(pcent) / thresh)) -+ cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar -+ -+ print ("%6.2f %s%6.2f (%+5.2f) : %s" % -+ (s0.elapsed, cc, s1.elapsed, pcent, f)) -+ -+ return 0 -+ -+ -+if __name__ == '__main__': -+ exit(main()) -+ -diff --git a/pi-util/qem.sh b/pi-util/qem.sh -new file mode 100755 -index 
0000000000..a4dbb6eacd ---- /dev/null -+++ b/pi-util/qem.sh -@@ -0,0 +1,9 @@ -+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex -+QASM=python\ ../local/bin/qasm.py -+SRC_FILE=libavcodec/rpi_hevc_shader.qasm -+DST_BASE=shader -+ -+cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR -+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c -+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h -+ -diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py -new file mode 100755 -index 0000000000..5935a11ca5 ---- /dev/null -+++ b/pi-util/v3dusage.py -@@ -0,0 +1,128 @@ -+#!/usr/bin/env python -+ -+import sys -+import argparse -+import re -+ -+def do_logparse(logname): -+ -+ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ') -+ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$') -+ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$') -+ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$') -+ -+ ttotal = {'idle':0.0} -+ tstart = {} -+ qctotal = {} -+ qtstotal = {} -+ l2hits = {} -+ l2total = {} -+ time0 = None -+ idle_start = None -+ qpu_op_no = 0 -+ op_count = 0 -+ -+ with open(logname, "rt") as infile: -+ for line in infile: -+ match = rmatch.match(line) -+ if match: -+# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":" -+ time = float(match.group(1)) -+ unit = match.group(3) -+ opstart = not match.group(2) -+ optype = match.group(7) -+ hascb = match.group(8) != "0" -+ -+ if unit == 'qpu1': -+ unit = unit + "." + str(qpu_op_no) -+ if not opstart: -+ if hascb or optype == 'EXECUTE_SYNC': -+ qpu_op_no = 0 -+ else: -+ qpu_op_no += 1 -+ -+ # Ignore sync type -+ if optype == 'EXECUTE_SYNC': -+ continue -+ -+ if not time0: -+ time0 = time -+ -+ if opstart: -+ tstart[unit] = time; -+ elif unit in tstart: -+ op_count += 1 -+ if not unit in ttotal: -+ ttotal[unit] = 0.0 -+ ttotal[unit] += time - tstart[unit] -+ del tstart[unit] -+ -+ if not idle_start and not tstart: -+ idle_start = time -+ elif idle_start and tstart: -+ ttotal['idle'] += time - idle_start -+ idle_start = None -+ -+ match = rqcycle.match(line) -+ if match: -+ unit = "qpu1." + str(qpu_op_no) -+ if not unit in qctotal: -+ qctotal[unit] = 0 -+ qctotal[unit] += int(match.group(2)) -+ -+ match = rqtscycle.match(line) -+ if match: -+ unit = "qpu1." + str(qpu_op_no) -+ if not unit in qtstotal: -+ qtstotal[unit] = 0 -+ qtstotal[unit] += int(match.group(2)) -+ -+ match = rl2hits.match(line) -+ if match: -+ unit = "qpu1." 
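
/*
 * The accounting at the heart of v3dusage.py, restated as a C sketch:
 * a "start" log record stamps the unit's start time, the matching
 * "done" record adds the interval to the unit's busy total, and idle
 * time is whatever remains while no unit is mid-operation. Types and
 * names are illustrative only.
 */
typedef struct unit_acct {
    double start;   /* valid while busy */
    double total;   /* accumulated busy seconds */
    int    busy;
} unit_acct;

static void on_log_event(unit_acct *u, double t, int is_start)
{
    if (is_start) {
        u->start = t;
        u->busy  = 1;
    } else if (u->busy) {
        u->total += t - u->start;
        u->busy   = 0;
    }
}
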
+ str(qpu_op_no) -+ if not unit in l2total: -+ l2total[unit] = 0 -+ l2hits[unit] = 0 -+ l2total[unit] += int(match.group(3)) -+ if match.group(2) == "hits": -+ l2hits[unit] += int(match.group(3)) -+ -+ -+ if not time0: -+ print "No v3d profile records found" -+ else: -+ tlogged = time - time0 -+ -+ print "Logged time:", tlogged, " Op count:", op_count -+ for unit in sorted(ttotal): -+ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged) -+ print -+ for unit in sorted(qctotal): -+ if not unit in qtstotal: -+ qtstotal[unit] = 0; -+ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit]) -+ if unit in l2total: -+ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit]) -+ -+ -+ -+if __name__ == '__main__': -+ argp = argparse.ArgumentParser( -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ description="QPU/VPU perf summary from VC logging", -+ epilog = """ -+Will also summarise TMU stalls if logging requests set in qpu noflush param -+in the profiled code. -+ -+Example use: -+ vcgencmd set_logging level=0xc0 -+ -+ sudo vcdbg log msg >& t.log -+ v3dusage.py t.log -+""") -+ -+ argp.add_argument("logfile") -+ args = argp.parse_args() -+ -+ do_logparse(args.logfile) -+ --- -2.20.1 - -From 8dda09dc660f4d29cc560e6ef82eb0bd82b52a0a Mon Sep 17 00:00:00 2001 -From: John Cox -Date: Tue, 9 Jun 2020 14:52:07 +0100 -Subject: [PATCH] Fix Pi3 hevc_rpi cache flush overflow - -Cache flushs rounded height up to CTB size (in this case 64) and -failed to limit by pic height. The code that actually operated -on that area applied the limits correctly so nothing was corrupted. -This fix, in fact, marginally simplifies the code as it ends up being -able to remove a couple of later limit checks having got the numbers -right in the first place. ---- - libavcodec/rpi_hevc_filter.c | 12 ++++++------ - libavcodec/rpi_hevcdec.c | 3 +++ - 2 files changed, 9 insertions(+), 6 deletions(-) - -diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c -index c8a22bd3d8..5125d1eb6b 100644 ---- a/libavcodec/rpi_hevc_filter.c -+++ b/libavcodec/rpi_hevc_filter.c -@@ -624,10 +624,10 @@ static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, c - const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; - const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; - const unsigned int ctb_size = (1 << log2_ctb_size); -- const unsigned int cb_r = FFMIN(bounds.x + bounds.w, s->ps.sps->width) - (end_x ? 0 : 1); -+ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1); - const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; - const DBParams * cb_dbp = s->deblock + ctb_n; -- const unsigned int b_b = FFMIN(bounds.y + bounds.h, s->ps.sps->height) - (end_y ? 0 : 8); -+ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8); - - unsigned int cb_x; - -@@ -734,10 +734,10 @@ static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, - const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; - const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; - const unsigned int ctb_size = (1 << log2_ctb_size); -- const unsigned int cb_r = FFMIN(bounds.x + bounds.w, s->ps.sps->width) - (end_x ? 0 : 8); -+ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 
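
/*
 * The shape of the fix described in the commit message above, as a
 * sketch: CTB-size rounding can push the generated block bounds past
 * the picture edge, so the width/height are clamped once where the
 * bounds are created, which lets the per-consumer FFMIN() limits go
 * away. Names are illustrative.
 */
static void clamp_job_bounds(unsigned int *w, unsigned int *h,
                             unsigned int x, unsigned int y,
                             unsigned int pic_w, unsigned int pic_h)
{
    if (*w > pic_w - x)
        *w = pic_w - x;
    if (*h > pic_h - y)
        *h = pic_h - y;
}
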
0 : 8); - const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; - const DBParams * dbp = s->deblock + ctb_n; -- const unsigned int b_b = FFMIN(bounds.y + bounds.h, s->ps.sps->height) - (end_y ? 0 : 8); -+ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8); - const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; - const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; - -@@ -1129,8 +1129,8 @@ int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk boun - const int ctb_size = (1 << s->ps.sps->log2_ctb_size); - int x, y; - -- const unsigned int br = FFMIN(bounds.x + bounds.w, s->ps.sps->width); -- const unsigned int bb = FFMIN(bounds.y + bounds.h, s->ps.sps->height); -+ const unsigned int br = bounds.x + bounds.w; -+ const unsigned int bb = bounds.y + bounds.h; - - const int x_end = (br >= s->ps.sps->width); - const int y_end = (bb >= s->ps.sps->height); -diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c -index 6d92c1dceb..2cc2ffbd13 100644 ---- a/libavcodec/rpi_hevcdec.c -+++ b/libavcodec/rpi_hevcdec.c -@@ -3915,6 +3915,9 @@ static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb - bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; - bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; - bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; -+ -+ bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x); -+ bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y); - } - - #if RPI_PASSES == 2 diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0001-avutil-add-av_buffer_pool_flush.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0001-avutil-add-av_buffer_pool_flush.patch deleted file mode 100644 index 119a6a33c4..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0001-avutil-add-av_buffer_pool_flush.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 3710124dd57ede7588884ecc86cbcaee5d530498 Mon Sep 17 00:00:00 2001 -From: Jonas Karlman -Date: Mon, 3 Dec 2018 23:48:04 +0100 -Subject: [PATCH 01/12] avutil: add av_buffer_pool_flush() - -Signed-off-by: Jonas Karlman ---- - libavutil/buffer.c | 13 +++++++++++++ - libavutil/buffer.h | 5 +++++ - 2 files changed, 18 insertions(+) - -diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 8d1aa5fa84..58160f62f3 100644 ---- a/libavutil/buffer.c -+++ b/libavutil/buffer.c -@@ -272,6 +272,19 @@ static void buffer_pool_free(AVBufferPool *pool) - av_freep(&pool); - } - -+void av_buffer_pool_flush(AVBufferPool *pool) -+{ -+ ff_mutex_lock(&pool->mutex); -+ while (pool->pool) { -+ BufferPoolEntry *buf = pool->pool; -+ pool->pool = buf->next; -+ -+ buf->free(buf->opaque, buf->data); -+ av_freep(&buf); -+ } -+ ff_mutex_unlock(&pool->mutex); -+} -+ - void av_buffer_pool_uninit(AVBufferPool **ppool) - { - AVBufferPool *pool; -diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 73b6bd0b14..0678fa4bea 100644 ---- a/libavutil/buffer.h -+++ b/libavutil/buffer.h -@@ -266,6 +266,11 @@ AVBufferPool *av_buffer_pool_init2(int size, void *opaque, - AVBufferRef* (*alloc)(void *opaque, int size), - void (*pool_free)(void *opaque)); - -+/** -+ * Free all available buffers in a buffer pool. -+ */ -+ void av_buffer_pool_flush(AVBufferPool *pool); -+ - /** - * Mark the pool as being available for freeing. It will actually be freed only - * once all the allocated buffers associated with the pool are released. 
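
/*
 * A minimal usage sketch for the av_buffer_pool_flush() helper this
 * patch adds: it frees only the buffers sitting idle in the pool;
 * buffers still referenced elsewhere are untouched and return to the
 * pool later. The surrounding setup is illustrative.
 */
#include "libavutil/buffer.h"

static void flush_demo(void)
{
    AVBufferPool *pool = av_buffer_pool_init(4096, av_buffer_allocz);
    AVBufferRef  *buf  = av_buffer_pool_get(pool);

    av_buffer_unref(&buf);        /* buffer becomes available in the pool */
    av_buffer_pool_flush(pool);   /* now it is actually freed */
    av_buffer_pool_uninit(&pool); /* pool destroyed once nothing uses it */
}
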
Thus it diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0002-Add-common-V4L2-request-API-code.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0002-Add-common-V4L2-request-API-code.patch deleted file mode 100644 index 31a8c81d3b..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0002-Add-common-V4L2-request-API-code.patch +++ /dev/null @@ -1,1139 +0,0 @@ -From ca0440a82038ebe56c467d89787291f55535d03d Mon Sep 17 00:00:00 2001 -From: Jonas Karlman -Date: Sat, 15 Dec 2018 22:32:16 +0100 -Subject: [PATCH 02/12] Add common V4L2 request API code - -Signed-off-by: Jonas Karlman ---- - configure | 12 + - libavcodec/Makefile | 1 + - libavcodec/hwaccel.h | 2 + - libavcodec/v4l2_request.c | 943 ++++++++++++++++++++++++++++++++++++++ - libavcodec/v4l2_request.h | 72 +++ - 5 files changed, 1030 insertions(+) - create mode 100644 libavcodec/v4l2_request.c - create mode 100644 libavcodec/v4l2_request.h - -diff --git a/configure b/configure -index 34c2adb4a4..6b41344dfd 100755 ---- a/configure -+++ b/configure -@@ -271,6 +271,7 @@ External library support: - --enable-libtls enable LibreSSL (via libtls), needed for https support - if openssl, gnutls or mbedtls is not used [no] - --enable-libtwolame enable MP2 encoding via libtwolame [no] -+ --enable-libudev enable libudev [no] - --enable-libv4l2 enable libv4l2/v4l-utils [no] - --enable-libvidstab enable video stabilization using vid.stab [no] - --enable-libvmaf enable vmaf filter via libvmaf [no] -@@ -337,6 +338,7 @@ External library support: - --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] - --enable-rkmpp enable Rockchip Media Process Platform code [no] - --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] -+ --enable-v4l2-request enable V4L2 request API code [no] - --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] - --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] - --disable-videotoolbox disable VideoToolbox code [autodetect] -@@ -1797,6 +1799,7 @@ EXTERNAL_LIBRARY_LIST=" - libtesseract - libtheora - libtwolame -+ libudev - libv4l2 - libvorbis - libvpx -@@ -1851,6 +1854,7 @@ HWACCEL_LIBRARY_LIST=" - mmal - omx - opencl -+ v4l2_request - " - - DOCUMENT_LIST=" -@@ -2873,6 +2877,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" - dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" - ffnvcodec_deps_any="libdl LoadLibrary" - nvdec_deps="ffnvcodec" -+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" - vaapi_x11_deps="xlib" - videotoolbox_hwaccel_deps="videotoolbox pthreads" - videotoolbox_hwaccel_extralibs="-framework QuartzCore" -@@ -6270,6 +6275,7 @@ enabled libtls && require_pkg_config libtls libtls tls.h tls_configur - enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame && - { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || - die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } -+enabled libudev && require_pkg_config libudev libudev libudev.h udev_new - enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl - enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit - enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 1.3.9" libvmaf.h compute_vmaf -@@ -6365,6 +6371,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r - { enabled libdrm || - 
die "ERROR: rkmpp requires --enable-libdrm"; } - } -+enabled v4l2_request && { enabled libdrm || -+ die "ERROR: v4l2-request requires --enable-libdrm"; } && -+ { enabled libudev || -+ die "ERROR: v4l2-request requires --enable-libudev"; } - enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init - - -@@ -6444,6 +6454,8 @@ check_cc h264_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_H264;" - check_cc vp8_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP8;" - check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" - -+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns -+ - check_headers sys/videoio.h - test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete - -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 3cd73fbcc6..9e847eeadc 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -147,6 +147,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o - OBJS-$(CONFIG_VP56DSP) += vp56dsp.o - OBJS-$(CONFIG_VP8DSP) += vp8dsp.o - OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o -+OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_request.o - OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o - OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o - -diff --git a/libavcodec/hwaccel.h b/libavcodec/hwaccel.h -index 3aaa92571c..2eefc91e7e 100644 ---- a/libavcodec/hwaccel.h -+++ b/libavcodec/hwaccel.h -@@ -80,5 +80,7 @@ typedef struct AVCodecHWConfigInternal { - HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel) - #define HWACCEL_XVMC(codec) \ - HW_CONFIG_HWACCEL(0, 0, 1, XVMC, NONE, ff_ ## codec ## _xvmc_hwaccel) -+#define HWACCEL_V4L2REQUEST(codec) \ -+ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) - - #endif /* AVCODEC_HWACCEL_H */ -diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c -new file mode 100644 -index 0000000000..1dabf77689 ---- /dev/null -+++ b/libavcodec/v4l2_request.c -@@ -0,0 +1,943 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include "decode.h" -+#include "internal.h" -+#include "v4l2_request.h" -+ -+uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame) -+{ -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ return req ? 
v4l2_timeval_to_ns(&req->capture.buffer.timestamp) : 0; -+} -+ -+int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame) -+{ -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ memset(&req->drm, 0, sizeof(AVDRMFrameDescriptor)); -+ req->output.used = 0; -+ return 0; -+} -+ -+int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size) -+{ -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ memcpy(req->output.addr + req->output.used, data, size); -+ req->output.used += size; -+ return 0; -+} -+ -+static int v4l2_request_set_controls(V4L2RequestContext *ctx, int request_fd, struct v4l2_ext_control *control, int count) -+{ -+ struct v4l2_ext_controls controls = { -+ .controls = control, -+ .count = count, -+ .request_fd = request_fd, -+ .which = (request_fd >= 0) ? V4L2_CTRL_WHICH_REQUEST_VAL : 0, -+ }; -+ -+ if (!control || !count) -+ return 0; -+ -+ return ioctl(ctx->video_fd, VIDIOC_S_EXT_CTRLS, &controls); -+} -+ -+int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ -+ ret = v4l2_request_set_controls(ctx, -1, control, count); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); -+ return AVERROR(EINVAL); -+ } -+ -+ return ret; -+} -+ -+int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id) -+{ -+ int ret; -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ struct v4l2_queryctrl control = { -+ .id = id, -+ }; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCTRL, &control); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: query control failed, %s (%d)\n", __func__, strerror(errno), errno); -+ return AVERROR(EINVAL); -+ } -+ -+ return control.default_value; -+} -+ -+static int v4l2_request_queue_buffer(V4L2RequestContext *ctx, int request_fd, V4L2RequestBuffer *buf, uint32_t flags) -+{ -+ struct v4l2_plane planes[1] = {}; -+ struct v4l2_buffer buffer = { -+ .type = buf->buffer.type, -+ .memory = buf->buffer.memory, -+ .index = buf->index, -+ .timestamp.tv_usec = buf->index + 1, -+ .bytesused = buf->used, -+ .request_fd = request_fd, -+ .flags = ((request_fd >= 0) ? 
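
/*
 * A distilled sketch of the control plumbing above: stateless decode
 * parameters travel as V4L2 extended controls, and when a request fd
 * is given they are staged in that request (V4L2_CTRL_WHICH_REQUEST_VAL)
 * instead of being applied immediately. Error handling trimmed; the
 * fds are assumed open.
 */
#include <linux/videodev2.h>
#include <sys/ioctl.h>

static int stage_control(int video_fd, int request_fd,
                         unsigned int id, void *ptr, unsigned int size)
{
    struct v4l2_ext_control ctrl = {
        .id   = id,
        .ptr  = ptr,
        .size = size,
    };
    struct v4l2_ext_controls ctrls = {
        .which      = V4L2_CTRL_WHICH_REQUEST_VAL,
        .request_fd = request_fd,
        .controls   = &ctrl,
        .count      = 1,
    };

    return ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &ctrls);
}
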
V4L2_BUF_FLAG_REQUEST_FD : 0) | flags, -+ }; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { -+ planes[0].bytesused = buf->used; -+ buffer.bytesused = 0; -+ buffer.length = 1; -+ buffer.m.planes = planes; -+ } -+ -+ return ioctl(ctx->video_fd, VIDIOC_QBUF, &buffer); -+} -+ -+static int v4l2_request_dequeue_buffer(V4L2RequestContext *ctx, V4L2RequestBuffer *buf) -+{ -+ int ret; -+ struct v4l2_plane planes[1] = {}; -+ struct v4l2_buffer buffer = { -+ .type = buf->buffer.type, -+ .memory = buf->buffer.memory, -+ .index = buf->index, -+ }; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { -+ buffer.length = 1; -+ buffer.m.planes = planes; -+ } -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_DQBUF, &buffer); -+ if (ret < 0) -+ return ret; -+ -+ buf->buffer.timestamp = buffer.timestamp; -+ return 0; -+} -+ -+const uint32_t v4l2_request_capture_pixelformats[] = { -+ V4L2_PIX_FMT_NV12, -+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED -+ V4L2_PIX_FMT_SUNXI_TILED_NV12, -+#endif -+}; -+ -+static int v4l2_request_set_drm_descriptor(V4L2RequestDescriptor *req, struct v4l2_format *format) -+{ -+ AVDRMFrameDescriptor *desc = &req->drm; -+ AVDRMLayerDescriptor *layer = &desc->layers[0]; -+ uint32_t pixelformat = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat; -+ -+ switch (pixelformat) { -+ case V4L2_PIX_FMT_NV12: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED -+ case V4L2_PIX_FMT_SUNXI_TILED_NV12: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; -+ break; -+#endif -+ default: -+ return -1; -+ } -+ -+ desc->nb_objects = 1; -+ desc->objects[0].fd = req->capture.fd; -+ desc->objects[0].size = req->capture.size; -+ -+ desc->nb_layers = 1; -+ layer->nb_planes = 2; -+ -+ layer->planes[0].object_index = 0; -+ layer->planes[0].offset = 0; -+ layer->planes[0].pitch = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.plane_fmt[0].bytesperline : format->fmt.pix.bytesperline; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = layer->planes[0].pitch * (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.height : format->fmt.pix.height); -+ layer->planes[1].pitch = layer->planes[0].pitch; -+ -+ return 0; -+} -+ -+static int v4l2_request_queue_decode(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ struct timeval tv = { 2, 0 }; -+ fd_set except_fds; -+ int ret; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p used=%u controls=%d index=%d fd=%d request_fd=%d first_slice=%d last_slice=%d\n", __func__, avctx, req->output.used, count, req->capture.index, req->capture.fd, req->request_fd, first_slice, last_slice); -+ -+ ret = v4l2_request_set_controls(ctx, req->request_fd, control, count); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed for request %d, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ -+ memset(req->output.addr + req->output.used, 0, AV_INPUT_BUFFER_PADDING_SIZE); -+ -+ ret = v4l2_request_queue_buffer(ctx, req->request_fd, &req->output, last_slice ? 
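
/*
 * Sketch of the timestamp convention used above: every capture buffer
 * is tagged at queue time with a synthetic timestamp derived from its
 * index, and decoded reference frames are later named by that value
 * converted to nanoseconds. v4l2_timeval_to_ns() comes from the kernel
 * headers; the configure check in this patch verifies it exists.
 */
#include <linux/videodev2.h>

static void tag_capture_buffer(struct v4l2_buffer *buf, int index)
{
    buf->timestamp.tv_usec = index + 1;          /* unique and non-zero */
}

static __u64 capture_reference_ts(const struct v4l2_buffer *buf)
{
    return v4l2_timeval_to_ns(&buf->timestamp);  /* what *_ref_ts fields take */
}
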
0 : V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: queue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ -+ if (first_slice) { -+ ret = v4l2_request_queue_buffer(ctx, -1, &req->capture, 0); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: queue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ } -+ -+ // NOTE: do we need to dequeue when request fails/timeout? -+ -+ // 4. queue request and wait -+ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_QUEUE, NULL); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: queue request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ goto fail; -+ } -+ -+ FD_ZERO(&except_fds); -+ FD_SET(req->request_fd, &except_fds); -+ -+ ret = select(req->request_fd + 1, NULL, NULL, &except_fds, &tv); -+ if (ret == 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: request %d timeout\n", __func__, req->request_fd); -+ goto fail; -+ } else if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: select request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ goto fail; -+ } -+ -+ ret = v4l2_request_dequeue_buffer(ctx, &req->output); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ -+ if (last_slice) { -+ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ } -+ -+ // TODO: check errors -+ // buffer.flags & V4L2_BUF_FLAG_ERROR -+ -+ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ -+ if (last_slice) -+ return v4l2_request_set_drm_descriptor(req, &ctx->format); -+ -+ return 0; -+ -+fail: -+ ret = v4l2_request_dequeue_buffer(ctx, &req->output); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); -+ -+ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); -+ -+ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ -+ return -1; -+} -+ -+int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) -+{ -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ -+ // fall back to queue each slice as a full frame -+ if ((req->output.capabilities & V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) != V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) -+ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); -+ -+ return v4l2_request_queue_decode(avctx, frame, control, count, first_slice, 
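
/*
 * The request lifecycle implemented by v4l2_request_queue_decode()
 * above, distilled: queue the request once its controls and output
 * buffer are staged, wait for completion (signalled as an exceptional
 * condition on the request fd, hence the except-set select()), then
 * reinit the request for reuse. The fd is assumed valid; error paths
 * are trimmed.
 */
#include <linux/media.h>
#include <sys/ioctl.h>
#include <sys/select.h>

static int run_request(int request_fd)
{
    struct timeval tv = { 2, 0 };       /* same 2 second timeout as above */
    fd_set except_fds;

    if (ioctl(request_fd, MEDIA_REQUEST_IOC_QUEUE, NULL) < 0)
        return -1;

    FD_ZERO(&except_fds);
    FD_SET(request_fd, &except_fds);
    if (select(request_fd + 1, NULL, NULL, &except_fds, &tv) <= 0)
        return -1;                      /* select error or timeout */

    return ioctl(request_fd, MEDIA_REQUEST_IOC_REINIT, NULL);
}
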
last_slice); -+} -+ -+int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count) -+{ -+ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); -+} -+ -+static int v4l2_request_try_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ struct v4l2_fmtdesc fmtdesc = { -+ .index = 0, -+ .type = type, -+ }; -+ -+ if (V4L2_TYPE_IS_OUTPUT(type)) { -+ struct v4l2_create_buffers buffers = { -+ .count = 0, -+ .memory = V4L2_MEMORY_MMAP, -+ .format.type = type, -+ }; -+ -+ if (ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers) < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); -+ return -1; -+ } -+ -+ if ((buffers.capabilities & V4L2_BUF_CAP_SUPPORTS_REQUESTS) != V4L2_BUF_CAP_SUPPORTS_REQUESTS) { -+ av_log(avctx, AV_LOG_INFO, "%s: output buffer type do not support requests, capabilities %u\n", __func__, buffers.capabilities); -+ return -1; -+ } -+ } -+ -+ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { -+ if (fmtdesc.pixelformat == pixelformat) -+ return 0; -+ -+ fmtdesc.index++; -+ } -+ -+ av_log(avctx, AV_LOG_INFO, "%s: pixelformat %u not supported for type %u\n", __func__, pixelformat, type); -+ return -1; -+} -+ -+static int v4l2_request_set_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat, uint32_t buffersize) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ struct v4l2_format format = { -+ .type = type, -+ }; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { -+ format.fmt.pix_mp.width = avctx->coded_width; -+ format.fmt.pix_mp.height = avctx->coded_height; -+ format.fmt.pix_mp.pixelformat = pixelformat; -+ format.fmt.pix_mp.plane_fmt[0].sizeimage = buffersize; -+ format.fmt.pix_mp.num_planes = 1; -+ } else { -+ format.fmt.pix.width = avctx->coded_width; -+ format.fmt.pix.height = avctx->coded_height; -+ format.fmt.pix.pixelformat = pixelformat; -+ format.fmt.pix.sizeimage = buffersize; -+ } -+ -+ return ioctl(ctx->video_fd, VIDIOC_S_FMT, &format); -+} -+ -+static int v4l2_request_select_capture_format(AVCodecContext *avctx) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ enum v4l2_buf_type type = ctx->format.type; -+ -+#if 0 -+ struct v4l2_format format = { -+ .type = type, -+ }; -+ struct v4l2_fmtdesc fmtdesc = { -+ .index = 0, -+ .type = type, -+ }; -+ uint32_t pixelformat; -+ int i; -+ -+ if (ioctl(ctx->video_fd, VIDIOC_G_FMT, &format) < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno); -+ return -1; -+ } -+ -+ pixelformat = V4L2_TYPE_IS_MULTIPLANAR(type) ? 
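
/*
 * The per-slice queueing policy above, as a sketch: when the driver
 * advertises V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF, every slice
 * but the last is queued with V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF so
 * all slices of a frame decode into one capture buffer; otherwise each
 * slice is submitted as if it were a complete frame.
 */
#include <linux/videodev2.h>

static __u32 slice_queue_flags(__u32 queue_caps, int last_slice)
{
    if (!(queue_caps & V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF))
        return 0;       /* fallback: one slice per decode request */

    return last_slice ? 0 : V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
}
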
format.fmt.pix_mp.pixelformat : format.fmt.pix.pixelformat; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { -+ if (pixelformat == v4l2_request_capture_pixelformats[i]) -+ return v4l2_request_set_format(avctx, type, pixelformat, 0); -+ } -+ -+ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { -+ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { -+ if (fmtdesc.pixelformat == v4l2_request_capture_pixelformats[i]) -+ return v4l2_request_set_format(avctx, type, fmtdesc.pixelformat, 0); -+ } -+ -+ fmtdesc.index++; -+ } -+#else -+ for (int i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { -+ uint32_t pixelformat = v4l2_request_capture_pixelformats[i]; -+ if (!v4l2_request_try_format(avctx, type, pixelformat)) -+ return v4l2_request_set_format(avctx, type, pixelformat, 0); -+ } -+#endif -+ -+ return -1; -+} -+ -+static int v4l2_request_probe_video_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret = AVERROR(EINVAL); -+ struct v4l2_capability capability = {0}; -+ unsigned int capabilities = 0; -+ -+ const char *path = udev_device_get_devnode(device); -+ if (!path) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get video device devnode failed\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ctx->video_fd = open(path, O_RDWR | O_NONBLOCK, 0); -+ if (ctx->video_fd < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCAP, &capability); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get video capability failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ if (capability.capabilities & V4L2_CAP_DEVICE_CAPS) -+ capabilities = capability.device_caps; -+ else -+ capabilities = capability.capabilities; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s capabilities=%u\n", __func__, avctx, ctx, path, capabilities); -+ -+ if ((capabilities & V4L2_CAP_STREAMING) != V4L2_CAP_STREAMING) { -+ av_log(avctx, AV_LOG_ERROR, "%s: missing required streaming capability\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) == V4L2_CAP_VIDEO_M2M_MPLANE) { -+ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -+ } else if ((capabilities & V4L2_CAP_VIDEO_M2M) == V4L2_CAP_VIDEO_M2M) { -+ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ } else { -+ av_log(avctx, AV_LOG_ERROR, "%s: missing required mem2mem capability\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = v4l2_request_try_format(avctx, ctx->output_type, pixelformat); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: try output format failed\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = v4l2_request_set_controls(ctx, -1, control, count); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = v4l2_request_set_format(avctx, ctx->output_type, pixelformat, buffersize); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: set output format 
failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = v4l2_request_select_capture_format(avctx); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_WARNING, "%s: select capture format failed\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ return 0; -+ -+fail: -+ if (ctx->video_fd >= 0) { -+ close(ctx->video_fd); -+ ctx->video_fd = -1; -+ } -+ return ret; -+} -+ -+static int v4l2_request_init_context(AVCodecContext *avctx) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &ctx->format); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { -+ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, ctx->format.fmt.pix_mp.pixelformat, ctx->format.fmt.pix_mp.width, ctx->format.fmt.pix_mp.height, ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline, ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage, ctx->format.fmt.pix_mp.num_planes); -+ } else { -+ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, ctx->format.fmt.pix.pixelformat, ctx->format.fmt.pix.width, ctx->format.fmt.pix.height, ctx->format.fmt.pix.bytesperline, ctx->format.fmt.pix.sizeimage); -+ } -+ -+ ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM); -+ if (ret < 0) -+ goto fail; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->output_type); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: output stream on failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->format.type); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: capture stream on failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ return 0; -+ -+fail: -+ ff_v4l2_request_uninit(avctx); -+ return ret; -+} -+ -+static int v4l2_request_probe_media_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ struct media_device_info device_info = {0}; -+ struct media_v2_topology topology = {0}; -+ struct media_v2_interface *interfaces = NULL; -+ struct udev *udev = udev_device_get_udev(device); -+ struct udev_device *video_device; -+ dev_t devnum; -+ -+ const char *path = udev_device_get_devnode(device); -+ if (!path) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get media device devnode failed\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ctx->media_fd = open(path, O_RDWR, 0); -+ if (ctx->media_fd < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = ioctl(ctx->media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get media device info failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s driver=%s\n", __func__, avctx, ctx, path, device_info.driver); -+ -+ ret = ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); -+ 
if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ if (topology.num_interfaces <= 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: media device has no interfaces\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ interfaces = av_mallocz(topology.num_interfaces * sizeof(struct media_v2_interface)); -+ if (!interfaces) { -+ av_log(avctx, AV_LOG_ERROR, "%s: allocating media interface struct failed\n", __func__); -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; -+ ret = ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = AVERROR(EINVAL); -+ for (int i = 0; i < topology.num_interfaces; i++) { -+ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) -+ continue; -+ -+ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); -+ video_device = udev_device_new_from_devnum(udev, 'c', devnum); -+ if (!video_device) { -+ av_log(avctx, AV_LOG_ERROR, "%s: video_device=%p\n", __func__, video_device); -+ continue; -+ } -+ -+ ret = v4l2_request_probe_video_device(video_device, avctx, pixelformat, buffersize, control, count); -+ udev_device_unref(video_device); -+ -+ if (!ret) -+ break; -+ } -+ -+ av_freep(&interfaces); -+ return ret; -+ -+fail: -+ av_freep(&interfaces); -+ if (ctx->media_fd >= 0) { -+ close(ctx->media_fd); -+ ctx->media_fd = -1; -+ } -+ return ret; -+} -+ -+int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret = AVERROR(EINVAL); -+ struct udev *udev; -+ struct udev_enumerate *enumerate; -+ struct udev_list_entry *devices; -+ struct udev_list_entry *entry; -+ struct udev_device *device; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p hw_device_ctx=%p hw_frames_ctx=%p\n", __func__, avctx, avctx->hw_device_ctx, avctx->hw_frames_ctx); -+ -+ ctx->media_fd = -1; -+ ctx->video_fd = -1; -+ -+ udev = udev_new(); -+ if (!udev) { -+ av_log(avctx, AV_LOG_ERROR, "%s: allocating udev context failed\n", __func__); -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ enumerate = udev_enumerate_new(udev); -+ if (!enumerate) { -+ av_log(avctx, AV_LOG_ERROR, "%s: allocating udev enumerator failed\n", __func__); -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ udev_enumerate_add_match_subsystem(enumerate, "media"); -+ udev_enumerate_scan_devices(enumerate); -+ -+ devices = udev_enumerate_get_list_entry(enumerate); -+ udev_list_entry_foreach(entry, devices) { -+ const char *path = udev_list_entry_get_name(entry); -+ if (!path) -+ continue; -+ -+ device = udev_device_new_from_syspath(udev, path); -+ if (!device) -+ continue; -+ -+ ret = v4l2_request_probe_media_device(device, avctx, pixelformat, buffersize, control, count); -+ udev_device_unref(device); -+ -+ if (!ret) -+ break; -+ } -+ -+ udev_enumerate_unref(enumerate); -+ -+ if (!ret) -+ ret = v4l2_request_init_context(avctx); -+ -+fail: -+ udev_unref(udev); -+ return ret; -+} -+ -+int ff_v4l2_request_uninit(AVCodecContext *avctx) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p\n", __func__, avctx, ctx); -+ -+ if 
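
/*
 * The two-pass MEDIA_IOC_G_TOPOLOGY pattern used above, in isolation:
 * the first call only reports how many interfaces exist, the caller
 * allocates that many entries, and the second call fills them in.
 * Decoder devnodes are then the entries whose intf_type equals
 * MEDIA_INTF_T_V4L_VIDEO. Error handling is minimal here.
 */
#include <linux/media.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct media_v2_interface *read_interfaces(int media_fd, __u32 *count)
{
    struct media_v2_topology topology = {0};
    struct media_v2_interface *ifaces;

    if (ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology) < 0 ||
        !topology.num_interfaces)
        return NULL;

    ifaces = calloc(topology.num_interfaces, sizeof(*ifaces));
    if (!ifaces)
        return NULL;

    topology.ptr_interfaces = (__u64)(uintptr_t)ifaces;
    if (ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology) < 0) {
        free(ifaces);
        return NULL;
    }

    *count = topology.num_interfaces;
    return ifaces;
}
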
(ctx->video_fd >= 0) { -+ ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->output_type); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "%s: output stream off failed, %s (%d)\n", __func__, strerror(errno), errno); -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->format.type); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "%s: capture stream off failed, %s (%d)\n", __func__, strerror(errno), errno); -+ } -+ -+ if (avctx->hw_frames_ctx) { -+ AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data; -+ av_buffer_pool_flush(hwfc->pool); -+ } -+ -+ if (ctx->video_fd >= 0) -+ close(ctx->video_fd); -+ -+ if (ctx->media_fd >= 0) -+ close(ctx->media_fd); -+ -+ return 0; -+} -+ -+static int v4l2_request_buffer_alloc(AVCodecContext *avctx, V4L2RequestBuffer *buf, enum v4l2_buf_type type) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ struct v4l2_plane planes[1] = {}; -+ struct v4l2_create_buffers buffers = { -+ .count = 1, -+ .memory = V4L2_MEMORY_MMAP, -+ .format.type = type, -+ }; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p buf=%p type=%u\n", __func__, avctx, buf, type); -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &buffers.format); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get format failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); -+ return ret; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buffers.format.type)) { -+ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, buffers.format.fmt.pix_mp.pixelformat, buffers.format.fmt.pix_mp.width, buffers.format.fmt.pix_mp.height, buffers.format.fmt.pix_mp.plane_fmt[0].bytesperline, buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage, buffers.format.fmt.pix_mp.num_planes); -+ } else { -+ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, buffers.format.fmt.pix.pixelformat, buffers.format.fmt.pix.width, buffers.format.fmt.pix.height, buffers.format.fmt.pix.bytesperline, buffers.format.fmt.pix.sizeimage); -+ } -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); -+ return ret; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { -+ buf->width = buffers.format.fmt.pix_mp.width; -+ buf->height = buffers.format.fmt.pix_mp.height; -+ buf->size = buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage; -+ buf->buffer.length = 1; -+ buf->buffer.m.planes = planes; -+ } else { -+ buf->width = buffers.format.fmt.pix.width; -+ buf->height = buffers.format.fmt.pix.height; -+ buf->size = buffers.format.fmt.pix.sizeimage; -+ } -+ -+ buf->index = buffers.index; -+ buf->capabilities = buffers.capabilities; -+ buf->used = 0; -+ -+ buf->buffer.type = type; -+ buf->buffer.memory = V4L2_MEMORY_MMAP; -+ buf->buffer.index = buf->index; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_QUERYBUF, &buf->buffer); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: query buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno); -+ return ret; -+ } -+ -+ buf->buffer.timestamp.tv_usec = buf->index + 1; -+ -+ if (V4L2_TYPE_IS_OUTPUT(type)) { -+ void *addr = mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->video_fd, V4L2_TYPE_IS_MULTIPLANAR(type) ? 
buf->buffer.m.planes[0].m.mem_offset : buf->buffer.m.offset); -+ if (addr == MAP_FAILED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: mmap failed, %s (%d)\n", __func__, strerror(errno), errno); -+ return -1; -+ } -+ -+ buf->addr = (uint8_t*)addr; -+ } else { -+ struct v4l2_exportbuffer exportbuffer = { -+ .type = type, -+ .index = buf->index, -+ .flags = O_RDONLY, -+ }; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_EXPBUF, &exportbuffer); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: export buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno); -+ return ret; -+ } -+ -+ buf->fd = exportbuffer.fd; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); -+ return 0; -+} -+ -+static void v4l2_request_buffer_free(V4L2RequestBuffer *buf) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); -+ -+ if (buf->addr) -+ munmap(buf->addr, buf->size); -+ -+ if (buf->fd >= 0) -+ close(buf->fd); -+} -+ -+static void v4l2_request_frame_free(void *opaque, uint8_t *data) -+{ -+ AVCodecContext *avctx = opaque; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)data; -+ -+ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p request_fd=%d\n", __func__, avctx, data, req->request_fd); -+ -+ if (req->request_fd >= 0) -+ close(req->request_fd); -+ -+ v4l2_request_buffer_free(&req->capture); -+ v4l2_request_buffer_free(&req->output); -+ -+ av_free(data); -+} -+ -+static AVBufferRef *v4l2_request_frame_alloc(void *opaque, int size) -+{ -+ AVCodecContext *avctx = opaque; -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ V4L2RequestDescriptor *req; -+ AVBufferRef *ref; -+ uint8_t *data; -+ int ret; -+ -+ data = av_mallocz(size); -+ if (!data) -+ return NULL; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); -+ -+ ref = av_buffer_create(data, size, v4l2_request_frame_free, avctx, 0); -+ if (!ref) { -+ av_freep(&data); -+ return NULL; -+ } -+ -+ req = (V4L2RequestDescriptor*)data; -+ req->request_fd = -1; -+ req->output.fd = -1; -+ req->capture.fd = -1; -+ -+ ret = v4l2_request_buffer_alloc(avctx, &req->output, ctx->output_type); -+ if (ret < 0) { -+ av_buffer_unref(&ref); -+ return NULL; -+ } -+ -+ ret = v4l2_request_buffer_alloc(avctx, &req->capture, ctx->format.type); -+ if (ret < 0) { -+ av_buffer_unref(&ref); -+ return NULL; -+ } -+ -+ ret = ioctl(ctx->media_fd, MEDIA_IOC_REQUEST_ALLOC, &req->request_fd); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: request alloc failed, %s (%d)\n", __func__, strerror(errno), errno); -+ av_buffer_unref(&ref); -+ return NULL; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p request_fd=%d\n", __func__, avctx, size, data, req->request_fd); -+ return ref; -+} -+ -+static void v4l2_request_pool_free(void *opaque) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); -+} -+ -+static void v4l2_request_hwframe_ctx_free(AVHWFramesContext *hwfc) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); -+ -+ av_buffer_pool_flush(hwfc->pool); -+ av_buffer_pool_uninit(&hwfc->pool); -+} -+ -+int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ AVHWFramesContext *hwfc = 
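
/*
 * Sketch of the buffer access split above: OUTPUT (bitstream) buffers
 * are mmap()ed so the CPU can copy slice data into them, while CAPTURE
 * (decoded frame) buffers are exported as dmabuf fds and handed to DRM
 * without ever being mapped. The returned fd is what ends up in the
 * AVDRMFrameDescriptor object.
 */
#include <fcntl.h>
#include <linux/videodev2.h>
#include <sys/ioctl.h>

static int export_capture_dmabuf(int video_fd, enum v4l2_buf_type type,
                                 __u32 index)
{
    struct v4l2_exportbuffer exp = {
        .type  = type,
        .index = index,
        .flags = O_RDONLY,
    };

    if (ioctl(video_fd, VIDIOC_EXPBUF, &exp) < 0)
        return -1;

    return exp.fd;
}
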
(AVHWFramesContext*)hw_frames_ctx->data; -+ -+ hwfc->format = AV_PIX_FMT_DRM_PRIME; -+ hwfc->sw_format = AV_PIX_FMT_NV12; -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { -+ hwfc->width = ctx->format.fmt.pix_mp.width; -+ hwfc->height = ctx->format.fmt.pix_mp.height; -+ } else { -+ hwfc->width = ctx->format.fmt.pix.width; -+ hwfc->height = ctx->format.fmt.pix.height; -+ } -+ -+ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2RequestDescriptor), avctx, v4l2_request_frame_alloc, v4l2_request_pool_free); -+ if (!hwfc->pool) -+ return AVERROR(ENOMEM); -+ -+ hwfc->free = v4l2_request_hwframe_ctx_free; -+ -+ hwfc->initial_pool_size = 1; -+ -+ switch (avctx->codec_id) { -+ case AV_CODEC_ID_VP9: -+ hwfc->initial_pool_size += 8; -+ break; -+ case AV_CODEC_ID_VP8: -+ hwfc->initial_pool_size += 3; -+ break; -+ default: -+ hwfc->initial_pool_size += 2; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); -+ -+ return 0; -+} -diff --git a/libavcodec/v4l2_request.h b/libavcodec/v4l2_request.h -new file mode 100644 -index 0000000000..d4146bd4ee ---- /dev/null -+++ b/libavcodec/v4l2_request.h -@@ -0,0 +1,72 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_V4L2_REQUEST_H -+#define AVCODEC_V4L2_REQUEST_H -+ -+#include -+ -+#include "libavutil/hwcontext_drm.h" -+ -+typedef struct V4L2RequestContext { -+ int video_fd; -+ int media_fd; -+ enum v4l2_buf_type output_type; -+ struct v4l2_format format; -+} V4L2RequestContext; -+ -+typedef struct V4L2RequestBuffer { -+ int index; -+ int fd; -+ uint8_t *addr; -+ uint32_t width; -+ uint32_t height; -+ uint32_t size; -+ uint32_t used; -+ uint32_t capabilities; -+ struct v4l2_buffer buffer; -+} V4L2RequestBuffer; -+ -+typedef struct V4L2RequestDescriptor { -+ AVDRMFrameDescriptor drm; -+ int request_fd; -+ V4L2RequestBuffer output; -+ V4L2RequestBuffer capture; -+} V4L2RequestDescriptor; -+ -+uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame); -+ -+int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame); -+ -+int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size); -+ -+int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count); -+ -+int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id); -+ -+int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice); -+ -+int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count); -+ -+int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count); -+ -+int ff_v4l2_request_uninit(AVCodecContext *avctx); -+ -+int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); -+ -+#endif /* AVCODEC_V4L2_REQUEST_H */ diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0003-Add-V4L2-request-API-mpeg2-hwaccel.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0003-Add-V4L2-request-API-mpeg2-hwaccel.patch deleted file mode 100644 index 8af3e620be..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0003-Add-V4L2-request-API-mpeg2-hwaccel.patch +++ /dev/null @@ -1,244 +0,0 @@ -From 279092b1ef7cb944c513167dd55861ff2f2bb473 Mon Sep 17 00:00:00 2001 -From: Jonas Karlman -Date: Sat, 15 Dec 2018 22:32:16 +0100 -Subject: [PATCH 03/12] Add V4L2 request API mpeg2 hwaccel - -Signed-off-by: Jonas Karlman ---- - configure | 3 + - libavcodec/Makefile | 1 + - libavcodec/hwaccels.h | 1 + - libavcodec/mpeg12dec.c | 6 ++ - libavcodec/v4l2_request_mpeg2.c | 154 ++++++++++++++++++++++++++++++++ - 5 files changed, 165 insertions(+) - create mode 100644 libavcodec/v4l2_request_mpeg2.c - -diff --git a/configure b/configure -index 6b41344dfd..e88c201cce 100755 ---- a/configure -+++ b/configure -@@ -2937,6 +2937,8 @@ mpeg2_dxva2_hwaccel_deps="dxva2" - mpeg2_dxva2_hwaccel_select="mpeg2video_decoder" - mpeg2_nvdec_hwaccel_deps="nvdec" - mpeg2_nvdec_hwaccel_select="mpeg2video_decoder" -+mpeg2_v4l2request_hwaccel_deps="v4l2_request mpeg2_v4l2_request" -+mpeg2_v4l2request_hwaccel_select="mpeg2video_decoder" - mpeg2_vaapi_hwaccel_deps="vaapi" - mpeg2_vaapi_hwaccel_select="mpeg2video_decoder" - mpeg2_vdpau_hwaccel_deps="vdpau" -@@ -6455,6 +6457,7 @@ check_cc vp8_v4l2_m2m linux/videodev2.h "int i = 
V4L2_PIX_FMT_VP8;" - check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" - - check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns -+check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" - - check_headers sys/videoio.h - test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 9e847eeadc..864f24a2af 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -891,6 +891,7 @@ OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL) += dxva2_mpeg2.o - OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o - OBJS-$(CONFIG_MPEG2_NVDEC_HWACCEL) += nvdec_mpeg12.o - OBJS-$(CONFIG_MPEG2_QSV_HWACCEL) += qsvdec_other.o -+OBJS-$(CONFIG_MPEG2_V4L2REQUEST_HWACCEL) += v4l2_request_mpeg2.o - OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o - OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o - OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o -diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index 7d73da8676..ef54de2a3b 100644 ---- a/libavcodec/hwaccels.h -+++ b/libavcodec/hwaccels.h -@@ -47,6 +47,7 @@ extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel; - extern const AVHWAccel ff_mpeg2_d3d11va2_hwaccel; - extern const AVHWAccel ff_mpeg2_nvdec_hwaccel; - extern const AVHWAccel ff_mpeg2_dxva2_hwaccel; -+extern const AVHWAccel ff_mpeg2_v4l2request_hwaccel; - extern const AVHWAccel ff_mpeg2_vaapi_hwaccel; - extern const AVHWAccel ff_mpeg2_vdpau_hwaccel; - extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel; -diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c -index 83e537884b..305127bc94 100644 ---- a/libavcodec/mpeg12dec.c -+++ b/libavcodec/mpeg12dec.c -@@ -1156,6 +1156,9 @@ static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = { - #endif - #if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL - AV_PIX_FMT_VIDEOTOOLBOX, -+#endif -+#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL -+ AV_PIX_FMT_DRM_PRIME, - #endif - AV_PIX_FMT_YUV420P, - AV_PIX_FMT_NONE -@@ -2941,6 +2944,9 @@ AVCodec ff_mpeg2video_decoder = { - #endif - #if CONFIG_MPEG2_XVMC_HWACCEL - HWACCEL_XVMC(mpeg2), -+#endif -+#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL -+ HWACCEL_V4L2REQUEST(mpeg2), - #endif - NULL - }, -diff --git a/libavcodec/v4l2_request_mpeg2.c b/libavcodec/v4l2_request_mpeg2.c -new file mode 100644 -index 0000000000..782b9c2471 ---- /dev/null -+++ b/libavcodec/v4l2_request_mpeg2.c -@@ -0,0 +1,154 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "hwaccel.h" -+#include "mpegvideo.h" -+#include "v4l2_request.h" -+ -+typedef struct V4L2RequestControlsMPEG2 { -+ struct v4l2_ctrl_mpeg2_slice_params slice_params; -+ struct v4l2_ctrl_mpeg2_quantization quantization; -+} V4L2RequestControlsMPEG2; -+ -+static int v4l2_request_mpeg2_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const MpegEncContext *s = avctx->priv_data; -+ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; -+ -+ controls->slice_params = (struct v4l2_ctrl_mpeg2_slice_params) { -+ .bit_size = 0, -+ .data_bit_offset = 0, -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ -+ .quantiser_scale_code = s->qscale >> 1, -+ -+ .sequence = { -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */ -+ .horizontal_size = s->width, -+ .vertical_size = s->height, -+ .vbv_buffer_size = req->output.size, -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ -+ .profile_and_level_indication = 0, -+ .progressive_sequence = s->progressive_sequence, -+ .chroma_format = s->chroma_format, -+ }, -+ -+ .picture = { -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ -+ .picture_coding_type = s->pict_type, -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ -+ .f_code[0][0] = s->mpeg_f_code[0][0], -+ .f_code[0][1] = s->mpeg_f_code[0][1], -+ .f_code[1][0] = s->mpeg_f_code[1][0], -+ .f_code[1][1] = s->mpeg_f_code[1][1], -+ .intra_dc_precision = s->intra_dc_precision, -+ .picture_structure = s->picture_structure, -+ .top_field_first = s->top_field_first, -+ .frame_pred_frame_dct = s->frame_pred_frame_dct, -+ .concealment_motion_vectors = s->concealment_motion_vectors, -+ .q_scale_type = s->q_scale_type, -+ .intra_vlc_format = s->intra_vlc_format, -+ .alternate_scan = s->alternate_scan, -+ .repeat_first_field = s->repeat_first_field, -+ .progressive_frame = s->progressive_frame, -+ }, -+ }; -+ -+ switch (s->pict_type) { -+ case AV_PICTURE_TYPE_B: -+ controls->slice_params.backward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->next_picture.f); -+ // fall-through -+ case AV_PICTURE_TYPE_P: -+ controls->slice_params.forward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->last_picture.f); -+ } -+ -+ controls->quantization = (struct v4l2_ctrl_mpeg2_quantization) { -+ /* ISO/IEC 13818-2, ITU-T Rec. 
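
/*
 * The reference linkage being filled in above, restated: B-pictures
 * name both references, P-pictures only the forward one (the switch
 * falls through from B to P on purpose), and each reference is
 * identified by the nanosecond timestamp of the capture buffer it was
 * decoded into. Field names follow the staging mpeg2 control struct
 * this patch set relies on; the helper itself is illustrative.
 */
static void link_mpeg2_refs(struct v4l2_ctrl_mpeg2_slice_params *sp,
                            int is_b, int is_p,
                            __u64 forward_ts, __u64 backward_ts)
{
    if (is_b)
        sp->backward_ref_ts = backward_ts;  /* next picture */
    if (is_b || is_p)
        sp->forward_ref_ts = forward_ts;    /* last picture */
}
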
H.262: Quant matrix extension */ -+ .load_intra_quantiser_matrix = 1, -+ .load_non_intra_quantiser_matrix = 1, -+ .load_chroma_intra_quantiser_matrix = 1, -+ .load_chroma_non_intra_quantiser_matrix = 1, -+ }; -+ -+ for (int i = 0; i < 64; i++) { -+ int n = s->idsp.idct_permutation[ff_zigzag_direct[i]]; -+ controls->quantization.intra_quantiser_matrix[i] = s->intra_matrix[n]; -+ controls->quantization.non_intra_quantiser_matrix[i] = s->inter_matrix[n]; -+ controls->quantization.chroma_intra_quantiser_matrix[i] = s->chroma_intra_matrix[n]; -+ controls->quantization.chroma_non_intra_quantiser_matrix[i] = s->chroma_inter_matrix[n]; -+ } -+ -+ return ff_v4l2_request_reset_frame(avctx, s->current_picture_ptr->f); -+} -+ -+static int v4l2_request_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ const MpegEncContext *s = avctx->priv_data; -+ -+ return ff_v4l2_request_append_output_buffer(avctx, s->current_picture_ptr->f, buffer, size); -+} -+ -+static int v4l2_request_mpeg2_end_frame(AVCodecContext *avctx) -+{ -+ const MpegEncContext *s = avctx->priv_data; -+ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS, -+ .ptr = &controls->slice_params, -+ .size = sizeof(controls->slice_params), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION, -+ .ptr = &controls->quantization, -+ .size = sizeof(controls->quantization), -+ }, -+ }; -+ -+ controls->slice_params.bit_size = req->output.used * 8; -+ -+ return ff_v4l2_request_decode_frame(avctx, s->current_picture_ptr->f, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_mpeg2_init(AVCodecContext *avctx) -+{ -+ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_MPEG2_SLICE, 1024 * 1024, NULL, 0); -+} -+ -+const AVHWAccel ff_mpeg2_v4l2request_hwaccel = { -+ .name = "mpeg2_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_MPEG2VIDEO, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .start_frame = v4l2_request_mpeg2_start_frame, -+ .decode_slice = v4l2_request_mpeg2_decode_slice, -+ .end_frame = v4l2_request_mpeg2_end_frame, -+ .frame_priv_data_size = sizeof(V4L2RequestControlsMPEG2), -+ .init = v4l2_request_mpeg2_init, -+ .uninit = ff_v4l2_request_uninit, -+ .priv_data_size = sizeof(V4L2RequestContext), -+ .frame_params = ff_v4l2_request_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, -+}; diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0004-Add-V4L2-request-API-h264-hwaccel.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0004-Add-V4L2-request-API-h264-hwaccel.patch deleted file mode 100644 index 0ac85b0a05..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0004-Add-V4L2-request-API-h264-hwaccel.patch +++ /dev/null @@ -1,547 +0,0 @@ -From da483a0eb4802714d9ad3a07ea17c2ec0f65c2a5 Mon Sep 17 00:00:00 2001 -From: Jernej Skrabec -Date: Sat, 15 Dec 2018 22:32:16 +0100 -Subject: [PATCH 04/12] Add V4L2 request API h264 hwaccel - -Signed-off-by: Jernej Skrabec -Signed-off-by: Jonas Karlman ---- - configure | 3 + - libavcodec/Makefile | 1 + - libavcodec/h264_slice.c | 4 + - libavcodec/h264dec.c | 3 + - libavcodec/hwaccels.h | 1 + - libavcodec/v4l2_request_h264.c | 443 +++++++++++++++++++++++++++++++++ - 6 files changed, 455 insertions(+) - create mode 100644 libavcodec/v4l2_request_h264.c - -diff 
--git a/configure b/configure -index e88c201cce..576a79ff09 100755 ---- a/configure -+++ b/configure -@@ -2895,6 +2895,8 @@ h264_dxva2_hwaccel_deps="dxva2" - h264_dxva2_hwaccel_select="h264_decoder" - h264_nvdec_hwaccel_deps="nvdec" - h264_nvdec_hwaccel_select="h264_decoder" -+h264_v4l2request_hwaccel_deps="v4l2_request h264_v4l2_request" -+h264_v4l2request_hwaccel_select="h264_decoder" - h264_vaapi_hwaccel_deps="vaapi" - h264_vaapi_hwaccel_select="h264_decoder" - h264_vdpau_hwaccel_deps="vdpau" -@@ -6457,6 +6459,7 @@ check_cc vp8_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP8;" - check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" - - check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns -+check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" - check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" - - check_headers sys/videoio.h -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 864f24a2af..58bd444934 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -872,6 +872,7 @@ OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o - OBJS-$(CONFIG_H264_DXVA2_HWACCEL) += dxva2_h264.o - OBJS-$(CONFIG_H264_NVDEC_HWACCEL) += nvdec_h264.o - OBJS-$(CONFIG_H264_QSV_HWACCEL) += qsvdec_h2645.o -+OBJS-$(CONFIG_H264_V4L2REQUEST_HWACCEL) += v4l2_request_h264.o - OBJS-$(CONFIG_H264_VAAPI_HWACCEL) += vaapi_h264.o - OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o - OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o -diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index fc4e65bf01..9912ede703 100644 ---- a/libavcodec/h264_slice.c -+++ b/libavcodec/h264_slice.c -@@ -765,6 +765,7 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) - #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \ - (CONFIG_H264_D3D11VA_HWACCEL * 2) + \ - CONFIG_H264_NVDEC_HWACCEL + \ -+ CONFIG_H264_V4L2REQUEST_HWACCEL + \ - CONFIG_H264_VAAPI_HWACCEL + \ - CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \ - CONFIG_H264_VDPAU_HWACCEL) -@@ -849,6 +850,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) - #endif - #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL - *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; -+#endif -+#if CONFIG_H264_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - if (h->avctx->codec->pix_fmts) - choices = h->avctx->codec->pix_fmts; -diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c -index 03c87b59bd..dbcceb915a 100644 ---- a/libavcodec/h264dec.c -+++ b/libavcodec/h264dec.c -@@ -1081,6 +1081,9 @@ AVCodec ff_h264_decoder = { - #endif - #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL - HWACCEL_VIDEOTOOLBOX(h264), -+#endif -+#if CONFIG_H264_V4L2REQUEST_HWACCEL -+ HWACCEL_V4L2REQUEST(h264), - #endif - NULL - }, -diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index ef54de2a3b..003200edea 100644 ---- a/libavcodec/hwaccels.h -+++ b/libavcodec/hwaccels.h -@@ -27,6 +27,7 @@ extern const AVHWAccel ff_h264_d3d11va_hwaccel; - extern const AVHWAccel ff_h264_d3d11va2_hwaccel; - extern const AVHWAccel ff_h264_dxva2_hwaccel; - extern const AVHWAccel ff_h264_nvdec_hwaccel; -+extern const AVHWAccel ff_h264_v4l2request_hwaccel; - extern const AVHWAccel ff_h264_vaapi_hwaccel; - extern const AVHWAccel ff_h264_vdpau_hwaccel; - extern const AVHWAccel ff_h264_videotoolbox_hwaccel; -diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c -new file mode 100644 -index 0000000000..81b3c4b092 ---- /dev/null -+++ b/libavcodec/v4l2_request_h264.c -@@ 
-0,0 +1,443 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "h264dec.h" -+#include "hwaccel.h" -+#include "v4l2_request.h" -+ -+typedef struct V4L2RequestControlsH264 { -+ struct v4l2_ctrl_h264_sps sps; -+ struct v4l2_ctrl_h264_pps pps; -+ struct v4l2_ctrl_h264_scaling_matrix scaling_matrix; -+ struct v4l2_ctrl_h264_decode_params decode_params; -+ struct v4l2_ctrl_h264_slice_params slice_params[16]; -+ int first_slice; -+} V4L2RequestControlsH264; -+ -+typedef struct V4L2RequestContextH264 { -+ V4L2RequestContext base; -+ int decode_mode; -+ int start_code; -+} V4L2RequestContextH264; -+ -+static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; -+ -+static void fill_weight_factors(struct v4l2_h264_weight_factors *factors, int list, const H264SliceContext *sl) -+{ -+ for (int i = 0; i < sl->ref_count[list]; i++) { -+ if (sl->pwt.luma_weight_flag[list]) { -+ factors->luma_weight[i] = sl->pwt.luma_weight[i][list][0]; -+ factors->luma_offset[i] = sl->pwt.luma_weight[i][list][1]; -+ } else { -+ factors->luma_weight[i] = 1 << sl->pwt.luma_log2_weight_denom; -+ factors->luma_offset[i] = 0; -+ } -+ for (int j = 0; j < 2; j++) { -+ if (sl->pwt.chroma_weight_flag[list]) { -+ factors->chroma_weight[i][j] = sl->pwt.chroma_weight[i][list][j][0]; -+ factors->chroma_offset[i][j] = sl->pwt.chroma_weight[i][list][j][1]; -+ } else { -+ factors->chroma_weight[i][j] = 1 << sl->pwt.chroma_log2_weight_denom; -+ factors->chroma_offset[i][j] = 0; -+ } -+ } -+ } -+} -+ -+static void fill_dpb_entry(struct v4l2_h264_dpb_entry *entry, const H264Picture *pic) -+{ -+ entry->reference_ts = ff_v4l2_request_get_capture_timestamp(pic->f); -+ entry->frame_num = pic->frame_num; -+ entry->pic_num = pic->pic_id; -+ entry->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID; -+ if (pic->reference) -+ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_ACTIVE; -+ if (pic->long_ref) -+ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM; -+ if (pic->field_poc[0] != INT_MAX) -+ entry->top_field_order_cnt = pic->field_poc[0]; -+ if (pic->field_poc[1] != INT_MAX) -+ entry->bottom_field_order_cnt = pic->field_poc[1]; -+} -+ -+static void fill_dpb(struct v4l2_ctrl_h264_decode_params *decode, const H264Context *h) -+{ -+ int entries = 0; -+ -+ for (int i = 0; i < h->short_ref_count; i++) { -+ const H264Picture *pic = h->short_ref[i]; -+ if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX)) -+ fill_dpb_entry(&decode->dpb[entries++], pic); -+ } -+ -+ if (!h->long_ref_count) -+ return; -+ -+ for (int i = 0; i < FF_ARRAY_ELEMS(h->long_ref); i++) { -+ const H264Picture *pic = h->long_ref[i]; -+ if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX)) -+ fill_dpb_entry(&decode->dpb[entries++], pic); -+ } -+} -+ -+static uint8_t get_dpb_index(struct 
v4l2_ctrl_h264_decode_params *decode, const H264Ref *ref) -+{ -+ uint64_t timestamp; -+ -+ if (!ref->parent) -+ return 0; -+ -+ timestamp = ff_v4l2_request_get_capture_timestamp(ref->parent->f); -+ -+ for (uint8_t i = 0; i < FF_ARRAY_ELEMS(decode->dpb); i++) { -+ struct v4l2_h264_dpb_entry *entry = &decode->dpb[i]; -+ if ((entry->flags & V4L2_H264_DPB_ENTRY_FLAG_VALID) && -+ entry->reference_ts == timestamp) -+ return i; -+ } -+ -+ return 0; -+} -+ -+static void fill_sps(struct v4l2_ctrl_h264_sps *ctrl, const H264Context *h) -+{ -+ const SPS *sps = h->ps.sps; -+ -+ *ctrl = (struct v4l2_ctrl_h264_sps) { -+ .profile_idc = sps->profile_idc, -+ .constraint_set_flags = sps->constraint_set_flags, -+ .level_idc = sps->level_idc, -+ .seq_parameter_set_id = sps->sps_id, -+ .chroma_format_idc = sps->chroma_format_idc, -+ .bit_depth_luma_minus8 = sps->bit_depth_luma - 8, -+ .bit_depth_chroma_minus8 = sps->bit_depth_chroma - 8, -+ .log2_max_frame_num_minus4 = sps->log2_max_frame_num - 4, -+ .pic_order_cnt_type = sps->poc_type, -+ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, -+ .max_num_ref_frames = sps->ref_frame_count, -+ .num_ref_frames_in_pic_order_cnt_cycle = sps->poc_cycle_length, -+ //.offset_for_ref_frame[255] - not required? not set by libva-v4l2-request - copy sps->offset_for_ref_frame -+ .offset_for_non_ref_pic = sps->offset_for_non_ref_pic, -+ .offset_for_top_to_bottom_field = sps->offset_for_top_to_bottom_field, -+ .pic_width_in_mbs_minus1 = h->mb_width - 1, -+ .pic_height_in_map_units_minus1 = sps->frame_mbs_only_flag ? h->mb_height - 1 : h->mb_height / 2 - 1, -+ }; -+ -+ if (sps->residual_color_transform_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE; -+ if (sps->transform_bypass) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS; -+ if (sps->delta_pic_order_always_zero_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO; -+ if (sps->gaps_in_frame_num_allowed_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED; -+ if (sps->frame_mbs_only_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY; -+ if (sps->mb_aff) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD; -+ if (sps->direct_8x8_inference_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE; -+} -+ -+static void fill_pps(struct v4l2_ctrl_h264_pps *ctrl, const H264Context *h) -+{ -+ const PPS *pps = h->ps.pps; -+ const H264SliceContext *sl = &h->slice_ctx[0]; -+ -+ *ctrl = (struct v4l2_ctrl_h264_pps) { -+ .pic_parameter_set_id = sl->pps_id, -+ .seq_parameter_set_id = pps->sps_id, -+ .num_slice_groups_minus1 = pps->slice_group_count - 1, -+ .num_ref_idx_l0_default_active_minus1 = pps->ref_count[0] - 1, -+ .num_ref_idx_l1_default_active_minus1 = pps->ref_count[1] - 1, -+ .weighted_bipred_idc = pps->weighted_bipred_idc, -+ .pic_init_qp_minus26 = pps->init_qp - 26, -+ .pic_init_qs_minus26 = pps->init_qs - 26, -+ .chroma_qp_index_offset = pps->chroma_qp_index_offset[0], -+ .second_chroma_qp_index_offset = pps->chroma_qp_index_offset[1], -+ }; -+ -+ if (pps->cabac) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE; -+ if (pps->pic_order_present) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT; -+ if (pps->weighted_pred) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_WEIGHTED_PRED; -+ if (pps->deblocking_filter_parameters_present) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT; -+ if (pps->constrained_intra_pred) -+ ctrl->flags |= 
V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED; -+ if (pps->redundant_pic_cnt_present) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT; -+ if (pps->transform_8x8_mode) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE; -+} -+ -+static int v4l2_request_h264_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const H264Context *h = avctx->priv_data; -+ const PPS *pps = h->ps.pps; -+ const SPS *sps = h->ps.sps; -+ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; -+ -+ fill_sps(&controls->sps, h); -+ fill_pps(&controls->pps, h); -+ -+ memcpy(controls->scaling_matrix.scaling_list_4x4, pps->scaling_matrix4, sizeof(controls->scaling_matrix.scaling_list_4x4)); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[0], pps->scaling_matrix8[0], sizeof(controls->scaling_matrix.scaling_list_8x8[0])); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[1], pps->scaling_matrix8[3], sizeof(controls->scaling_matrix.scaling_list_8x8[1])); -+ -+ if (sps->chroma_format_idc == 3) { -+ memcpy(controls->scaling_matrix.scaling_list_8x8[2], pps->scaling_matrix8[1], sizeof(controls->scaling_matrix.scaling_list_8x8[2])); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[3], pps->scaling_matrix8[4], sizeof(controls->scaling_matrix.scaling_list_8x8[3])); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[4], pps->scaling_matrix8[2], sizeof(controls->scaling_matrix.scaling_list_8x8[4])); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[5], pps->scaling_matrix8[5], sizeof(controls->scaling_matrix.scaling_list_8x8[5])); -+ } -+ -+ controls->decode_params = (struct v4l2_ctrl_h264_decode_params) { -+ .num_slices = 0, -+ .nal_ref_idc = h->nal_ref_idc, -+ .top_field_order_cnt = h->cur_pic_ptr->field_poc[0] != INT_MAX ? h->cur_pic_ptr->field_poc[0] : 0, -+ .bottom_field_order_cnt = h->cur_pic_ptr->field_poc[1] != INT_MAX ? 
h->cur_pic_ptr->field_poc[1] : 0, -+ }; -+ -+ if (h->picture_idr) -+ controls->decode_params.flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC; -+ -+ fill_dpb(&controls->decode_params, h); -+ -+ controls->first_slice = !FIELD_PICTURE(h) || h->first_field; -+ -+ return ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); -+} -+ -+static int v4l2_request_h264_queue_decode(AVCodecContext *avctx, int last_slice) -+{ -+ const H264Context *h = avctx->priv_data; -+ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; -+ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_SPS, -+ .ptr = &controls->sps, -+ .size = sizeof(controls->sps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_PPS, -+ .ptr = &controls->pps, -+ .size = sizeof(controls->pps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS, -+ .ptr = &controls->slice_params, -+ .size = sizeof(controls->slice_params[0]) * FFMIN(controls->decode_params.num_slices, 16), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS, -+ .ptr = &controls->decode_params, -+ .size = sizeof(controls->decode_params), -+ }, -+ }; -+ -+ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED) -+ return ff_v4l2_request_decode_slice(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); -+ -+ return ff_v4l2_request_decode_frame(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ const H264Context *h = avctx->priv_data; -+ const PPS *pps = h->ps.pps; -+ const H264SliceContext *sl = &h->slice_ctx[0]; -+ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; -+ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->cur_pic_ptr->f->data[0]; -+ int i, ret, count, slice = FFMIN(controls->decode_params.num_slices, 15); -+ -+ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && slice) { -+ ret = v4l2_request_h264_queue_decode(avctx, 0); -+ if (ret) -+ return ret; -+ -+ ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); -+ slice = controls->decode_params.num_slices = 0; -+ controls->first_slice = 0; -+ } -+ -+ controls->slice_params[slice] = (struct v4l2_ctrl_h264_slice_params) { -+ /* Size in bytes, including header */ -+ .size = 0, -+ .start_byte_offset = req->output.used, -+ /* Offset in bits to slice_data() from the beginning of this slice. */ -+ .header_bit_size = get_bits_count(&sl->gb), -+ -+ .first_mb_in_slice = sl->first_mb_addr, -+ .slice_type = ff_h264_get_slice_type(sl), -+ .pic_parameter_set_id = sl->pps_id, -+ .colour_plane_id = 0, /* what is this? */ -+ .frame_num = h->poc.frame_num, -+ .idr_pic_id = 0, /* what is this? */ -+ .pic_order_cnt_lsb = sl->poc_lsb, -+ .delta_pic_order_cnt_bottom = sl->delta_poc_bottom, -+ .delta_pic_order_cnt0 = sl->delta_poc[0], -+ .delta_pic_order_cnt1 = sl->delta_poc[1], -+ .redundant_pic_cnt = sl->redundant_pic_count, -+ -+ /* Size in bits of dec_ref_pic_marking() syntax element. */ -+ .dec_ref_pic_marking_bit_size = 0, -+ /* Size in bits of pic order count syntax. 
*/ -+ .pic_order_cnt_bit_size = 0, -+ -+ .cabac_init_idc = sl->cabac_init_idc, -+ .slice_qp_delta = sl->qscale - pps->init_qp, -+ .slice_qs_delta = 0, /* XXX not implemented by FFmpeg */ -+ .disable_deblocking_filter_idc = sl->deblocking_filter < 2 ? !sl->deblocking_filter : sl->deblocking_filter, -+ .slice_alpha_c0_offset_div2 = sl->slice_alpha_c0_offset / 2, -+ .slice_beta_offset_div2 = sl->slice_beta_offset / 2, -+ .slice_group_change_cycle = 0, /* what is this? */ -+ -+ .num_ref_idx_l0_active_minus1 = sl->list_count > 0 ? sl->ref_count[0] - 1 : 0, -+ .num_ref_idx_l1_active_minus1 = sl->list_count > 1 ? sl->ref_count[1] - 1 : 0, -+ }; -+ -+ if (FIELD_PICTURE(h)) -+ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_FIELD_PIC; -+ if (h->picture_structure == PICT_BOTTOM_FIELD) -+ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_BOTTOM_FIELD; -+ if (sl->slice_type == AV_PICTURE_TYPE_B && sl->direct_spatial_mv_pred) -+ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED; -+ -+ controls->slice_params[slice].pred_weight_table.chroma_log2_weight_denom = sl->pwt.chroma_log2_weight_denom; -+ controls->slice_params[slice].pred_weight_table.luma_log2_weight_denom = sl->pwt.luma_log2_weight_denom; -+ -+ count = sl->list_count > 0 ? sl->ref_count[0] : 0; -+ for (i = 0; i < count; i++) -+ controls->slice_params[slice].ref_pic_list0[i] = get_dpb_index(&controls->decode_params, &sl->ref_list[0][i]); -+ if (count) -+ fill_weight_factors(&controls->slice_params[slice].pred_weight_table.weight_factors[0], 0, sl); -+ -+ count = sl->list_count > 1 ? sl->ref_count[1] : 0; -+ for (i = 0; i < count; i++) -+ controls->slice_params[slice].ref_pic_list1[i] = get_dpb_index(&controls->decode_params, &sl->ref_list[1][i]); -+ if (count) -+ fill_weight_factors(&controls->slice_params[slice].pred_weight_table.weight_factors[1], 1, sl); -+ -+ if (ctx->start_code == V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) { -+ ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, nalu_slice_start_code, 3); -+ if (ret) -+ return ret; -+ } -+ -+ ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, buffer, size); -+ if (ret) -+ return ret; -+ -+ controls->slice_params[slice].size = req->output.used - controls->slice_params[slice].start_byte_offset; -+ controls->decode_params.num_slices++; -+ return 0; -+} -+ -+static int v4l2_request_h264_end_frame(AVCodecContext *avctx) -+{ -+ const H264Context *h = avctx->priv_data; -+ return v4l2_request_h264_queue_decode(avctx, !FIELD_PICTURE(h) || !h->first_field); -+} -+ -+static int v4l2_request_h264_set_controls(AVCodecContext *avctx) -+{ -+ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ext_control control[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_H264_START_CODE, }, -+ }; -+ -+ ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE); -+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_START_CODE); -+ if (ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_NONE && -+ ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) { -+ av_log(avctx, AV_LOG_ERROR, "%s: 
unsupported start code, %d\n", __func__, ctx->start_code); -+ return AVERROR(EINVAL); -+ } -+ -+ control[0].value = ctx->decode_mode; -+ control[1].value = ctx->start_code; -+ -+ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_h264_init(AVCodecContext *avctx) -+{ -+ const H264Context *h = avctx->priv_data; -+ struct v4l2_ctrl_h264_sps sps; -+ struct v4l2_ctrl_h264_pps pps; -+ int ret; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_SPS, -+ .ptr = &sps, -+ .size = sizeof(sps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_PPS, -+ .ptr = &pps, -+ .size = sizeof(pps), -+ }, -+ }; -+ -+ fill_sps(&sps, h); -+ fill_pps(&pps, h); -+ -+ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_H264_SLICE, 2 * 1024 * 1024, control, FF_ARRAY_ELEMS(control)); -+ if (ret) -+ return ret; -+ -+ return v4l2_request_h264_set_controls(avctx); -+} -+ -+const AVHWAccel ff_h264_v4l2request_hwaccel = { -+ .name = "h264_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_H264, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .start_frame = v4l2_request_h264_start_frame, -+ .decode_slice = v4l2_request_h264_decode_slice, -+ .end_frame = v4l2_request_h264_end_frame, -+ .frame_priv_data_size = sizeof(V4L2RequestControlsH264), -+ .init = v4l2_request_h264_init, -+ .uninit = ff_v4l2_request_uninit, -+ .priv_data_size = sizeof(V4L2RequestContextH264), -+ .frame_params = ff_v4l2_request_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, -+}; diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0005-Add-V4L2-request-API-hevc-hwaccel.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0005-Add-V4L2-request-API-hevc-hwaccel.patch deleted file mode 100644 index 20bba586c0..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0005-Add-V4L2-request-API-hevc-hwaccel.patch +++ /dev/null @@ -1,636 +0,0 @@ -From 521d06aec85d55a28e946a222f591747a1d3f2f5 Mon Sep 17 00:00:00 2001 -From: Jernej Skrabec -Date: Sat, 15 Dec 2018 22:32:16 +0100 -Subject: [PATCH 05/12] Add V4L2 request API hevc hwaccel - -Signed-off-by: Jernej Skrabec -Signed-off-by: Jonas Karlman ---- - configure | 3 + - libavcodec/Makefile | 1 + - libavcodec/hevcdec.c | 10 + - libavcodec/hwaccels.h | 1 + - libavcodec/v4l2_request_hevc.c | 527 +++++++++++++++++++++++++++++++++ - 5 files changed, 542 insertions(+) - create mode 100644 libavcodec/v4l2_request_hevc.c - -diff --git a/configure b/configure -index 576a79ff09..d51ee62a70 100755 ---- a/configure -+++ b/configure -@@ -2911,6 +2911,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" - hevc_dxva2_hwaccel_select="hevc_decoder" - hevc_nvdec_hwaccel_deps="nvdec" - hevc_nvdec_hwaccel_select="hevc_decoder" -+hevc_v4l2request_hwaccel_deps="v4l2_request hevc_v4l2_request" -+hevc_v4l2request_hwaccel_select="hevc_decoder" - hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" - hevc_vaapi_hwaccel_select="hevc_decoder" - hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" -@@ -6460,6 +6462,7 @@ check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" - - check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns - check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" -+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" - check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" - - check_headers sys/videoio.h -diff --git a/libavcodec/Makefile 
b/libavcodec/Makefile -index 58bd444934..2b53d2b650 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -880,6 +880,7 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o - OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec_h2645.o -+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o - OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o - OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o - OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o -diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 8f1c162ace..4c9c92bfc2 100644 ---- a/libavcodec/hevcdec.c -+++ b/libavcodec/hevcdec.c -@@ -373,6 +373,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ - CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ - CONFIG_HEVC_NVDEC_HWACCEL + \ -+ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ - CONFIG_HEVC_VAAPI_HWACCEL + \ - CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ - CONFIG_HEVC_VDPAU_HWACCEL) -@@ -399,6 +400,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL - *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV420P10: -@@ -417,6 +421,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - #if CONFIG_HEVC_NVDEC_HWACCEL - *fmt++ = AV_PIX_FMT_CUDA; -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV444P: -@@ -3592,6 +3599,9 @@ AVCodec ff_hevc_decoder = { - #endif - #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL - HWACCEL_VIDEOTOOLBOX(hevc), -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ HWACCEL_V4L2REQUEST(hevc), - #endif - NULL - }, -diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index 003200edea..d183675abe 100644 ---- a/libavcodec/hwaccels.h -+++ b/libavcodec/hwaccels.h -@@ -35,6 +35,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; - extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; - extern const AVHWAccel ff_hevc_dxva2_hwaccel; - extern const AVHWAccel ff_hevc_nvdec_hwaccel; -+extern const AVHWAccel ff_hevc_v4l2request_hwaccel; - extern const AVHWAccel ff_hevc_vaapi_hwaccel; - extern const AVHWAccel ff_hevc_vdpau_hwaccel; - extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -new file mode 100644 -index 0000000000..da1fd666d7 ---- /dev/null -+++ b/libavcodec/v4l2_request_hevc.c -@@ -0,0 +1,527 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "hevcdec.h" -+#include "hwaccel.h" -+#include "v4l2_request.h" -+ -+typedef struct V4L2RequestControlsHEVC { -+ struct v4l2_ctrl_hevc_sps sps; -+ struct v4l2_ctrl_hevc_pps pps; -+ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; -+ struct v4l2_ctrl_hevc_slice_params slice_params[16]; -+ int first_slice; -+ int num_slices; //TODO: this should be in control -+} V4L2RequestControlsHEVC; -+ -+typedef struct V4L2RequestContextHEVC { -+ V4L2RequestContext base; -+ int decode_mode; -+ int start_code; -+} V4L2RequestContextHEVC; -+ -+static void v4l2_request_hevc_fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) -+{ -+ int32_t luma_weight_denom, chroma_weight_denom; -+ const SliceHeader *sh = &h->sh; -+ -+ if (sh->slice_type == HEVC_SLICE_I || -+ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || -+ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) -+ return; -+ -+ table->luma_log2_weight_denom = sh->luma_log2_weight_denom; -+ -+ if (h->ps.sps->chroma_format_idc) -+ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; -+ -+ luma_weight_denom = (1 << sh->luma_log2_weight_denom); -+ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); -+ -+ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { -+ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; -+ table->luma_offset_l0[i] = sh->luma_offset_l0[i]; -+ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; -+ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; -+ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; -+ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; -+ } -+ -+ if (sh->slice_type != HEVC_SLICE_B) -+ return; -+ -+ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { -+ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; -+ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; -+ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; -+ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; -+ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; -+ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; -+ } -+} -+ -+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) -+{ -+ const HEVCFrame *frame; -+ int i; -+ -+ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { -+ frame = h->rps[ST_CURR_BEF].ref[i]; -+ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; -+ } -+ -+ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { -+ frame = h->rps[ST_CURR_AFT].ref[i]; -+ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; -+ } -+ -+ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { -+ frame = h->rps[LT_CURR].ref[i]; -+ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; -+ } -+ -+ return 0; -+} -+ -+static uint8_t get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, -+ struct v4l2_ctrl_hevc_slice_params *slice_params) -+{ -+ 
uint64_t timestamp; -+ -+ if (!frame) -+ return 0; -+ -+ timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); -+ -+ for (uint8_t i = 0; i < slice_params->num_active_dpb_entries; i++) { -+ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[i]; -+ if (entry->timestamp == timestamp) -+ return i; -+ } -+ -+ return 0; -+} -+ -+static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h, -+ struct v4l2_ctrl_hevc_slice_params *slice_params) -+{ -+ const HEVCFrame *pic = h->ref; -+ const SliceHeader *sh = &h->sh; -+ int i, entries = 0; -+ RefPicList *rpl; -+ -+ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { -+ .bit_size = 0, -+ .data_bit_offset = get_bits_count(&h->HEVClc->gb), -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .slice_segment_addr = sh->slice_segment_addr, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ .nal_unit_type = h->nal_unit_type, -+ .nuh_temporal_id_plus1 = h->temporal_id + 1, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .slice_type = sh->slice_type, -+ .colour_plane_id = sh->colour_plane_id, -+ .slice_pic_order_cnt = pic->poc, -+ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, -+ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, -+ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, -+ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand, -+ .slice_qp_delta = sh->slice_qp_delta, -+ .slice_cb_qp_offset = sh->slice_cb_qp_offset, -+ .slice_cr_qp_offset = sh->slice_cr_qp_offset, -+ .slice_act_y_qp_offset = 0, -+ .slice_act_cb_qp_offset = 0, -+ .slice_act_cr_qp_offset = 0, -+ .slice_beta_offset_div2 = sh->beta_offset / 2, -+ .slice_tc_offset_div2 = sh->tc_offset / 2, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ .pic_struct = h->sei.picture_timing.picture_struct, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ -+ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, -+ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, -+ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, -+ }; -+ -+ if (sh->slice_sample_adaptive_offset_flag[0]) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; -+ -+ if (sh->slice_sample_adaptive_offset_flag[1]) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; -+ -+ if (sh->slice_temporal_mvp_enabled_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; -+ -+ if (sh->mvd_l1_zero_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; -+ -+ if (sh->cabac_init_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; -+ -+ if (sh->collocated_list == L0) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; -+ -+ if (sh->disable_deblocking_filter_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; -+ -+ if (sh->slice_loop_filter_across_slices_enabled_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { -+ const HEVCFrame *frame = &h->DPB[i]; -+ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { -+ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[entries++]; -+ -+ entry->timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); -+ entry->rps = find_frame_rps_type(h, entry->timestamp); -+ entry->field_pic = frame->frame->interlaced_frame; -+ -+ /* TODO: Interleaved: Get the POC for each field. */ -+ entry->pic_order_cnt[0] = frame->poc; -+ entry->pic_order_cnt[1] = frame->poc; -+ } -+ } -+ -+ slice_params->num_active_dpb_entries = entries; -+ -+ if (sh->slice_type != HEVC_SLICE_I) { -+ rpl = &h->ref->refPicList[0]; -+ for (i = 0; i < rpl->nb_refs; i++) -+ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) { -+ rpl = &h->ref->refPicList[1]; -+ for (i = 0; i < rpl->nb_refs; i++) -+ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); -+ } -+ -+ v4l2_request_hevc_fill_pred_table(h, &slice_params->pred_weight_table); -+ -+ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; -+ if (slice_params->num_entry_point_offsets > 256) { -+ slice_params->num_entry_point_offsets = 256; -+ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); -+ } -+ -+ for (i = 0; i < slice_params->num_entry_point_offsets; i++) -+ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; -+} -+ -+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ const HEVCSPS *sps = h->ps.sps; -+ const HEVCPPS *pps = h->ps.pps; -+ const ScalingList *sl = pps->scaling_list_data_present_flag ? -+ &pps->scaling_list : -+ sps->scaling_list_enable_flag ? -+ &sps->scaling_list : NULL; -+ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ -+ controls->sps = (struct v4l2_ctrl_hevc_sps) { -+ .chroma_format_idc = sps->chroma_format_idc, -+ .pic_width_in_luma_samples = sps->width, -+ .pic_height_in_luma_samples = sps->height, -+ .bit_depth_luma_minus8 = sps->bit_depth - 8, -+ .bit_depth_chroma_minus8 = sps->bit_depth - 8, -+ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, -+ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, -+ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, -+ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, -+ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, -+ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, -+ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, -+ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, -+ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, -+ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, -+ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, -+ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, -+ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, -+ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, -+ .num_short_term_ref_pic_sets = sps->nb_st_rps, -+ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, -+ }; -+ -+ if (sps->separate_colour_plane_flag) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; -+ -+ if (sps->scaling_list_enable_flag) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; -+ -+ if (sps->amp_enabled_flag) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; -+ -+ if (sps->sao_enabled) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; -+ -+ if (sps->pcm_enabled_flag) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; -+ -+ if (sps->pcm.loop_filter_disable_flag) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; -+ -+ if (sps->long_term_ref_pics_present_flag) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; -+ -+ if (sps->sps_temporal_mvp_enabled_flag) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; -+ -+ if (sps->sps_strong_intra_smoothing_enable_flag) -+ controls->sps.flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; -+ -+ if (sl) { -+ for (int i = 0; i < 6; i++) { -+ for (int j = 0; j < 16; j++) -+ controls->scaling_matrix.scaling_list_4x4[i][j] = sl->sl[0][i][j]; -+ for (int j = 0; j < 64; j++) { -+ controls->scaling_matrix.scaling_list_8x8[i][j] = sl->sl[1][i][j]; -+ controls->scaling_matrix.scaling_list_16x16[i][j] = sl->sl[2][i][j]; -+ if (i < 2) -+ controls->scaling_matrix.scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; -+ } -+ controls->scaling_matrix.scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; -+ if (i < 2) -+ controls->scaling_matrix.scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; -+ } -+ } -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ -+ controls->pps = (struct v4l2_ctrl_hevc_pps) { -+ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, -+ .init_qp_minus26 = pps->pic_init_qp_minus26, -+ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, -+ .pps_cb_qp_offset = pps->cb_qp_offset, -+ .pps_cr_qp_offset = pps->cr_qp_offset, -+ .pps_beta_offset_div2 = pps->beta_offset / 2, -+ .pps_tc_offset_div2 = pps->tc_offset / 2, -+ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, -+ }; -+ -+ if (pps->dependent_slice_segments_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT; -+ -+ if (pps->output_flag_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; -+ -+ if (pps->sign_data_hiding_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; -+ -+ if (pps->cabac_init_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; -+ -+ if (pps->constrained_intra_pred_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; -+ -+ if (pps->transform_skip_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; -+ -+ if (pps->cu_qp_delta_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; -+ -+ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; -+ -+ if (pps->weighted_pred_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; -+ -+ if (pps->weighted_bipred_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; -+ -+ if (pps->transquant_bypass_enable_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; -+ -+ if (pps->tiles_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; -+ -+ if (pps->entropy_coding_sync_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; -+ -+ if (pps->loop_filter_across_tiles_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; -+ -+ if (pps->seq_loop_filter_across_slices_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; -+ -+ if (pps->deblocking_filter_override_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; -+ -+ if (pps->disable_dbf) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; -+ -+ if (pps->lists_modification_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; -+ -+ if (pps->slice_header_extension_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; -+ -+ if (pps->tiles_enabled_flag) { -+ controls->pps.num_tile_columns_minus1 = pps->num_tile_columns - 1; -+ controls->pps.num_tile_rows_minus1 = pps->num_tile_rows - 1; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p tiles_enabled_flag=%d num_tile_columns=%d num_tile_rows=%d\n", -+ __func__, avctx, pps->tiles_enabled_flag, pps->num_tile_columns, pps->num_tile_rows); -+ -+ for (int i = 0; i < pps->num_tile_columns; i++) -+ controls->pps.column_width_minus1[i] = pps->column_width[i] - 1; -+ -+ for (int i = 0; i < pps->num_tile_rows; i++) -+ controls->pps.row_height_minus1[i] = pps->row_height[i] - 1; -+ } -+ -+ controls->first_slice = 1; -+ controls->num_slices = 0; -+ -+ return ff_v4l2_request_reset_frame(avctx, h->ref->frame); -+} -+ -+static int v4l2_request_hevc_queue_decode(AVCodecContext *avctx, 
int last_slice) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, -+ .ptr = &controls->sps, -+ .size = sizeof(controls->sps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, -+ .ptr = &controls->pps, -+ .size = sizeof(controls->pps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, -+ .ptr = &controls->slice_params, -+ .size = sizeof(controls->slice_params[0]) * FFMIN(controls->num_slices, 16), -+ }, -+ }; -+ -+ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED) -+ return ff_v4l2_request_decode_slice(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); -+ -+ return ff_v4l2_request_decode_frame(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) -+{ -+ return v4l2_request_hevc_queue_decode(avctx, 1); -+} -+ -+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->ref->frame->data[0]; -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ int ret, slice = FFMIN(controls->num_slices, 15); -+ -+ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && slice) { -+ ret = v4l2_request_hevc_queue_decode(avctx, 0); -+ if (ret) -+ return ret; -+ -+ ff_v4l2_request_reset_frame(avctx, h->ref->frame); -+ slice = controls->num_slices = 0; -+ controls->first_slice = 0; -+ } -+ -+ v4l2_request_hevc_fill_slice_params(h, &controls->slice_params[slice]); -+ -+ ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, buffer, size); -+ if (ret) -+ return ret; -+ -+ controls->slice_params[slice].bit_size = req->output.used * 8; //FIXME -+ controls->num_slices++; -+ -+ return 0; -+} -+ -+static int v4l2_request_hevc_set_controls(AVCodecContext *avctx) -+{ -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ext_control control[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -+ }; -+ -+ ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE); -+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_START_CODE); -+ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && -+ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); -+ return AVERROR(EINVAL); -+ } -+ -+ control[0].value = ctx->decode_mode; -+ control[1].value = ctx->start_code; -+ -+ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_hevc_init(AVCodecContext 
*avctx) -+{ -+ int ret; -+ -+ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_HEVC_SLICE, 3 * 1024 * 1024, NULL, 0); -+ if (ret) -+ return ret; -+ -+ return v4l2_request_hevc_set_controls(avctx); -+} -+ -+const AVHWAccel ff_hevc_v4l2request_hwaccel = { -+ .name = "hevc_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .start_frame = v4l2_request_hevc_start_frame, -+ .decode_slice = v4l2_request_hevc_decode_slice, -+ .end_frame = v4l2_request_hevc_end_frame, -+ .frame_priv_data_size = sizeof(V4L2RequestControlsHEVC), -+ .init = v4l2_request_hevc_init, -+ .uninit = ff_v4l2_request_uninit, -+ .priv_data_size = sizeof(V4L2RequestContextHEVC), -+ .frame_params = ff_v4l2_request_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, -+}; diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0006-Add-V4L2-request-API-vp8-hwaccel.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0006-Add-V4L2-request-API-vp8-hwaccel.patch deleted file mode 100644 index fc8ceb65cb..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0006-Add-V4L2-request-API-vp8-hwaccel.patch +++ /dev/null @@ -1,282 +0,0 @@ -From 1084cad0a91e33a95150ff82f1d4ce2a0c180e2c Mon Sep 17 00:00:00 2001 -From: Boris Brezillon -Date: Wed, 22 May 2019 14:46:58 +0200 -Subject: [PATCH 06/12] Add V4L2 request API vp8 hwaccel - -Need to fix the STREAMOFF/STREAMON issue in a proper way. - -Signed-off-by: Boris Brezillon -Signed-off-by: Ezequiel Garcia ---- - configure | 3 + - libavcodec/Makefile | 1 + - libavcodec/hwaccels.h | 1 + - libavcodec/v4l2_request_vp8.c | 180 ++++++++++++++++++++++++++++++++++ - libavcodec/vp8.c | 8 +- - 5 files changed, 192 insertions(+), 1 deletion(-) - create mode 100644 libavcodec/v4l2_request_vp8.c - -diff --git a/configure b/configure -index d51ee62a70..6bdfe6fd95 100755 ---- a/configure -+++ b/configure -@@ -2975,6 +2975,8 @@ vp8_nvdec_hwaccel_deps="nvdec" - vp8_nvdec_hwaccel_select="vp8_decoder" - vp8_vaapi_hwaccel_deps="vaapi" - vp8_vaapi_hwaccel_select="vp8_decoder" -+vp8_v4l2request_hwaccel_deps="v4l2_request vp8_v4l2_request" -+vp8_v4l2request_hwaccel_select="vp8_decoder" - vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9" - vp9_d3d11va_hwaccel_select="vp9_decoder" - vp9_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_VP9" -@@ -6464,6 +6466,7 @@ check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns - check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" - check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" - check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" -+check_cc vp8_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP8_FRAME;" - - check_headers sys/videoio.h - test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 2b53d2b650..55fa28548b 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -910,6 +910,7 @@ OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o - OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o - OBJS-$(CONFIG_VP8_NVDEC_HWACCEL) += nvdec_vp8.o - OBJS-$(CONFIG_VP8_VAAPI_HWACCEL) += vaapi_vp8.o -+OBJS-$(CONFIG_VP8_V4L2REQUEST_HWACCEL) += v4l2_request_vp8.o - OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL) += dxva2_vp9.o - OBJS-$(CONFIG_VP9_DXVA2_HWACCEL) += dxva2_vp9.o - OBJS-$(CONFIG_VP9_NVDEC_HWACCEL) += nvdec_vp9.o 
-diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index d183675abe..0fca5be557 100644 ---- a/libavcodec/hwaccels.h -+++ b/libavcodec/hwaccels.h -@@ -66,6 +66,7 @@ extern const AVHWAccel ff_vc1_vaapi_hwaccel; - extern const AVHWAccel ff_vc1_vdpau_hwaccel; - extern const AVHWAccel ff_vp8_nvdec_hwaccel; - extern const AVHWAccel ff_vp8_vaapi_hwaccel; -+extern const AVHWAccel ff_vp8_v4l2request_hwaccel; - extern const AVHWAccel ff_vp9_d3d11va_hwaccel; - extern const AVHWAccel ff_vp9_d3d11va2_hwaccel; - extern const AVHWAccel ff_vp9_dxva2_hwaccel; -diff --git a/libavcodec/v4l2_request_vp8.c b/libavcodec/v4l2_request_vp8.c -new file mode 100644 -index 0000000000..d24252c5e5 ---- /dev/null -+++ b/libavcodec/v4l2_request_vp8.c -@@ -0,0 +1,180 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "hwaccel.h" -+#include "v4l2_request.h" -+#include "vp8.h" -+ -+typedef struct V4L2RequestControlsVP8 { -+ struct v4l2_ctrl_vp8_frame_header ctrl; -+} V4L2RequestControlsVP8; -+ -+static int v4l2_request_vp8_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const VP8Context *s = avctx->priv_data; -+ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; -+ -+ memset(&controls->ctrl, 0, sizeof(controls->ctrl)); -+ return ff_v4l2_request_reset_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f); -+} -+ -+static int v4l2_request_vp8_end_frame(AVCodecContext *avctx) -+{ -+ const VP8Context *s = avctx->priv_data; -+ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER, -+ .ptr = &controls->ctrl, -+ .size = sizeof(controls->ctrl), -+ }, -+ }; -+ -+ return ff_v4l2_request_decode_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, -+ control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_vp8_decode_slice(AVCodecContext *avctx, -+ const uint8_t *buffer, -+ uint32_t size) -+{ -+ const VP8Context *s = avctx->priv_data; -+ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; -+ struct v4l2_ctrl_vp8_frame_header *hdr = &controls->ctrl; -+ const uint8_t *data = buffer + 3 + 7 * s->keyframe; -+ unsigned int i, j, k; -+ -+ hdr->version = s->profile & 0x3; -+ hdr->width = avctx->width; -+ hdr->height = avctx->height; -+ /* FIXME: set ->xx_scale */ -+ hdr->prob_skip_false = s->prob->mbskip; -+ hdr->prob_intra = s->prob->intra; -+ hdr->prob_gf = s->prob->golden; -+ hdr->prob_last = s->prob->last; -+ hdr->first_part_size = s->header_partition_size; -+ hdr->first_part_header_bits = (8 * (s->coder_state_at_header_end.input - data) - -+ s->coder_state_at_header_end.bit_count - 8); -+ hdr->num_dct_parts = 
s->num_coeff_partitions; -+ for (i = 0; i < 8; i++) -+ hdr->dct_part_sizes[i] = s->coeff_partition_size[i]; -+ -+ hdr->coder_state.range = s->coder_state_at_header_end.range; -+ hdr->coder_state.value = s->coder_state_at_header_end.value; -+ hdr->coder_state.bit_count = s->coder_state_at_header_end.bit_count; -+ if (s->framep[VP56_FRAME_PREVIOUS]) -+ hdr->last_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_PREVIOUS]->tf.f); -+ if (s->framep[VP56_FRAME_GOLDEN]) -+ hdr->golden_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN]->tf.f); -+ if (s->framep[VP56_FRAME_GOLDEN2]) -+ hdr->alt_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN2]->tf.f); -+ hdr->flags |= s->invisible ? 0 : V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME; -+ hdr->flags |= s->mbskip_enabled ? V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF : 0; -+ hdr->flags |= (s->profile & 0x4) ? V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL : 0; -+ hdr->flags |= s->keyframe ? V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME : 0; -+ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN : 0; -+ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN2] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT : 0; -+ hdr->segment_header.flags |= s->segmentation.enabled ? V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED : 0; -+ hdr->segment_header.flags |= s->segmentation.update_map ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP : 0; -+ hdr->segment_header.flags |= s->segmentation.update_feature_data ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA : 0; -+ hdr->segment_header.flags |= s->segmentation.absolute_vals ? 0 : V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE; -+ for (i = 0; i < 4; i++) { -+ hdr->segment_header.quant_update[i] = s->segmentation.base_quant[i]; -+ hdr->segment_header.lf_update[i] = s->segmentation.filter_level[i]; -+ } -+ -+ for (i = 0; i < 3; i++) -+ hdr->segment_header.segment_probs[i] = s->prob->segmentid[i]; -+ -+ hdr->lf_header.level = s->filter.level; -+ hdr->lf_header.sharpness_level = s->filter.sharpness; -+ hdr->lf_header.flags |= s->lf_delta.enabled ? V4L2_VP8_LF_HEADER_ADJ_ENABLE : 0; -+ hdr->lf_header.flags |= s->lf_delta.update ? V4L2_VP8_LF_HEADER_DELTA_UPDATE : 0; -+ hdr->lf_header.flags |= s->filter.simple ? 
V4L2_VP8_LF_FILTER_TYPE_SIMPLE : 0; -+ for (i = 0; i < 4; i++) { -+ hdr->lf_header.ref_frm_delta[i] = s->lf_delta.ref[i]; -+ hdr->lf_header.mb_mode_delta[i] = s->lf_delta.mode[i + MODE_I4x4]; -+ } -+ -+ // Probabilites -+ if (s->keyframe) { -+ static const uint8_t keyframe_y_mode_probs[4] = { -+ 145, 156, 163, 128 -+ }; -+ static const uint8_t keyframe_uv_mode_probs[3] = { -+ 142, 114, 183 -+ }; -+ -+ memcpy(hdr->entropy_header.y_mode_probs, keyframe_y_mode_probs, 4); -+ memcpy(hdr->entropy_header.uv_mode_probs, keyframe_uv_mode_probs, 3); -+ } else { -+ for (i = 0; i < 4; i++) -+ hdr->entropy_header.y_mode_probs[i] = s->prob->pred16x16[i]; -+ for (i = 0; i < 3; i++) -+ hdr->entropy_header.uv_mode_probs[i] = s->prob->pred8x8c[i]; -+ } -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < 19; j++) -+ hdr->entropy_header.mv_probs[i][j] = s->prob->mvc[i][j]; -+ -+ for (i = 0; i < 4; i++) { -+ for (j = 0; j < 8; j++) { -+ static const int coeff_bands_inverse[8] = { -+ 0, 1, 2, 3, 5, 6, 4, 15 -+ }; -+ int coeff_pos = coeff_bands_inverse[j]; -+ -+ for (k = 0; k < 3; k++) { -+ memcpy(hdr->entropy_header.coeff_probs[i][j][k], -+ s->prob->token[i][coeff_pos][k], 11); -+ } -+ } -+ } -+ -+ hdr->quant_header.y_ac_qi = s->quant.yac_qi; -+ hdr->quant_header.y_dc_delta = s->quant.ydc_delta; -+ hdr->quant_header.y2_dc_delta = s->quant.y2dc_delta; -+ hdr->quant_header.y2_ac_delta = s->quant.y2ac_delta; -+ hdr->quant_header.uv_dc_delta = s->quant.uvdc_delta; -+ hdr->quant_header.uv_ac_delta = s->quant.uvac_delta; -+ -+ return ff_v4l2_request_append_output_buffer(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, buffer, size); -+} -+ -+static int v4l2_request_vp8_init(AVCodecContext *avctx) -+{ -+ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_VP8_FRAME, 1024 * 1024, NULL, 0); -+} -+ -+const AVHWAccel ff_vp8_v4l2request_hwaccel = { -+ .name = "vp8_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_VP8, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .start_frame = v4l2_request_vp8_start_frame, -+ .decode_slice = v4l2_request_vp8_decode_slice, -+ .end_frame = v4l2_request_vp8_end_frame, -+ .frame_priv_data_size = sizeof(V4L2RequestControlsVP8), -+ .init = v4l2_request_vp8_init, -+ .uninit = ff_v4l2_request_uninit, -+ .priv_data_size = sizeof(V4L2RequestContext), -+ .frame_params = ff_v4l2_request_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, -+}; -diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c -index 3ddc349a4d..d4289320a9 100644 ---- a/libavcodec/vp8.c -+++ b/libavcodec/vp8.c -@@ -175,6 +175,9 @@ static enum AVPixelFormat get_pixel_format(VP8Context *s) - #endif - #if CONFIG_VP8_NVDEC_HWACCEL - AV_PIX_FMT_CUDA, -+#endif -+#if CONFIG_VP8_V4L2REQUEST_HWACCEL -+ AV_PIX_FMT_DRM_PRIME, - #endif - AV_PIX_FMT_YUV420P, - AV_PIX_FMT_NONE, -@@ -198,7 +201,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7) - return ret; - } - -- if (!s->actually_webp && !is_vp7) { -+ if (!s->actually_webp && !is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) { - s->pix_fmt = get_pixel_format(s); - if (s->pix_fmt < 0) - return AVERROR(EINVAL); -@@ -2981,6 +2984,9 @@ AVCodec ff_vp8_decoder = { - #endif - #if CONFIG_VP8_NVDEC_HWACCEL - HWACCEL_NVDEC(vp8), -+#endif -+#if CONFIG_VP8_V4L2REQUEST_HWACCEL -+ HWACCEL_V4L2REQUEST(vp8), - #endif - NULL - }, diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0007-Add-and-use-private-linux-headers-for-V4L2-request-A.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0007-Add-and-use-private-linux-headers-for-V4L2-request-A.patch 
deleted file mode 100644
index a5710e47ec..0000000000
--- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0007-Add-and-use-private-linux-headers-for-V4L2-request-A.patch
+++ /dev/null
@@ -1,759 +0,0 @@
-From e19fbf3c8c161a24e039b93dee2281d5b22e2ca8 Mon Sep 17 00:00:00 2001
-From: Jernej Skrabec
-Date: Thu, 14 Feb 2019 23:20:05 +0100
-Subject: [PATCH 07/12] Add and use private linux headers for V4L2 request API
- ctrls
-
-Signed-off-by: Jernej Skrabec
----
- configure | 6 +-
- libavcodec/h264-ctrls.h | 210 +++++++++++++++++++++++++++++
- libavcodec/hevc-ctrls.h | 229 ++++++++++++++++++++++++++++++++
- libavcodec/mpeg2-ctrls.h | 82 ++++++++++++
- libavcodec/v4l2_request_h264.c | 1 +
- libavcodec/v4l2_request_hevc.c | 1 +
- libavcodec/v4l2_request_mpeg2.c | 1 +
- libavcodec/v4l2_request_vp8.c | 1 +
- libavcodec/vp8-ctrls.h | 112 ++++++++++++++++
- 9 files changed, 640 insertions(+), 3 deletions(-)
- create mode 100644 libavcodec/h264-ctrls.h
- create mode 100644 libavcodec/hevc-ctrls.h
- create mode 100644 libavcodec/mpeg2-ctrls.h
- create mode 100644 libavcodec/vp8-ctrls.h
-
-diff --git a/configure b/configure
-index 6bdfe6fd95..e3a3d82395 100755
---- a/configure
-+++ b/configure
-@@ -2895,7 +2895,7 @@ h264_dxva2_hwaccel_deps="dxva2"
- h264_dxva2_hwaccel_select="h264_decoder"
- h264_nvdec_hwaccel_deps="nvdec"
- h264_nvdec_hwaccel_select="h264_decoder"
--h264_v4l2request_hwaccel_deps="v4l2_request h264_v4l2_request"
-+h264_v4l2request_hwaccel_deps="v4l2_request"
- h264_v4l2request_hwaccel_select="h264_decoder"
- h264_vaapi_hwaccel_deps="vaapi"
- h264_vaapi_hwaccel_select="h264_decoder"
-@@ -2911,7 +2911,7 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC"
- hevc_dxva2_hwaccel_select="hevc_decoder"
- hevc_nvdec_hwaccel_deps="nvdec"
- hevc_nvdec_hwaccel_select="hevc_decoder"
--hevc_v4l2request_hwaccel_deps="v4l2_request hevc_v4l2_request"
-+hevc_v4l2request_hwaccel_deps="v4l2_request"
- hevc_v4l2request_hwaccel_select="hevc_decoder"
- hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
- hevc_vaapi_hwaccel_select="hevc_decoder"
-@@ -2975,7 +2975,7 @@ vp8_nvdec_hwaccel_deps="nvdec"
- vp8_nvdec_hwaccel_select="vp8_decoder"
- vp8_vaapi_hwaccel_deps="vaapi"
- vp8_vaapi_hwaccel_select="vp8_decoder"
--vp8_v4l2request_hwaccel_deps="v4l2_request vp8_v4l2_request"
-+vp8_v4l2request_hwaccel_deps="v4l2_request"
- vp8_v4l2request_hwaccel_select="vp8_decoder"
- vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9"
- vp9_d3d11va_hwaccel_select="vp9_decoder"
-diff --git a/libavcodec/h264-ctrls.h b/libavcodec/h264-ctrls.h
-new file mode 100644
-index 0000000000..e877bf1d53
---- /dev/null
-+++ b/libavcodec/h264-ctrls.h
-@@ -0,0 +1,210 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the H.264 state controls for use with stateless H.264
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _H264_CTRLS_H_
-+#define _H264_CTRLS_H_
-+
-+#include <linux/types.h>
-+
-+/* Our pixel format isn't stable at the moment */
-+#define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */
-+
-+/*
-+ * This is put insanely high to avoid conflicting with controls that
-+ * would be added during the phase where those controls are not
-+ * stable. It should be fixed eventually.
-+ */ -+#define V4L2_CID_MPEG_VIDEO_H264_SPS (V4L2_CID_MPEG_BASE+1000) -+#define V4L2_CID_MPEG_VIDEO_H264_PPS (V4L2_CID_MPEG_BASE+1001) -+#define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX (V4L2_CID_MPEG_BASE+1002) -+#define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS (V4L2_CID_MPEG_BASE+1003) -+#define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS (V4L2_CID_MPEG_BASE+1004) -+#define V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE (V4L2_CID_MPEG_BASE+1005) -+#define V4L2_CID_MPEG_VIDEO_H264_START_CODE (V4L2_CID_MPEG_BASE+1006) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_H264_SPS 0x0110 -+#define V4L2_CTRL_TYPE_H264_PPS 0x0111 -+#define V4L2_CTRL_TYPE_H264_SCALING_MATRIX 0x0112 -+#define V4L2_CTRL_TYPE_H264_SLICE_PARAMS 0x0113 -+#define V4L2_CTRL_TYPE_H264_DECODE_PARAMS 0x0114 -+ -+enum v4l2_mpeg_video_h264_decode_mode { -+ V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_h264_start_code { -+ V4L2_MPEG_VIDEO_H264_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG 0x01 -+#define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG 0x02 -+#define V4L2_H264_SPS_CONSTRAINT_SET2_FLAG 0x04 -+#define V4L2_H264_SPS_CONSTRAINT_SET3_FLAG 0x08 -+#define V4L2_H264_SPS_CONSTRAINT_SET4_FLAG 0x10 -+#define V4L2_H264_SPS_CONSTRAINT_SET5_FLAG 0x20 -+ -+#define V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE 0x01 -+#define V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS 0x02 -+#define V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO 0x04 -+#define V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED 0x08 -+#define V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY 0x10 -+#define V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD 0x20 -+#define V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE 0x40 -+ -+struct v4l2_ctrl_h264_sps { -+ __u8 profile_idc; -+ __u8 constraint_set_flags; -+ __u8 level_idc; -+ __u8 seq_parameter_set_id; -+ __u8 chroma_format_idc; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_frame_num_minus4; -+ __u8 pic_order_cnt_type; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 max_num_ref_frames; -+ __u8 num_ref_frames_in_pic_order_cnt_cycle; -+ __s32 offset_for_ref_frame[255]; -+ __s32 offset_for_non_ref_pic; -+ __s32 offset_for_top_to_bottom_field; -+ __u16 pic_width_in_mbs_minus1; -+ __u16 pic_height_in_map_units_minus1; -+ __u32 flags; -+}; -+ -+#define V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE 0x0001 -+#define V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT 0x0002 -+#define V4L2_H264_PPS_FLAG_WEIGHTED_PRED 0x0004 -+#define V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT 0x0008 -+#define V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED 0x0010 -+#define V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT 0x0020 -+#define V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE 0x0040 -+#define V4L2_H264_PPS_FLAG_PIC_SCALING_MATRIX_PRESENT 0x0080 -+ -+struct v4l2_ctrl_h264_pps { -+ __u8 pic_parameter_set_id; -+ __u8 seq_parameter_set_id; -+ __u8 num_slice_groups_minus1; -+ __u8 num_ref_idx_l0_default_active_minus1; -+ __u8 num_ref_idx_l1_default_active_minus1; -+ __u8 weighted_bipred_idc; -+ __s8 pic_init_qp_minus26; -+ __s8 pic_init_qs_minus26; -+ __s8 chroma_qp_index_offset; -+ __s8 second_chroma_qp_index_offset; -+ __u16 flags; -+}; -+ -+struct v4l2_ctrl_h264_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+}; -+ -+struct v4l2_h264_weight_factors { -+ __s16 luma_weight[32]; -+ __s16 luma_offset[32]; -+ __s16 chroma_weight[32][2]; -+ __s16 chroma_offset[32][2]; 
-+};
-+
-+struct v4l2_h264_pred_weight_table {
-+ __u16 luma_log2_weight_denom;
-+ __u16 chroma_log2_weight_denom;
-+ struct v4l2_h264_weight_factors weight_factors[2];
-+};
-+
-+#define V4L2_H264_SLICE_TYPE_P 0
-+#define V4L2_H264_SLICE_TYPE_B 1
-+#define V4L2_H264_SLICE_TYPE_I 2
-+#define V4L2_H264_SLICE_TYPE_SP 3
-+#define V4L2_H264_SLICE_TYPE_SI 4
-+
-+#define V4L2_H264_SLICE_FLAG_FIELD_PIC 0x01
-+#define V4L2_H264_SLICE_FLAG_BOTTOM_FIELD 0x02
-+#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x04
-+#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x08
-+
-+struct v4l2_ctrl_h264_slice_params {
-+ /* Size in bytes, including header */
-+ __u32 size;
-+
-+ /* Offset in bytes to the start of slice in the OUTPUT buffer. */
-+ __u32 start_byte_offset;
-+
-+ /* Offset in bits to slice_data() from the beginning of this slice. */
-+ __u32 header_bit_size;
-+
-+ __u16 first_mb_in_slice;
-+ __u8 slice_type;
-+ __u8 pic_parameter_set_id;
-+ __u8 colour_plane_id;
-+ __u8 redundant_pic_cnt;
-+ __u16 frame_num;
-+ __u16 idr_pic_id;
-+ __u16 pic_order_cnt_lsb;
-+ __s32 delta_pic_order_cnt_bottom;
-+ __s32 delta_pic_order_cnt0;
-+ __s32 delta_pic_order_cnt1;
-+
-+ struct v4l2_h264_pred_weight_table pred_weight_table;
-+ /* Size in bits of dec_ref_pic_marking() syntax element. */
-+ __u32 dec_ref_pic_marking_bit_size;
-+ /* Size in bits of pic order count syntax. */
-+ __u32 pic_order_cnt_bit_size;
-+
-+ __u8 cabac_init_idc;
-+ __s8 slice_qp_delta;
-+ __s8 slice_qs_delta;
-+ __u8 disable_deblocking_filter_idc;
-+ __s8 slice_alpha_c0_offset_div2;
-+ __s8 slice_beta_offset_div2;
-+ __u8 num_ref_idx_l0_active_minus1;
-+ __u8 num_ref_idx_l1_active_minus1;
-+ __u32 slice_group_change_cycle;
-+
-+ /*
-+ * Entries on each list are indices into
-+ * v4l2_ctrl_h264_decode_params.dpb[].
-+ */
-+ __u8 ref_pic_list0[32];
-+ __u8 ref_pic_list1[32];
-+
-+ __u32 flags;
-+};
-+
-+#define V4L2_H264_DPB_ENTRY_FLAG_VALID 0x01
-+#define V4L2_H264_DPB_ENTRY_FLAG_ACTIVE 0x02
-+#define V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM 0x04
-+
-+struct v4l2_h264_dpb_entry {
-+ __u64 reference_ts;
-+ __u16 frame_num;
-+ __u16 pic_num;
-+ /* Note that field is indicated by v4l2_buffer.field */
-+ __s32 top_field_order_cnt;
-+ __s32 bottom_field_order_cnt;
-+ __u32 flags; /* V4L2_H264_DPB_ENTRY_FLAG_* */
-+};
-+
-+#define V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC 0x01
-+
-+struct v4l2_ctrl_h264_decode_params {
-+ struct v4l2_h264_dpb_entry dpb[16];
-+ __u16 num_slices;
-+ __u16 nal_ref_idc;
-+ __s32 top_field_order_cnt;
-+ __s32 bottom_field_order_cnt;
-+ __u32 flags; /* V4L2_H264_DECODE_PARAM_FLAG_* */
-+};
-+
-+#endif
-diff --git a/libavcodec/hevc-ctrls.h b/libavcodec/hevc-ctrls.h
-new file mode 100644
-index 0000000000..d1b094c8aa
---- /dev/null
-+++ b/libavcodec/hevc-ctrls.h
-@@ -0,0 +1,229 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the HEVC state controls for use with stateless HEVC
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _HEVC_CTRLS_H_
-+#define _HEVC_CTRLS_H_
-+
-+#include <linux/types.h>
-+
-+/* The pixel format isn't stable at the moment and will likely be renamed.
*/ -+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ -+ -+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) -+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) -+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) -+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) -+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 -+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 -+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 -+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 -+ -+enum v4l2_mpeg_video_hevc_decode_mode { -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_hevc_start_code { -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_HEVC_SLICE_TYPE_B 0 -+#define V4L2_HEVC_SLICE_TYPE_P 1 -+#define V4L2_HEVC_SLICE_TYPE_I 2 -+ -+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) -+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) -+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) -+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) -+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) -+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) -+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) -+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) -+ -+/* The controls are not stable at the moment and will likely be reworked. */ -+struct v4l2_ctrl_hevc_sps { -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ -+ __u8 padding; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+ -+struct v4l2_ctrl_hevc_pps { -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ -+ __u8 num_extra_slice_header_bits; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ -+ __u8 padding[4]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 -+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 rps; -+ __u8 field_pic; -+ __u16 pic_order_cnt[2]; -+ __u8 padding[2]; -+}; -+ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 padding[6]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; -+}; -+ -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __u16 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 num_active_dpb_entries; -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ __u8 num_rps_poc_st_curr_before; -+ __u8 num_rps_poc_st_curr_after; -+ __u8 num_rps_poc_lt_curr; -+ -+ __u8 padding; -+ -+ __u32 entry_point_offset_minus1[256]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u64 flags; -+}; -+ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+#endif -diff --git a/libavcodec/mpeg2-ctrls.h b/libavcodec/mpeg2-ctrls.h -new file mode 100644 -index 0000000000..6601455b3d ---- /dev/null -+++ b/libavcodec/mpeg2-ctrls.h -@@ -0,0 +1,82 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the MPEG2 state controls for use with stateless MPEG-2 -+ * codec drivers. -+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. -+ */ -+ -+#ifndef _MPEG2_CTRLS_H_ -+#define _MPEG2_CTRLS_H_ -+ -+#define V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (V4L2_CID_MPEG_BASE+250) -+#define V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION (V4L2_CID_MPEG_BASE+251) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS 0x0103 -+#define V4L2_CTRL_TYPE_MPEG2_QUANTIZATION 0x0104 -+ -+#define V4L2_MPEG2_PICTURE_CODING_TYPE_I 1 -+#define V4L2_MPEG2_PICTURE_CODING_TYPE_P 2 -+#define V4L2_MPEG2_PICTURE_CODING_TYPE_B 3 -+#define V4L2_MPEG2_PICTURE_CODING_TYPE_D 4 -+ -+struct v4l2_mpeg2_sequence { -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */ -+ __u16 horizontal_size; -+ __u16 vertical_size; -+ __u32 vbv_buffer_size; -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ -+ __u16 profile_and_level_indication; -+ __u8 progressive_sequence; -+ __u8 chroma_format; -+}; -+ -+struct v4l2_mpeg2_picture { -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ -+ __u8 picture_coding_type; -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ -+ __u8 f_code[2][2]; -+ __u8 intra_dc_precision; -+ __u8 picture_structure; -+ __u8 top_field_first; -+ __u8 frame_pred_frame_dct; -+ __u8 concealment_motion_vectors; -+ __u8 q_scale_type; -+ __u8 intra_vlc_format; -+ __u8 alternate_scan; -+ __u8 repeat_first_field; -+ __u16 progressive_frame; -+}; -+ -+struct v4l2_ctrl_mpeg2_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ __u64 backward_ref_ts; -+ __u64 forward_ref_ts; -+ -+ struct v4l2_mpeg2_sequence sequence; -+ struct v4l2_mpeg2_picture picture; -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ -+ __u32 quantiser_scale_code; -+}; -+ -+struct v4l2_ctrl_mpeg2_quantization { -+ /* ISO/IEC 13818-2, ITU-T Rec. 
H.262: Quant matrix extension */ -+ __u8 load_intra_quantiser_matrix; -+ __u8 load_non_intra_quantiser_matrix; -+ __u8 load_chroma_intra_quantiser_matrix; -+ __u8 load_chroma_non_intra_quantiser_matrix; -+ -+ __u8 intra_quantiser_matrix[64]; -+ __u8 non_intra_quantiser_matrix[64]; -+ __u8 chroma_intra_quantiser_matrix[64]; -+ __u8 chroma_non_intra_quantiser_matrix[64]; -+}; -+ -+#endif -diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c -index 81b3c4b092..ca306b6a3f 100644 ---- a/libavcodec/v4l2_request_h264.c -+++ b/libavcodec/v4l2_request_h264.c -@@ -19,6 +19,7 @@ - #include "h264dec.h" - #include "hwaccel.h" - #include "v4l2_request.h" -+#include "h264-ctrls.h" - - typedef struct V4L2RequestControlsH264 { - struct v4l2_ctrl_h264_sps sps; -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index da1fd666d7..94977c5d0e 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -19,6 +19,7 @@ - #include "hevcdec.h" - #include "hwaccel.h" - #include "v4l2_request.h" -+#include "hevc-ctrls.h" - - typedef struct V4L2RequestControlsHEVC { - struct v4l2_ctrl_hevc_sps sps; -diff --git a/libavcodec/v4l2_request_mpeg2.c b/libavcodec/v4l2_request_mpeg2.c -index 782b9c2471..37a4eae62c 100644 ---- a/libavcodec/v4l2_request_mpeg2.c -+++ b/libavcodec/v4l2_request_mpeg2.c -@@ -19,6 +19,7 @@ - #include "hwaccel.h" - #include "mpegvideo.h" - #include "v4l2_request.h" -+#include "mpeg2-ctrls.h" - - typedef struct V4L2RequestControlsMPEG2 { - struct v4l2_ctrl_mpeg2_slice_params slice_params; -diff --git a/libavcodec/v4l2_request_vp8.c b/libavcodec/v4l2_request_vp8.c -index d24252c5e5..c290fe8b9a 100644 ---- a/libavcodec/v4l2_request_vp8.c -+++ b/libavcodec/v4l2_request_vp8.c -@@ -19,6 +19,7 @@ - #include "hwaccel.h" - #include "v4l2_request.h" - #include "vp8.h" -+#include "vp8-ctrls.h" - - typedef struct V4L2RequestControlsVP8 { - struct v4l2_ctrl_vp8_frame_header ctrl; -diff --git a/libavcodec/vp8-ctrls.h b/libavcodec/vp8-ctrls.h -new file mode 100644 -index 0000000000..53cba826e4 ---- /dev/null -+++ b/libavcodec/vp8-ctrls.h -@@ -0,0 +1,112 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the VP8 state controls for use with stateless VP8 -+ * codec drivers. -+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. 
-+ */
-+
-+#ifndef _VP8_CTRLS_H_
-+#define _VP8_CTRLS_H_
-+
-+#include <linux/types.h>
-+
-+#define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F')
-+
-+#define V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER (V4L2_CID_MPEG_BASE + 2000)
-+#define V4L2_CTRL_TYPE_VP8_FRAME_HEADER 0x301
-+
-+#define V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED 0x01
-+#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP 0x02
-+#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA 0x04
-+#define V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE 0x08
-+
-+struct v4l2_vp8_segment_header {
-+ __s8 quant_update[4];
-+ __s8 lf_update[4];
-+ __u8 segment_probs[3];
-+ __u8 padding;
-+ __u32 flags;
-+};
-+
-+#define V4L2_VP8_LF_HEADER_ADJ_ENABLE 0x01
-+#define V4L2_VP8_LF_HEADER_DELTA_UPDATE 0x02
-+#define V4L2_VP8_LF_FILTER_TYPE_SIMPLE 0x04
-+struct v4l2_vp8_loopfilter_header {
-+ __s8 ref_frm_delta[4];
-+ __s8 mb_mode_delta[4];
-+ __u8 sharpness_level;
-+ __u8 level;
-+ __u16 padding;
-+ __u32 flags;
-+};
-+
-+struct v4l2_vp8_quantization_header {
-+ __u8 y_ac_qi;
-+ __s8 y_dc_delta;
-+ __s8 y2_dc_delta;
-+ __s8 y2_ac_delta;
-+ __s8 uv_dc_delta;
-+ __s8 uv_ac_delta;
-+ __u16 padding;
-+};
-+
-+struct v4l2_vp8_entropy_header {
-+ __u8 coeff_probs[4][8][3][11];
-+ __u8 y_mode_probs[4];
-+ __u8 uv_mode_probs[3];
-+ __u8 mv_probs[2][19];
-+ __u8 padding[3];
-+};
-+
-+struct v4l2_vp8_entropy_coder_state {
-+ __u8 range;
-+ __u8 value;
-+ __u8 bit_count;
-+ __u8 padding;
-+};
-+
-+#define V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME 0x01
-+#define V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL 0x02
-+#define V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME 0x04
-+#define V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF 0x08
-+#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN 0x10
-+#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT 0x20
-+
-+#define VP8_FRAME_IS_KEY_FRAME(hdr) \
-+ (!!((hdr)->flags & V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME))
-+
-+struct v4l2_ctrl_vp8_frame_header {
-+ struct v4l2_vp8_segment_header segment_header;
-+ struct v4l2_vp8_loopfilter_header lf_header;
-+ struct v4l2_vp8_quantization_header quant_header;
-+ struct v4l2_vp8_entropy_header entropy_header;
-+ struct v4l2_vp8_entropy_coder_state coder_state;
-+
-+ __u16 width;
-+ __u16 height;
-+
-+ __u8 horizontal_scale;
-+ __u8 vertical_scale;
-+
-+ __u8 version;
-+ __u8 prob_skip_false;
-+ __u8 prob_intra;
-+ __u8 prob_last;
-+ __u8 prob_gf;
-+ __u8 num_dct_parts;
-+
-+ __u32 first_part_size;
-+ __u32 first_part_header_bits;
-+ __u32 dct_part_sizes[8];
-+
-+ __u64 last_frame_ts;
-+ __u64 golden_frame_ts;
-+ __u64 alt_frame_ts;
-+
-+ __u64 flags;
-+};
-+
-+#endif
diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0008-hwcontext_drm-do-not-require-drm-device.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0008-hwcontext_drm-do-not-require-drm-device.patch
deleted file mode 100644
index 3bc62b6025..0000000000
--- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0008-hwcontext_drm-do-not-require-drm-device.patch
+++ /dev/null
@@ -1,26 +0,0 @@
-From 60383b61fa64aa0e2806de0b8cf8a3f8d85c41e4 Mon Sep 17 00:00:00 2001
-From: Jonas Karlman
-Date: Mon, 29 Apr 2019 22:08:59 +0000
-Subject: [PATCH 08/12] hwcontext_drm: do not require drm device
-
-Signed-off-by: Jonas Karlman
----
- libavutil/hwcontext_drm.c | 5 +++++
- 1 file changed, 5 insertions(+)
-
-diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c
-index 32cbde82eb..aa4794c5e6 100644
---- a/libavutil/hwcontext_drm.c
-+++ b/libavutil/hwcontext_drm.c
-@@ -43,6 +43,11 @@ static int
drm_device_create(AVHWDeviceContext *hwdev, const char *device, - AVDRMDeviceContext *hwctx = hwdev->hwctx; - drmVersionPtr version; - -+ if (device == NULL) { -+ hwctx->fd = -1; -+ return 0; -+ } -+ - hwctx->fd = open(device, O_RDWR); - if (hwctx->fd < 0) - return AVERROR(errno); diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0009-avcodec-h264-parse-idr_pic_id.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0009-avcodec-h264-parse-idr_pic_id.patch deleted file mode 100644 index 0f31d2c768..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0009-avcodec-h264-parse-idr_pic_id.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 4e1a25f7cbbe9978548680171580afdfbf84d603 Mon Sep 17 00:00:00 2001 -From: Ezequiel Garcia -Date: Wed, 20 Feb 2019 11:18:00 -0300 -Subject: [PATCH 09/12] avcodec/h264: parse idr_pic_id - -Signed-off-by: Ezequiel Garcia ---- - libavcodec/h264_slice.c | 2 +- - libavcodec/h264dec.h | 2 ++ - libavcodec/v4l2_request_h264.c | 2 +- - 3 files changed, 4 insertions(+), 2 deletions(-) - -diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index 9912ede703..18ba196246 100644 ---- a/libavcodec/h264_slice.c -+++ b/libavcodec/h264_slice.c -@@ -1824,7 +1824,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - } - - if (nal->type == H264_NAL_IDR_SLICE) -- get_ue_golomb_long(&sl->gb); /* idr_pic_id */ -+ sl->idr_pic_id = get_ue_golomb_long(&sl->gb); - - if (sps->poc_type == 0) { - sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb); -diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h -index b0b42b7672..38efab5c60 100644 ---- a/libavcodec/h264dec.h -+++ b/libavcodec/h264dec.h -@@ -184,6 +184,8 @@ typedef struct H264SliceContext { - int slice_type_nos; ///< S free slice type (SI/SP are remapped to I/P) - int slice_type_fixed; - -+ int idr_pic_id; -+ - int qscale; - int chroma_qp[2]; // QPc - int qp_thresh; ///< QP threshold to skip loopfilter -diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c -index ca306b6a3f..f21c8b3508 100644 ---- a/libavcodec/v4l2_request_h264.c -+++ b/libavcodec/v4l2_request_h264.c -@@ -301,7 +301,7 @@ static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t * - .pic_parameter_set_id = sl->pps_id, - .colour_plane_id = 0, /* what is this? */ - .frame_num = h->poc.frame_num, -- .idr_pic_id = 0, /* what is this? 
*/ -+ .idr_pic_id = sl->idr_pic_id, - .pic_order_cnt_lsb = sl->poc_lsb, - .delta_pic_order_cnt_bottom = sl->delta_poc_bottom, - .delta_pic_order_cnt0 = sl->delta_poc[0], diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0010-avcodec-h264-parse-ref_pic_marking_size_in_bits-and-.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0010-avcodec-h264-parse-ref_pic_marking_size_in_bits-and-.patch deleted file mode 100644 index 9bd9efacf7..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0010-avcodec-h264-parse-ref_pic_marking_size_in_bits-and-.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 8309a49f4049b6ec8d4fd76a5dc18f8d23f8315c Mon Sep 17 00:00:00 2001 -From: Boris Brezillon -Date: Wed, 22 May 2019 14:44:22 +0200 -Subject: [PATCH 10/12] avcodec/h264: parse ref_pic_marking_size_in_bits and - pic_order_cnt_bit_size - -Signed-off-by: Boris Brezillon ---- - libavcodec/h264_slice.c | 6 +++++- - libavcodec/h264dec.h | 2 ++ - libavcodec/v4l2_request_h264.c | 4 ++-- - 3 files changed, 9 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index 18ba196246..1bd3388f1e 100644 ---- a/libavcodec/h264_slice.c -+++ b/libavcodec/h264_slice.c -@@ -1736,7 +1736,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - unsigned int slice_type, tmp, i; - int field_pic_flag, bottom_field_flag; - int first_slice = sl == h->slice_ctx && !h->current_slice; -- int picture_structure; -+ int picture_structure, pos; - - if (first_slice) - av_assert0(!h->setup_finished); -@@ -1826,6 +1826,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - if (nal->type == H264_NAL_IDR_SLICE) - sl->idr_pic_id = get_ue_golomb_long(&sl->gb); - -+ pos = sl->gb.index; - if (sps->poc_type == 0) { - sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb); - -@@ -1839,6 +1840,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - if (pps->pic_order_present == 1 && picture_structure == PICT_FRAME) - sl->delta_poc[1] = get_se_golomb(&sl->gb); - } -+ sl->pic_order_cnt_bit_size = sl->gb.index - pos; - - sl->redundant_pic_count = 0; - if (pps->redundant_pic_cnt_present) -@@ -1878,9 +1880,11 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - - sl->explicit_ref_marking = 0; - if (nal->ref_idc) { -+ int bit_pos = sl->gb.index; - ret = ff_h264_decode_ref_pic_marking(sl, &sl->gb, nal, h->avctx); - if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE)) - return AVERROR_INVALIDDATA; -+ sl->ref_pic_marking_size_in_bits = sl->gb.index - bit_pos; - } - - if (sl->slice_type_nos != AV_PICTURE_TYPE_I && pps->cabac) { -diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h -index 38efab5c60..8e894f565b 100644 ---- a/libavcodec/h264dec.h -+++ b/libavcodec/h264dec.h -@@ -324,11 +324,13 @@ typedef struct H264SliceContext { - MMCO mmco[MAX_MMCO_COUNT]; - int nb_mmco; - int explicit_ref_marking; -+ int ref_pic_marking_size_in_bits; - - int frame_num; - int poc_lsb; - int delta_poc_bottom; - int delta_poc[2]; -+ int pic_order_cnt_bit_size; - int curr_pic_num; - int max_pic_num; - } H264SliceContext; -diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c -index f21c8b3508..1a7fb873a0 100644 ---- a/libavcodec/v4l2_request_h264.c -+++ b/libavcodec/v4l2_request_h264.c -@@ -309,9 +309,9 @@ static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t * - .redundant_pic_cnt = 
sl->redundant_pic_count, - - /* Size in bits of dec_ref_pic_marking() syntax element. */ -- .dec_ref_pic_marking_bit_size = 0, -+ .dec_ref_pic_marking_bit_size = sl->ref_pic_marking_size_in_bits, - /* Size in bits of pic order count syntax. */ -- .pic_order_cnt_bit_size = 0, -+ .pic_order_cnt_bit_size = sl->pic_order_cnt_bit_size, - - .cabac_init_idc = sl->cabac_init_idc, - .slice_qp_delta = sl->qscale - pps->init_qp, diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0011-HACK-add-dpb-flags-for-reference-usage-and-field-pic.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0011-HACK-add-dpb-flags-for-reference-usage-and-field-pic.patch deleted file mode 100644 index f139080389..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0011-HACK-add-dpb-flags-for-reference-usage-and-field-pic.patch +++ /dev/null @@ -1,57 +0,0 @@ -From ace9eff28953b97fc1832913d4655c0f4227ea16 Mon Sep 17 00:00:00 2001 -From: Jonas Karlman -Date: Fri, 24 May 2019 22:58:24 +0000 -Subject: [PATCH 11/12] HACK: add dpb flags for reference usage and field - picture - -This or something similar needs to be upstreamed to kernel h264 ctrls - -Signed-off-by: Jonas Karlman ---- - libavcodec/h264-ctrls.h | 4 ++++ - libavcodec/v4l2_request_h264.c | 6 +++++- - 2 files changed, 9 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/h264-ctrls.h b/libavcodec/h264-ctrls.h -index e877bf1d53..76020ebd1e 100644 ---- a/libavcodec/h264-ctrls.h -+++ b/libavcodec/h264-ctrls.h -@@ -185,6 +185,10 @@ struct v4l2_ctrl_h264_slice_params { - #define V4L2_H264_DPB_ENTRY_FLAG_VALID 0x01 - #define V4L2_H264_DPB_ENTRY_FLAG_ACTIVE 0x02 - #define V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM 0x04 -+#define V4L2_H264_DPB_ENTRY_FLAG_FIELD_PICTURE 0x08 -+#define V4L2_H264_DPB_ENTRY_FLAG_REF_TOP 0x10 -+#define V4L2_H264_DPB_ENTRY_FLAG_REF_BOTTOM 0x20 -+#define V4L2_H264_DPB_ENTRY_FLAG_REF_FRAME 0x30 - - struct v4l2_h264_dpb_entry { - __u64 reference_ts; -diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c -index 1a7fb873a0..13fac3f6f9 100644 ---- a/libavcodec/v4l2_request_h264.c -+++ b/libavcodec/v4l2_request_h264.c -@@ -66,10 +66,13 @@ static void fill_dpb_entry(struct v4l2_h264_dpb_entry *entry, const H264Picture - entry->frame_num = pic->frame_num; - entry->pic_num = pic->pic_id; - entry->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID; -+ entry->flags |= (pic->reference & 3) << 4; - if (pic->reference) - entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_ACTIVE; - if (pic->long_ref) - entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM; -+ if (pic->field_picture) -+ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_FIELD_PICTURE; - if (pic->field_poc[0] != INT_MAX) - entry->top_field_order_cnt = pic->field_poc[0]; - if (pic->field_poc[1] != INT_MAX) -@@ -109,7 +112,8 @@ static uint8_t get_dpb_index(struct v4l2_ctrl_h264_decode_params *decode, const - struct v4l2_h264_dpb_entry *entry = &decode->dpb[i]; - if ((entry->flags & V4L2_H264_DPB_ENTRY_FLAG_VALID) && - entry->reference_ts == timestamp) -- return i; -+ // TODO: signal reference type, possible using top 2 bits -+ return i | ((ref->reference & 3) << 6); - } - - return 0; diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0012-WIP-v4l2-request-rolling-timestamps.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0012-WIP-v4l2-request-rolling-timestamps.patch deleted file mode 100644 index 1374b2fc68..0000000000 --- 
a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0012-WIP-v4l2-request-rolling-timestamps.patch +++ /dev/null @@ -1,69 +0,0 @@ -From fa3f88530ec9083ff15d9637a9019a8a9408435b Mon Sep 17 00:00:00 2001 -From: Jernej Skrabec -Date: Sat, 9 Nov 2019 10:02:43 +0000 -Subject: [PATCH 12/12] WIP: v4l2-request: rolling timestamps - ---- - libavcodec/v4l2_request.c | 10 +++++++--- - libavcodec/v4l2_request.h | 1 + - 2 files changed, 8 insertions(+), 3 deletions(-) - -diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c -index 1dabf77689..611c22f8b6 100644 ---- a/libavcodec/v4l2_request.c -+++ b/libavcodec/v4l2_request.c -@@ -105,12 +105,14 @@ static int v4l2_request_queue_buffer(V4L2RequestContext *ctx, int request_fd, V4 - .type = buf->buffer.type, - .memory = buf->buffer.memory, - .index = buf->index, -- .timestamp.tv_usec = buf->index + 1, -+ .timestamp.tv_usec = ctx->timestamp, - .bytesused = buf->used, - .request_fd = request_fd, - .flags = ((request_fd >= 0) ? V4L2_BUF_FLAG_REQUEST_FD : 0) | flags, - }; - -+ buf->buffer.timestamp = buffer.timestamp; -+ - if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { - planes[0].bytesused = buf->used; - buffer.bytesused = 0; -@@ -200,6 +202,9 @@ static int v4l2_request_queue_decode(AVCodecContext *avctx, AVFrame *frame, stru - - av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p used=%u controls=%d index=%d fd=%d request_fd=%d first_slice=%d last_slice=%d\n", __func__, avctx, req->output.used, count, req->capture.index, req->capture.fd, req->request_fd, first_slice, last_slice); - -+ if (first_slice) -+ ctx->timestamp++; -+ - ret = v4l2_request_set_controls(ctx, req->request_fd, control, count); - if (ret < 0) { - av_log(avctx, AV_LOG_ERROR, "%s: set controls failed for request %d, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -@@ -651,6 +656,7 @@ int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t b - - ctx->media_fd = -1; - ctx->video_fd = -1; -+ ctx->timestamp = 0; - - udev = udev_new(); - if (!udev) { -@@ -784,8 +790,6 @@ static int v4l2_request_buffer_alloc(AVCodecContext *avctx, V4L2RequestBuffer *b - return ret; - } - -- buf->buffer.timestamp.tv_usec = buf->index + 1; -- - if (V4L2_TYPE_IS_OUTPUT(type)) { - void *addr = mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->video_fd, V4L2_TYPE_IS_MULTIPLANAR(type) ? 
buf->buffer.m.planes[0].m.mem_offset : buf->buffer.m.offset); - if (addr == MAP_FAILED) { -diff --git a/libavcodec/v4l2_request.h b/libavcodec/v4l2_request.h -index d4146bd4ee..72698f6f3c 100644 ---- a/libavcodec/v4l2_request.h -+++ b/libavcodec/v4l2_request.h -@@ -28,6 +28,7 @@ typedef struct V4L2RequestContext { - int media_fd; - enum v4l2_buf_type output_type; - struct v4l2_format format; -+ int timestamp; - } V4L2RequestContext; - - typedef struct V4L2RequestBuffer { diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0013-v4l2-request-hevc-Set-SPS-control-at-initialization.patch b/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0013-v4l2-request-hevc-Set-SPS-control-at-initialization.patch deleted file mode 100644 index c40e19a527..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-request-api/ffmpeg-95.0013-v4l2-request-hevc-Set-SPS-control-at-initialization.patch +++ /dev/null @@ -1,121 +0,0 @@ -From 2756ad266f18d546551a3eab0650c95ddb62e0ff Mon Sep 17 00:00:00 2001 -From: Jernej Skrabec -Date: Sat, 14 Mar 2020 22:21:42 +0000 -Subject: [PATCH] v4l2 request hevc: Set SPS control at initialization - -Signed-off-by: Jernej Skrabec ---- - libavcodec/v4l2_request_hevc.c | 61 ++++++++++++++++++++++------------ - 1 file changed, 40 insertions(+), 21 deletions(-) - -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c -index 94977c5d0e..0ab1c201b0 100644 ---- a/libavcodec/v4l2_request_hevc.c -+++ b/libavcodec/v4l2_request_hevc.c -@@ -231,21 +231,12 @@ static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h, - slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; - } - --static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, -- av_unused const uint8_t *buffer, -- av_unused uint32_t size) -+static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCContext *h) - { -- const HEVCContext *h = avctx->priv_data; - const HEVCSPS *sps = h->ps.sps; -- const HEVCPPS *pps = h->ps.pps; -- const ScalingList *sl = pps->scaling_list_data_present_flag ? -- &pps->scaling_list : -- sps->scaling_list_enable_flag ? -- &sps->scaling_list : NULL; -- V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; - - /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ -- controls->sps = (struct v4l2_ctrl_hevc_sps) { -+ *ctrl = (struct v4l2_ctrl_hevc_sps) { - .chroma_format_idc = sps->chroma_format_idc, - .pic_width_in_luma_samples = sps->width, - .pic_height_in_luma_samples = sps->height, -@@ -270,31 +261,47 @@ static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, - }; - - if (sps->separate_colour_plane_flag) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; - - if (sps->scaling_list_enable_flag) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; - - if (sps->amp_enabled_flag) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; - - if (sps->sao_enabled) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; - - if (sps->pcm_enabled_flag) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; - - if (sps->pcm.loop_filter_disable_flag) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; - - if (sps->long_term_ref_pics_present_flag) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; - - if (sps->sps_temporal_mvp_enabled_flag) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; - - if (sps->sps_strong_intra_smoothing_enable_flag) -- controls->sps.flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; -+ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; -+} -+ -+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ const HEVCSPS *sps = h->ps.sps; -+ const HEVCPPS *pps = h->ps.pps; -+ const ScalingList *sl = pps->scaling_list_data_present_flag ? -+ &pps->scaling_list : -+ sps->scaling_list_enable_flag ? 
-+ &sps->scaling_list : NULL; -+ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; -+ -+ fill_sps(&controls->sps, h); - - if (sl) { - for (int i = 0; i < 6; i++) { -@@ -502,9 +509,21 @@ static int v4l2_request_hevc_set_controls(AVCodecContext *avctx) - - static int v4l2_request_hevc_init(AVCodecContext *avctx) - { -+ const HEVCContext *h = avctx->priv_data; -+ struct v4l2_ctrl_hevc_sps sps; - int ret; - -- ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_HEVC_SLICE, 3 * 1024 * 1024, NULL, 0); -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, -+ .ptr = &sps, -+ .size = sizeof(sps), -+ } -+ }; -+ -+ fill_sps(&sps, h); -+ -+ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_HEVC_SLICE, 3 * 1024 * 1024, control, FF_ARRAY_ELEMS(control)); - if (ret) - return ret; - --- -2.25.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2-rpi/0000-revert-le-patches.patch b/packages/multimedia/ffmpeg/patches/v4l2-rpi/0000-revert-le-patches.patch deleted file mode 100644 index 3cea57002d..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-rpi/0000-revert-le-patches.patch +++ /dev/null @@ -1,78 +0,0 @@ -diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index fd87481a1c..d234271c5b 100644 ---- a/libavcodec/avcodec.h -+++ b/libavcodec/avcodec.h -@@ -2612,7 +2612,6 @@ typedef struct AVCodecContext { - #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. - #define FF_BUG_TRUNCATED 16384 - #define FF_BUG_IEDGE 32768 --#define FF_BUG_GMC_UNSUPPORTED (1<<30) - - /** - * strictly follow the standard (MPEG-4, ...). -diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c -index 1bbb83eda3..12c63245f8 100644 ---- a/libavcodec/libdav1d.c -+++ b/libavcodec/libdav1d.c -@@ -53,16 +53,6 @@ static const enum AVPixelFormat pix_fmt_rgb[3] = { - AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, - }; - --static enum AVPixelFormat libdav1d_get_format(AVCodecContext *avctx, const Dav1dPicture *p) --{ -- enum AVPixelFormat pix_fmts[2], *fmt = pix_fmts; -- -- *fmt++ = pix_fmt[p->p.layout][p->seq_hdr->hbd]; -- *fmt = AV_PIX_FMT_NONE; -- -- return ff_get_format(avctx, pix_fmts); --} -- - static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl) - { - AVCodecContext *c = opaque; -@@ -239,7 +229,6 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame) - c->profile = p->seq_hdr->profile; - c->level = ((p->seq_hdr->operating_points[0].major_level - 2) << 2) - | p->seq_hdr->operating_points[0].minor_level; -- frame->format = c->pix_fmt = libdav1d_get_format(c, p); - frame->width = p->p.w; - frame->height = p->p.h; - if (c->width != p->p.w || c->height != p->p.h) { -diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c -index fa208660c8..055afabc7e 100644 ---- a/libavcodec/mpeg4videodec.c -+++ b/libavcodec/mpeg4videodec.c -@@ -2662,9 +2662,6 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - - if (ctx->divx_version >= 0) - s->workaround_bugs |= FF_BUG_HPEL_CHROMA; -- -- if (ctx->num_sprite_warping_points > 1) -- s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED; - } - - if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2689,7 +2686,6 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, - ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); - -- avctx->workaround_bugs = s->workaround_bugs; - if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && - s->codec_id == AV_CODEC_ID_MPEG4 && - avctx->idct_algo == FF_IDCT_AUTO) { -diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c -index 6fb32fac77..d0df061e4d 100644 ---- a/libswscale/yuv2rgb.c -+++ b/libswscale/yuv2rgb.c -@@ -687,6 +687,10 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c) - if (t) - return t; - -+ av_log(c, AV_LOG_WARNING, -+ "No accelerated colorspace conversion found from %s to %s.\n", -+ av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat)); -+ - switch (c->dstFormat) { - case AV_PIX_FMT_BGR48BE: - case AV_PIX_FMT_BGR48LE: diff --git a/packages/multimedia/ffmpeg/patches/v4l2-rpi/0001-popcornmix-kodi-gbm.patch b/packages/multimedia/ffmpeg/patches/v4l2-rpi/0001-popcornmix-kodi-gbm.patch deleted file mode 100644 index 149aa368ae..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2-rpi/0001-popcornmix-kodi-gbm.patch +++ /dev/null @@ -1,54411 +0,0 @@ -diff --git a/.gitignore b/.gitignore -index 2450ee8fc5..4bcc3ae643 100644 ---- a/.gitignore -+++ b/.gitignore -@@ -1,6 +1,7 @@ - *.a - *.o - *.o.* -+*.bin - *.d - *.def - *.dll -@@ -26,6 +27,7 @@ - .\#* - /.config - /.version -+/build/ - /ffmpeg - /ffplay - /ffprobe -diff --git a/configure b/configure -index 34c2adb4a4..6083a0aa0f 100755 ---- a/configure -+++ b/configure -@@ -271,6 +271,7 @@ External library support: - --enable-libtls enable LibreSSL (via libtls), needed for https support - if openssl, gnutls or mbedtls is not used [no] - --enable-libtwolame enable MP2 encoding via libtwolame [no] -+ --enable-libudev enable libudev [no] - --enable-libv4l2 enable libv4l2/v4l-utils [no] - --enable-libvidstab enable video stabilization using vid.stab [no] - --enable-libvmaf enable vmaf filter via libvmaf [no] -@@ -331,12 +332,16 @@ External library support: - --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] - --enable-libnpp enable Nvidia Performance Primitives-based code [no] - --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] -+ --enable-rpi enable other rpi specific stuff [no] -+ --enable-vout-drm-kludge enable the vout_drm module and associated kludges -+ breaks some normal ffmpeg decodes - for internal testing only [no] - --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] - --disable-nvenc disable Nvidia video encoding code [autodetect] - --enable-omx enable OpenMAX IL code [no] - --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] - --enable-rkmpp enable Rockchip Media Process Platform code [no] - --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] -+ --enable-v4l2-request enable V4L2 request API code [no] - --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] - --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] - --disable-videotoolbox disable VideoToolbox code [autodetect] -@@ -1797,6 +1802,7 @@ EXTERNAL_LIBRARY_LIST=" - libtesseract - libtheora - libtwolame -+ libudev - libv4l2 - libvorbis - libvpx -@@ -1851,6 +1857,9 @@ HWACCEL_LIBRARY_LIST=" - mmal - omx - opencl -+ v4l2_request -+ rpi4_8 -+ rpi4_10 - " - - DOCUMENT_LIST=" -@@ -1866,12 +1875,14 @@ FEATURE_LIST=" - gray - hardcoded_tables - omx_rpi -+ rpi - runtime_cpudetect - safe_bitstream_reader - shared - small - static - swscale_alpha -+ vout_drm_kludge - " - - # this list should be kept in linking order -@@ -1912,6 +1923,7 @@ 
SUBSYSTEM_LIST=" - pixelutils - network - rdft -+ rpi - " - - # COMPONENT_LIST needs to come last to ensure correct dependency checking -@@ -2387,9 +2399,11 @@ CONFIG_EXTRA=" - rangecoder - riffdec - riffenc -+ rpi - rtpdec - rtpenc_chain - rv34dsp -+ sand - scene_sad - sinewin - snappy -@@ -2715,6 +2729,8 @@ hap_decoder_select="snappy texturedsp" - hap_encoder_deps="libsnappy" - hap_encoder_select="texturedspenc" - hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp" -+hevc_rpi_decoder_deps="rpi" -+hevc_rpi_decoder_select="hevc_decoder sand" - huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" - huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" - hymt_decoder_select="huffyuv_decoder" -@@ -2873,6 +2889,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" - dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" - ffnvcodec_deps_any="libdl LoadLibrary" - nvdec_deps="ffnvcodec" -+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" - vaapi_x11_deps="xlib" - videotoolbox_hwaccel_deps="videotoolbox pthreads" - videotoolbox_hwaccel_extralibs="-framework QuartzCore" -@@ -2890,6 +2907,8 @@ h264_dxva2_hwaccel_deps="dxva2" - h264_dxva2_hwaccel_select="h264_decoder" - h264_nvdec_hwaccel_deps="nvdec" - h264_nvdec_hwaccel_select="h264_decoder" -+h264_v4l2request_hwaccel_deps="v4l2_request" -+h264_v4l2request_hwaccel_select="h264_decoder" - h264_vaapi_hwaccel_deps="vaapi" - h264_vaapi_hwaccel_select="h264_decoder" - h264_vdpau_hwaccel_deps="vdpau" -@@ -2904,6 +2923,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" - hevc_dxva2_hwaccel_select="hevc_decoder" - hevc_nvdec_hwaccel_deps="nvdec" - hevc_nvdec_hwaccel_select="hevc_decoder" -+hevc_rpi4_10_hwaccel_deps="rpi" -+hevc_rpi4_10_hwaccel_select="hevc_decoder" -+hevc_rpi4_8_hwaccel_deps="rpi" -+hevc_rpi4_8_hwaccel_select="hevc_decoder" -+hevc_v4l2request_hwaccel_deps="v4l2_request" -+hevc_v4l2request_hwaccel_select="hevc_decoder" - hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" - hevc_vaapi_hwaccel_select="hevc_decoder" - hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" -@@ -2932,6 +2957,8 @@ mpeg2_dxva2_hwaccel_deps="dxva2" - mpeg2_dxva2_hwaccel_select="mpeg2video_decoder" - mpeg2_nvdec_hwaccel_deps="nvdec" - mpeg2_nvdec_hwaccel_select="mpeg2video_decoder" -+mpeg2_v4l2request_hwaccel_deps="v4l2_request mpeg2_v4l2_request" -+mpeg2_v4l2request_hwaccel_select="mpeg2video_decoder" - mpeg2_vaapi_hwaccel_deps="vaapi" - mpeg2_vaapi_hwaccel_select="mpeg2video_decoder" - mpeg2_vdpau_hwaccel_deps="vdpau" -@@ -2964,6 +2991,8 @@ vp8_nvdec_hwaccel_deps="nvdec" - vp8_nvdec_hwaccel_select="vp8_decoder" - vp8_vaapi_hwaccel_deps="vaapi" - vp8_vaapi_hwaccel_select="vp8_decoder" -+vp8_v4l2request_hwaccel_deps="v4l2_request" -+vp8_v4l2request_hwaccel_select="vp8_decoder" - vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9" - vp9_d3d11va_hwaccel_select="vp9_decoder" - vp9_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_VP9" -@@ -3343,8 +3372,12 @@ sndio_indev_deps="sndio" - sndio_outdev_deps="sndio" - v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_indev_suggest="libv4l2" -+v4l2_outdev_deps="libdrm" - v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_outdev_suggest="libv4l2" -+vout_drm_outdev_deps="libdrm vout_drm_kludge" -+vout_rpi_outdev_deps="rpi" -+vout_rpi_outdev_select="sand" - vfwcap_indev_deps="vfw32 vfwcap_defines" - xcbgrab_indev_deps="libxcb" - xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" -@@ -3543,6 +3576,8 @@ 
tonemap_filter_deps="const_nan" - tonemap_opencl_filter_deps="opencl const_nan" - transpose_opencl_filter_deps="opencl" - transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" -+unsand_filter_deps="rpi" -+unsand_filter_select="sand" - unsharp_opencl_filter_deps="opencl" - uspp_filter_deps="gpl avcodec" - vaguedenoiser_filter_deps="gpl" -@@ -6270,6 +6305,7 @@ enabled libtls && require_pkg_config libtls libtls tls.h tls_configur - enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame && - { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || - die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } -+enabled libudev && require_pkg_config libudev libudev libudev.h udev_new - enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl - enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit - enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 1.3.9" libvmaf.h compute_vmaf -@@ -6324,11 +6360,12 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt - check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto || - die "ERROR: mbedTLS not found"; } - enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; } --enabled mmal && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || -+( enabled rpi || -+ enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || - { ! enabled cross_compile && - add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && - add_ldflags -L/opt/vc/lib/ && -- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } || -+ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } || - die "ERROR: mmal not found" && - check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } - enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do -@@ -6365,6 +6402,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r - { enabled libdrm || - die "ERROR: rkmpp requires --enable-libdrm"; } - } -+enabled v4l2_request && { enabled libdrm || -+ die "ERROR: v4l2-request requires --enable-libdrm"; } && -+ { enabled libudev || -+ die "ERROR: v4l2-request requires --enable-libudev"; } - enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init - - -@@ -6444,6 +6485,12 @@ check_cc h264_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_H264;" - check_cc vp8_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP8;" - check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" - -+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns -+check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" -+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" -+check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" -+check_cc vp8_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP8_FRAME;" -+ - check_headers sys/videoio.h - test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 
0;" && enable_sanitized struct_v4l2_frmivalenum_discrete - -diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index 01f04103cf..5d53248137 100644 ---- a/fftools/ffmpeg.c -+++ b/fftools/ffmpeg.c -@@ -2141,8 +2141,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) - ifilter->channel_layout != frame->channel_layout; - break; - case AVMEDIA_TYPE_VIDEO: -- need_reinit |= ifilter->width != frame->width || -- ifilter->height != frame->height; -+ need_reinit |= ifilter->width != av_frame_cropped_width(frame) || -+ ifilter->height != av_frame_cropped_height(frame); - break; - } - -@@ -2390,6 +2390,8 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ - if (ist->dec_ctx->codec_id == AV_CODEC_ID_H264) { - ist->st->codecpar->video_delay = ist->dec_ctx->has_b_frames; - } else -+ { -+#if 0 - av_log(ist->dec_ctx, AV_LOG_WARNING, - "video_delay is larger in decoder than demuxer %d > %d.\n" - "If you want to help, upload a sample " -@@ -2397,6 +2399,8 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ - "and contact the ffmpeg-devel mailing list. (ffmpeg-devel@ffmpeg.org)\n", - ist->dec_ctx->has_b_frames, - ist->st->codecpar->video_delay); -+#endif -+ } - } - - if (ret != AVERROR_EOF) -@@ -2423,12 +2427,13 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ - decoded_frame->top_field_first = ist->top_field_first; - - ist->frames_decoded++; -- -+#if !CONFIG_VOUT_DRM_KLUDGE - if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { - err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); - if (err < 0) - goto fail; - } -+#endif - ist->hwaccel_retrieved_pix_fmt = decoded_frame->format; - - best_effort_timestamp= decoded_frame->best_effort_timestamp; -@@ -2936,6 +2941,15 @@ static int init_input_stream(int ist_index, char *error, int error_len) - return ret; - } - -+#if CONFIG_HEVC_RPI_DECODER -+ ret = -1; -+ if (strcmp(codec->name, "hevc_rpi") == 0 && -+ (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { -+ ist->dec = codec = avcodec_find_decoder_by_name("hevc"); -+ av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n"); -+ } -+ if (ret < 0) -+#endif - if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { - if (ret == AVERROR_EXPERIMENTAL) - abort_codec_experimental(codec, 0); -diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h -index 7b6f802082..38de432ae6 100644 ---- a/fftools/ffmpeg.h -+++ b/fftools/ffmpeg.h -@@ -62,6 +62,7 @@ enum HWAccelID { - HWACCEL_VIDEOTOOLBOX, - HWACCEL_QSV, - HWACCEL_CUVID, -+ HWACCEL_RPI, - }; - - typedef struct HWAccel { -@@ -655,6 +656,7 @@ int ffmpeg_parse_options(int argc, char **argv); - int videotoolbox_init(AVCodecContext *s); - int qsv_init(AVCodecContext *s); - int cuvid_init(AVCodecContext *s); -+int rpi_init(AVCodecContext *s); - - HWDevice *hw_device_get_by_name(const char *name); - int hw_device_init_from_string(const char *arg, HWDevice **dev); -diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c -index 72838de1e2..6922cedc5d 100644 ---- a/fftools/ffmpeg_filter.c -+++ b/fftools/ffmpeg_filter.c -@@ -1188,8 +1188,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) - - ifilter->format = frame->format; - -- ifilter->width = frame->width; -- ifilter->height = frame->height; -+ ifilter->width = av_frame_cropped_width(frame); -+ ifilter->height = av_frame_cropped_height(frame); - ifilter->sample_aspect_ratio = 
frame->sample_aspect_ratio; - - ifilter->sample_rate = frame->sample_rate; -diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c -index f5ca18aa64..1c4e0d4ef4 100644 ---- a/fftools/ffmpeg_opt.c -+++ b/fftools/ffmpeg_opt.c -@@ -65,6 +65,12 @@ - }\ - } - -+#if CONFIG_RPI -+int rpi_init(AVCodecContext *avctx) { -+ return 0; -+} -+#endif -+ - const HWAccel hwaccels[] = { - #if CONFIG_VIDEOTOOLBOX - { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX }, -@@ -74,6 +80,10 @@ const HWAccel hwaccels[] = { - #endif - #if CONFIG_CUVID - { "cuvid", cuvid_init, HWACCEL_CUVID, AV_PIX_FMT_CUDA }, -+#endif -+#if CONFIG_RPI -+ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 }, -+ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 }, - #endif - { 0 }, - }; -@@ -702,7 +712,9 @@ static AVCodec *choose_decoder(OptionsContext *o, AVFormatContext *s, AVStream * - st->codecpar->codec_id = codec->id; - return codec; - } else -+ { - return avcodec_find_decoder(st->codecpar->codec_id); -+ } - } - - /* Add all the streams from the given input file to the global -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 3cd73fbcc6..3a0f1dbc25 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -6,6 +6,7 @@ HEADERS = ac3_parser.h \ - avcodec.h \ - avdct.h \ - avfft.h \ -+ rpi_zc.h \ - d3d11va.h \ - dirac.h \ - dv_profile.h \ -@@ -132,6 +133,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o - OBJS-$(CONFIG_QSVENC) += qsvenc.o - OBJS-$(CONFIG_RANGECODER) += rangecoder.o - OBJS-$(CONFIG_RDFT) += rdft.o -+OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o - OBJS-$(CONFIG_RV34DSP) += rv34dsp.o - OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o - OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o -@@ -147,6 +149,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o - OBJS-$(CONFIG_VP56DSP) += vp56dsp.o - OBJS-$(CONFIG_VP8DSP) += vp8dsp.o - OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o -+OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_request.o v4l2_phase.o - OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o - OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o - -@@ -368,6 +371,15 @@ OBJS-$(CONFIG_HCOM_DECODER) += hcom.o - OBJS-$(CONFIG_HEVC_DECODER) += hevcdec.o hevc_mvs.o \ - hevc_cabac.o hevc_refs.o hevcpred.o \ - hevcdsp.o hevc_filter.o hevc_data.o -+OBJS-$(CONFIG_RPI) += rpi_mem.o \ -+ rpi_mailbox.o rpi_zc.o -+OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \ -+ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \ -+ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \ -+ rpi_hevc_shader.o rpi_hevc_shader_template.o \ -+ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \ -+ rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o -+OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuvid.o - OBJS-$(CONFIG_HEVC_AMF_ENCODER) += amfenc_hevc.o - OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuviddec.o - OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o -@@ -871,6 +883,7 @@ OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o - OBJS-$(CONFIG_H264_DXVA2_HWACCEL) += dxva2_h264.o - OBJS-$(CONFIG_H264_NVDEC_HWACCEL) += nvdec_h264.o - OBJS-$(CONFIG_H264_QSV_HWACCEL) += qsvdec_h2645.o -+OBJS-$(CONFIG_H264_V4L2REQUEST_HWACCEL) += v4l2_request_h264.o - OBJS-$(CONFIG_H264_VAAPI_HWACCEL) += vaapi_h264.o - OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o - OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o -@@ -878,8 +891,11 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o - OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o - OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += 
qsvdec_h2645.o -+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o - OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o - OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o -+OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o -+OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o - OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o - OBJS-$(CONFIG_MJPEG_VAAPI_HWACCEL) += vaapi_mjpeg.o - OBJS-$(CONFIG_MPEG1_NVDEC_HWACCEL) += nvdec_mpeg12.o -@@ -890,6 +906,7 @@ OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL) += dxva2_mpeg2.o - OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o - OBJS-$(CONFIG_MPEG2_NVDEC_HWACCEL) += nvdec_mpeg12.o - OBJS-$(CONFIG_MPEG2_QSV_HWACCEL) += qsvdec_other.o -+OBJS-$(CONFIG_MPEG2_V4L2REQUEST_HWACCEL) += v4l2_request_mpeg2.o - OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o - OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o - OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o -@@ -906,6 +923,7 @@ OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o - OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o - OBJS-$(CONFIG_VP8_NVDEC_HWACCEL) += nvdec_vp8.o - OBJS-$(CONFIG_VP8_VAAPI_HWACCEL) += vaapi_vp8.o -+OBJS-$(CONFIG_VP8_V4L2REQUEST_HWACCEL) += v4l2_request_vp8.o - OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL) += dxva2_vp9.o - OBJS-$(CONFIG_VP9_DXVA2_HWACCEL) += dxva2_vp9.o - OBJS-$(CONFIG_VP9_NVDEC_HWACCEL) += nvdec_vp9.o -@@ -1223,3 +1241,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h - $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h - $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h - endif -+ -+ifdef CONFIG_HEVC_RPI_DECODER -+QASM_PY := ../local/bin/qasm.py -+VASMVIDCORE := ../local/bin/vasmvidcore_std -+ -+ifneq ("$(wildcard $(QASM_PY))","") -+$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm -+ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ -+ -+$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm -+ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ -+endif -+ -+ifneq ("$(wildcard $(VASMVIDCORE))","") -+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s -+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ -+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s -+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ -+ -+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin -+ python pi-util/make_array.py $< -+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin -+ python pi-util/make_array.py $< -+endif -+ -+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h -+$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h -+endif -diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index d2f9a39ce5..63dd7528ce 100644 ---- a/libavcodec/allcodecs.c -+++ b/libavcodec/allcodecs.c -@@ -148,6 +148,7 @@ extern AVCodec ff_hap_decoder; - extern AVCodec ff_hevc_decoder; - extern AVCodec ff_hevc_qsv_decoder; - extern AVCodec ff_hevc_rkmpp_decoder; -+extern AVCodec ff_hevc_rpi_decoder; - extern AVCodec ff_hevc_v4l2m2m_decoder; - extern AVCodec ff_hnm4_video_decoder; - extern AVCodec ff_hq_hqa_decoder; -@@ -861,6 +862,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id) - } - } - -+static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) -+{ -+ const enum AVPixelFormat *pf = p->pix_fmts; -+ -+ // Assume good if we lack info -+ if (pf == NULL) -+ return 1; -+ if (fmt == AV_PIX_FMT_NONE) -+ return 0; -+ -+ for (; *pf != AV_PIX_FMT_NONE; ++pf) 
{ -+ if (*pf == fmt) -+ return 1; -+ } -+ return 0; -+} -+ -+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt) -+{ -+ const AVCodec *p, *experimental = NULL; -+ void *i = 0; -+ -+ id= remap_deprecated_codec_id(id); -+ while ((p = av_codec_iterate(&i))) { -+ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) { -+ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) { -+ experimental = p; -+ } else -+ return (AVCodec *)p; -+ } -+ p = p->next; -+ } -+ return (AVCodec *)experimental; -+} -+ - static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *)) - { - const AVCodec *p, *experimental = NULL; -diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index e656011c3c..f8801dfab6 100644 ---- a/libavcodec/arm/Makefile -+++ b/libavcodec/arm/Makefile -@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ - arm/sbrdsp_init_arm.o - OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o - OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o -+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \ -+ arm/rpi_hevcpred_init_arm.o - OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o - OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o - OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o -@@ -136,10 +138,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ - NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o - NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o - NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ -+ arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_deblock_neon.o \ - arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_qpel_neon.o \ - arm/hevcdsp_sao_neon.o -+NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \ -+ arm/rpi_hevc_misc_neon.o \ -+ arm/rpi_hevcdsp_deblock_neon.o \ -+ arm/rpi_hevcdsp_idct_neon.o \ -+ arm/rpi_hevcdsp_res8_neon.o \ -+ arm/rpi_hevcdsp_res16_neon.o \ -+ arm/rpi_hevcdsp_sao_neon.o \ -+ arm/rpi_hevcpred_init_neon.o \ -+ arm/rpi_hevcpred_intra_angular_neon.o \ -+ arm/rpi_hevcpred_intra_dc_neon.o \ -+ arm/rpi_hevcpred_intra_filter_neon.o \ -+ arm/rpi_hevcpred_intra_hv_neon.o \ -+ arm/rpi_hevcpred_intra_planar_neon.o - NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o - NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ - arm/rv40dsp_neon.o -diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h -index fdbf86b45e..4755f20e2e 100644 ---- a/libavcodec/arm/cabac.h -+++ b/libavcodec/arm/cabac.h -@@ -26,83 +26,209 @@ - #include "libavutil/internal.h" - #include "libavcodec/cabac.h" - -+ - #define get_cabac_inline get_cabac_inline_arm - static av_always_inline int get_cabac_inline_arm(CABACContext *c, -- uint8_t *const state) -+ uint8_t *state) - { -- int bit; -- void *reg_b, *reg_c, *tmp; -+ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128; -+ int bit, ptr, low, tmp1, tmp2; -+ __asm__ volatile ( -+ "ldr %[bit], [%[c], %[range_off]] \n\t" -+ "ldrb %[ptr], [%[state]] \n\t" -+ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t" -+ "and %[tmp2], %[bit], #0xc0 \n\t" -+ "add %[tmp1], %[tmp1], %[ptr] \n\t" -+ "ldr %[low], [%[c], %[low_off]] \n\t" -+ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t" -+ "sub %[bit], %[bit], %[tmp2] \n\t" -+ "mov %[tmp1], %[bit] \n\t" -+ "cmp %[low], %[bit], lsl #17 \n\t" -+ "itt ge \n\t" -+ "movge %[tmp1], %[tmp2] \n\t" -+ "mvnge %[ptr], %[ptr] \n\t" -+ "clz %[tmp2], %[tmp1] \n\t" -+ "it ge \n\t" -+ "subge %[low], %[low], %[bit], lsl #17 \n\t" 
-+ "sub %[tmp2], %[tmp2], #23 \n\t" -+ "and %[bit], %[ptr], #1 \n\t" -+ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t" -+ "lsl %[low], %[low], %[tmp2] \n\t" -+ "lsls %[ptr], %[low], #16 \n\t" -+ "bne 1f \n\t" -+ "ldr %[ptr], [%[c], %[ptr_off]] \n\t" -+ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t" -+#if UNCHECKED_BITSTREAM_READER -+ "strb %[mlps_tables], [%[state]] \n\t" -+ "rbit %[state], %[low] \n\t" -+ "ldrh %[tmp1], [%[ptr]], #2 \n\t" -+#else -+ "ldr %[tmp1], [%[c], %[end_off]] \n\t" -+ "strb %[mlps_tables], [%[state]] \n\t" -+ "rbit %[state], %[low] \n\t" -+ "cmp %[tmp1], %[ptr] \n\t" -+#if CONFIG_THUMB -+ "it cs \n\t" -+ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t" -+#else -+ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t" -+#endif -+#endif -+ "clz %[state], %[state] \n\t" -+ "movw %[mlps_tables], #0xffff \n\t" -+ "sub %[state], %[state], #16 \n\t" -+ "str %[tmp2], [%[c], %[range_off]] \n\t" -+ "rev %[tmp1], %[tmp1] \n\t" -+ "str %[ptr], [%[c], %[ptr_off]] \n\t" -+ "lsr %[tmp1], %[tmp1], #15 \n\t" -+ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t" -+#if CONFIG_THUMB -+ "lsl %[tmp1], %[tmp1], %[state] \n\t" -+ "add %[low], %[low], %[tmp1] \n\t" -+#else -+ "add %[low], %[low], %[tmp1], lsl %[state] \n\t" -+#endif -+ "str %[low], [%[c], %[low_off]] \n\t" -+ "b 2f \n\t" -+ "1: \n\t" -+ "strb %[mlps_tables], [%[state]] \n\t" -+ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t" -+ "str %[low], [%[c], %[low_off]] \n\t" -+ "str %[tmp1], [%[c], %[range_off]] \n\t" -+ "2: \n\t" -+ : // Outputs -+ [state]"+r"(state), -+ [mlps_tables]"+r"(mlps_tables), -+ [bit]"=&r"(bit), -+ [ptr]"=&r"(ptr), -+ [low]"=&r"(low), -+ [tmp1]"=&r"(tmp1), -+ [tmp2]"=&r"(tmp2) -+ : // Inputs -+ [c]"r"(c), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [end_off]"J"(offsetof(CABACContext, bytestream_end)), -+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ return bit; -+} - -- __asm__ volatile( -- "ldrb %[bit] , [%[state]] \n\t" -- "add %[r_b] , %[tables] , %[lps_off] \n\t" -- "mov %[tmp] , %[range] \n\t" -- "and %[range] , %[range] , #0xC0 \n\t" -- "add %[r_b] , %[r_b] , %[bit] \n\t" -- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t" -- "add %[r_b] , %[tables] , %[norm_off] \n\t" -- "sub %[r_c] , %[tmp] , %[range] \n\t" -- "lsl %[tmp] , %[r_c] , #17 \n\t" -- "cmp %[tmp] , %[low] \n\t" -- "it gt \n\t" -- "movgt %[range] , %[r_c] \n\t" -- "itt cc \n\t" -- "mvncc %[bit] , %[bit] \n\t" -- "subcc %[low] , %[low] , %[tmp] \n\t" -- "add %[r_c] , %[tables] , %[mlps_off] \n\t" -- "ldrb %[tmp] , [%[r_b], %[range]] \n\t" -- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t" -- "lsl %[low] , %[low] , %[tmp] \n\t" -- "lsl %[range] , %[range] , %[tmp] \n\t" -- "uxth %[r_c] , %[low] \n\t" -- "strb %[r_b] , [%[state]] \n\t" -- "tst %[r_c] , %[r_c] \n\t" -- "bne 2f \n\t" -- "ldr %[r_c] , [%[c], %[byte]] \n\t" -+#define get_cabac_bypass get_cabac_bypass_arm -+static inline int get_cabac_bypass_arm(CABACContext * const c) -+{ -+ uint32_t low = c->low, range, ptr, tmp; -+ int rv; -+ __asm volatile ( -+ "ldr %[range] , [%[c], %[range_off]] \n\t" -+ "mov %[rv] , #0 \n\t" -+ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "lsl %[low] , #1 \n\t" -+#if !UNCHECKED_BITSTREAM_READER -+ "ldr %[tmp] , [%[c], %[end_off]] \n\t" -+#endif -+ "cmp %[low] , %[range], lsl #17 \n\t" -+ "itt cs \n\t" -+ "subcs %[low] , %[low], %[range], lsl #17 \n\t" -+ "movcs %[rv] , #1 \n\t" - #if UNCHECKED_BITSTREAM_READER -- "ldrh %[tmp] , [%[r_c]] \n\t" -- 
"add %[r_c] , %[r_c] , #2 \n\t" -- "str %[r_c] , [%[c], %[byte]] \n\t" -+ "ldrh %[tmp] , [%[ptr]], #2 \n\t" -+#else -+ "cmp %[tmp] , %[ptr] \n\t" -+#if CONFIG_THUMB -+ "it cs \n\t" -+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" - #else -- "ldr %[r_b] , [%[c], %[end]] \n\t" -- "ldrh %[tmp] , [%[r_c]] \n\t" -- "cmp %[r_c] , %[r_b] \n\t" -- "itt lt \n\t" -- "addlt %[r_c] , %[r_c] , #2 \n\t" -- "strlt %[r_c] , [%[c], %[byte]] \n\t" -+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" -+#endif - #endif -- "sub %[r_c] , %[low] , #1 \n\t" -- "add %[r_b] , %[tables] , %[norm_off] \n\t" -- "eor %[r_c] , %[low] , %[r_c] \n\t" -- "rev %[tmp] , %[tmp] \n\t" -- "lsr %[r_c] , %[r_c] , #15 \n\t" -- "lsr %[tmp] , %[tmp] , #15 \n\t" -- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t" -- "movw %[r_b] , #0xFFFF \n\t" -- "sub %[tmp] , %[tmp] , %[r_b] \n\t" -- "rsb %[r_c] , %[r_c] , #7 \n\t" -- "lsl %[tmp] , %[tmp] , %[r_c] \n\t" -- "add %[low] , %[low] , %[tmp] \n\t" -- "2: \n\t" -- : [bit]"=&r"(bit), -- [low]"+&r"(c->low), -- [range]"+&r"(c->range), -- [r_b]"=&r"(reg_b), -- [r_c]"=&r"(reg_c), -- [tmp]"=&r"(tmp) -- : [c]"r"(c), -- [state]"r"(state), -- [tables]"r"(ff_h264_cabac_tables), -- [byte]"M"(offsetof(CABACContext, bytestream)), -- [end]"M"(offsetof(CABACContext, bytestream_end)), -- [norm_off]"I"(H264_NORM_SHIFT_OFFSET), -- [lps_off]"I"(H264_LPS_RANGE_OFFSET), -- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) -- : "memory", "cc" -- ); -+ "lsls %[range] , %[low], #16 \n\t" -+ "bne 1f \n\t" - -- return bit & 1; -+ "str %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "add %[low] , %[low], %[tmp], lsr #15 \n\t" -+ "movw %[tmp] , 0xFFFF \n\t" -+ "sub %[low] , %[tmp] \n\t" -+ "1: \n\t" -+ "str %[low] , [%[c], %[low_off]] \n\t" -+ : // Outputs -+ [rv]"=&r"(rv), -+ [low]"+r"(low), -+ [range]"=&r"(range), -+ [ptr]"=&r"(ptr), -+ [tmp]"=&r"(tmp) -+ : // Inputs -+ [c]"r"(c), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [end_off]"J"(offsetof(CABACContext, bytestream_end)) -+ : // Clobbers -+ "memory", "cc" -+ ); -+ return rv; - } -+ -+ -+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm -+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv) -+{ -+ uint32_t low = c->low, range, ptr, tmp; -+ __asm volatile ( -+ "ldr %[range] , [%[c], %[range_off]] \n\t" -+ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "lsl %[low] , #1 \n\t" -+#if !UNCHECKED_BITSTREAM_READER -+ "ldr %[tmp] , [%[c], %[end_off]] \n\t" -+#endif -+ "cmp %[low] , %[range], lsl #17 \n\t" -+ "it cs \n\t" -+ "subcs %[low] , %[low], %[range], lsl #17 \n\t" -+ "it cc \n\t" -+ "rsbcc %[rv] , %[rv], #0 \n\t" -+#if UNCHECKED_BITSTREAM_READER -+ "ldrh %[tmp] , [%[ptr]], #2 \n\t" -+#else -+ "cmp %[tmp] , %[ptr] \n\t" -+#if CONFIG_THUMB -+ "it cs \n\t" -+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" -+#else -+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" -+#endif -+#endif -+ "lsls %[range] , %[low], #16 \n\t" -+ "bne 1f \n\t" -+ -+ "str %[ptr] , [%[c], %[ptr_off]] \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "add %[low] , %[low], %[tmp], lsr #15 \n\t" -+ "movw %[tmp] , 0xFFFF \n\t" -+ "sub %[low] , %[tmp] \n\t" -+ "1: \n\t" -+ "str %[low] , [%[c], %[low_off]] \n\t" -+ : // Outputs -+ [rv]"+r"(rv), -+ [low]"+r"(low), -+ [range]"=&r"(range), -+ [ptr]"=&r"(ptr), -+ [tmp]"=&r"(tmp) -+ : // Inputs -+ [c]"r"(c), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ 
[end_off]"J"(offsetof(CABACContext, bytestream_end)) -+ : // Clobbers -+ "memory", "cc" -+ ); -+ return rv; -+} -+ - #endif /* HAVE_ARMV6T2_INLINE */ - - #endif /* AVCODEC_ARM_CABAC_H */ -diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h -new file mode 100644 -index 0000000000..c88dec6eff ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_cabac.h -@@ -0,0 +1,607 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_HEVC_CABAC_H -+#define AVCODEC_ARM_HEVC_CABAC_H -+ -+#include "config.h" -+#if HAVE_ARMV6T2_INLINE -+ -+#define hevc_mem_bits32 hevc_mem_bits32_arm -+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits) -+{ -+ unsigned int n; -+ __asm__ ( -+ "rev %[n], %[x] \n\t" -+ : [n]"=r"(n) -+ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3))) -+ : -+ ); -+ return n << (bits & 7); -+} -+ -+ -+// --------------------------------------------------------------------------- -+// -+// Helper fns - little bits of code where ARM has an instraction that the -+// compiler doesn't know about / use -+ -+#define trans_scale_sat trans_scale_sat_arm -+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) -+{ -+ int rv; -+ int t = ((level * (int)(scale * scale_m)) >> shift) + 1; -+ -+ __asm__ ( -+ "ssat %[rv], #16, %[t], ASR #1 \n\t" -+ : [rv]"=r"(rv) -+ : [t]"r"(t) -+ : -+ ); -+ return rv; -+} -+ -+#define update_rice update_rice_arm -+static inline void update_rice_arm(uint8_t * const stat_coeff, -+ const unsigned int last_coeff_abs_level_remaining, -+ const unsigned int c_rice_param) -+{ -+ int t = last_coeff_abs_level_remaining << 1; -+ __asm__ ( -+ "lsrs %[t], %[t], %[shift] \n\t" -+ -+ "it eq \n\t" -+ "subeq %[stat], %[stat], #1 \n\t" -+ "cmp %[t], #6 \n\t" -+ "adc %[stat], %[stat], #0 \n\t" -+ "usat %[stat], #8, %[stat] \n\t" -+ : [stat]"+r"(*stat_coeff), -+ [t]"+r"(t) -+ : [shift]"r"(c_rice_param) -+ : "cc" -+ ); -+} -+ -+// --------------------------------------------------------------------------- -+// -+// CABAC get loops -+// -+// Where the loop is simple enough we can normally do 10-30% better than the -+// compiler -+ -+// Get the residual greater than 1 bits -+ -+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm -+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n, -+ uint8_t * const state0) -+{ -+ unsigned int i, reg_b, st, tmp, bit, rv; -+ __asm__ ( -+ "mov %[i] , #0 \n\t" -+ "mov %[rv] , #0 \n\t" -+ "1: \n\t" -+ "add %[i] , %[i] , #1 \n\t" -+ "cmp %[rv] , #0 \n\t" -+ "ite eq \n\t" -+ "usateq %[st] , #2 , %[i] \n\t" -+ "movne %[st] , #0 \n\t" -+ "sub 
%[r_b] , %[mlps_tables], %[lps_off] \n\t" -+ "and %[tmp] , %[range] , #0xC0 \n\t" -+ -+ "ldrb %[bit] , [%[state0], %[st]] \n\t" -+ "add %[r_b] , %[r_b] , %[bit] \n\t" -+ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" -+ "sub %[range] , %[range] , %[tmp] \n\t" -+ -+ "cmp %[low] , %[range], lsl #17 \n\t" -+ "ittt ge \n\t" -+ "subge %[low] , %[low] , %[range], lsl #17 \n\t" -+ "movge %[range] , %[tmp] \n\t" -+ "mvnge %[bit] , %[bit] \n\t" -+ -+ "clz %[tmp] , %[range] \n\t" -+ "sub %[tmp] , #23 \n\t" -+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" -+ "and %[bit] , %[bit] , #1 \n\t" -+ "strb %[r_b] , [%[state0], %[st]] \n\t" -+ "lsl %[low] , %[low] , %[tmp] \n\t" -+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" -+ "lsl %[range] , %[range] , %[tmp] \n\t" -+ -+// There is a small speed gain from combining both conditions, using a single -+// branch and then working out what that meant later -+ "lsls %[tmp] , %[low] , #16 \n\t" -+ "it ne \n\t" -+ "cmpne %[n] , %[i] \n\t" -+ "bne 1b \n\t" -+ -+// If reload is not required then we must have run out of flags to decode -+ "tst %[tmp] , %[tmp] \n\t" -+ "bne 2f \n\t" -+ -+// Do reload -+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" -+ "rbit %[bit] , %[low] \n\t" -+ "movw %[r_b] , #0xFFFF \n\t" -+ "clz %[bit] , %[bit] \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "sub %[bit] , %[bit] , #16 \n\t" -+ "cmp %[n] , %[i] \n\t" -+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" -+ -+#if CONFIG_THUMB -+ "lsl %[tmp] , %[tmp] , %[bit] \n\t" -+ "add %[low] , %[low] , %[tmp] \n\t" -+#else -+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" -+#endif -+ -+ "bne 1b \n\t" -+ "2: \n\t" -+ : [bit]"=&r"(bit), -+ [low]"+r"(c->low), -+ [range]"+r"(c->range), -+ [r_b]"=&r"(reg_b), -+ [bptr]"+r"(c->bytestream), -+ [i]"=&r"(i), -+ [tmp]"=&r"(tmp), -+ [st]"=&r"(st), -+ [rv]"=&r"(rv) -+ : [state0]"r"(state0), -+ [n]"r"(n), -+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), -+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) -+ : "memory", "cc" -+ ); -+ return rv; -+} -+ -+ -+// n must be > 0 on entry -+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm -+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0, -+ unsigned int n, -+ const uint8_t * ctx_map, -+ uint8_t * p) -+{ -+ unsigned int reg_b, tmp, st, bit; -+ __asm__ ( -+// Get bin from map -+#if CONFIG_THUMB -+ "add %[ctx_map] , %[n] \n\t" -+ "ldrb %[st] , [%[ctx_map]] \n\t" -+#else -+ "ldrb %[st] , [%[ctx_map], %[n]]! \n\t" -+#endif -+ "1: \n\t" -+ -+// Load state & ranges -+ "ldrb %[bit] , [%[state0], %[st]] \n\t" -+ "and %[tmp] , %[range] , #0xC0 \n\t" -+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" -+ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t" -+ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t" -+ "sub %[range] , %[range] , %[tmp] \n\t" -+ -+ "cmp %[low] , %[range], lsl #17 \n\t" -+ "ittt ge \n\t" -+ "mvnge %[bit] , %[bit] \n\t" -+ "subge %[low] , %[low] , %[range], lsl #17 \n\t" -+ "movge %[range] , %[tmp] \n\t" -+ -+// Renorm -+ "clz %[tmp] , %[range] \n\t" -+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" -+ "sub %[tmp] , #23 \n\t" -+ "strb %[r_b] , [%[state0], %[st]] \n\t" -+ "tst %[bit] , #1 \n\t" -+ "ldrb %[st] , [%[ctx_map], #-1]! 
\n\t" -+ "lsl %[low] , %[low] , %[tmp] \n\t" -+// GCC asm seems to need strbne written differently for thumb and arm -+#if CONFIG_THUMB -+ "it ne \n\t" -+ "strbne %[n] , [%[idx]] , #1 \n\t" -+#else -+ "strneb %[n] , [%[idx]] , #1 \n\t" -+#endif -+ -+// There is a small speed gain from combining both conditions, using a single -+// branch and then working out what that meant later -+ "subs %[n] , %[n] , #1 \n\t" -+ "lsl %[range] , %[range] , %[tmp] \n\t" -+#if CONFIG_THUMB -+ "itt ne \n\t" -+ "lslsne %[tmp] , %[low] , #16 \n\t" -+#else -+ "lslnes %[tmp] , %[low] , #16 \n\t" -+#endif -+ "bne 1b \n\t" -+ -+// If we have bits left then n must be 0 so give up now -+ "lsls %[tmp] , %[low] , #16 \n\t" -+ "bne 2f \n\t" -+ -+// Do reload -+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" -+ "rbit %[bit] , %[low] \n\t" -+ "movw %[r_b] , #0xFFFF \n\t" -+ "clz %[bit] , %[bit] \n\t" -+ "cmp %[n] , #0 \n\t" -+ "rev %[tmp] , %[tmp] \n\t" -+ "sub %[bit] , %[bit] , #16 \n\t" -+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" -+ -+#if CONFIG_THUMB -+ "lsl %[tmp] , %[tmp] , %[bit] \n\t" -+ "add %[low] , %[low] , %[tmp] \n\t" -+#else -+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" -+#endif -+ -+// Check to see if we still have more to do -+ "bne 1b \n\t" -+ "2: \n\t" -+ : [bit]"=&r"(bit), -+ [low]"+r"(c->low), -+ [range]"+r"(c->range), -+ [r_b]"=&r"(reg_b), -+ [bptr]"+r"(c->bytestream), -+ [idx]"+r"(p), -+ [n]"+r"(n), -+ [tmp]"=&r"(tmp), -+ [st]"=&r"(st), -+ [ctx_map]"+r"(ctx_map) -+ : [state0]"r"(state0), -+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), -+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) -+ : "memory", "cc" -+ ); -+ -+ return p; -+} -+ -+// --------------------------------------------------------------------------- -+// -+// CABAC_BY22 functions -+ -+ -+#define get_cabac_by22_start get_cabac_by22_start_arm -+static inline void get_cabac_by22_start_arm(CABACContext * const c) -+{ -+ const uint8_t *ptr = c->bytestream; -+ register uint32_t low __asm__("r1"), range __asm__("r2"); -+ uint32_t m, range8, bits; -+#if !USE_BY22_DIV -+ uintptr_t inv; -+#endif -+ -+ av_assert2(offsetof (CABACContext, low) == 0); -+ av_assert2(offsetof (CABACContext, range) == 4); -+ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); -+ __asm__ volatile ( -+ "ldmia %[c], {%[low], %[range]} \n\t" -+ : // Outputs -+ [low]"=r"(low), -+ [range]"=r"(range) -+ : // Inputs -+ [c]"r"(c) -+ : // Clobbers -+ ); -+#if !USE_BY22_DIV -+ inv = (uintptr_t)cabac_by22_inv_range; -+#endif -+ __asm__ volatile ( -+ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" -+#if !USE_BY22_DIV -+ "uxtb %[range8], %[range] \n\t" -+#endif -+ "rbit %[bits], %[low] \n\t" -+ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t" -+ "clz %[bits], %[bits] \n\t" -+ "str %[ptr], [%[c], %[ptr_off]] \n\t" -+ "rev %[m], %[m] \n\t" -+ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" -+ "eor %[m], %[m], #0x80000000 \n\t" -+#if !USE_BY22_DIV -+ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" -+ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" -+ "str %[range], [%[c], %[bits_off]] \n\t" -+#else -+ "strh %[bits], [%[c], %[bits_off]] \n\t" -+#endif -+#if CONFIG_THUMB -+ "lsr %[m], %[ptr] \n\t" -+ "eor %[range], %[low], %[m] \n\t" -+#else -+ "eor %[range], %[low], %[m], lsr %[ptr] \n\t" -+#endif -+ : // Outputs -+ [ptr]"+&r"(ptr), -+ [low]"+&r"(low), -+ [range]"+&r"(range), -+#if !USE_BY22_DIV -+ [inv]"+&r"(inv), -+#endif -+ [m]"=&r"(m), -+ [range8]"=&r"(range8), -+ 
[bits]"=&r"(bits) -+ : // Inputs -+ [c]"r"(c), -+ [bits_off]"J"(offsetof (CABACContext, by22.bits)), -+ [ptr_off]"J"(offsetof (CABACContext, bytestream)) -+ : // Clobbers -+ "memory" -+ ); -+ c->low = range; -+#if !USE_BY22_DIV -+ c->range = inv; -+#endif -+} -+ -+#define get_cabac_by22_peek get_cabac_by22_peek_arm -+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) -+{ -+ uint32_t rv = c->low &~ 1, tmp; -+ __asm__ ( -+ "cmp %[inv] , #0 \n\t" -+ "it ne \n\t" -+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" -+ : // Outputs -+ [rv]"+r"(rv), -+ [tmp]"=r"(tmp) -+ : // Inputs -+ [inv]"r"(c->range) -+ : // Clobbers -+ "cc" -+ ); -+ return rv << 1; -+} -+ -+#define get_cabac_by22_flush get_cabac_by22_flush_arm -+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) -+{ -+ uint32_t bits, ptr, tmp1, tmp2; -+ __asm__ volatile ( -+ "ldrh %[bits], [%[cc], %[bits_off]] \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "rsb %[tmp1], %[n], #32 \n\t" -+ "add %[bits], %[bits], %[n] \n\t" -+ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" -+ "lsr %[tmp1], %[val], %[tmp1] \n\t" -+ "ldr %[val], [%[cc], %[low_off]] \n\t" -+#if CONFIG_THUMB -+ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t" -+ "ldr %[ptr], [%[ptr]] \n\t" -+#else -+ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" -+#endif -+ "mul %[tmp1], %[tmp2], %[tmp1] \n\t" -+ "and %[tmp2], %[bits], #7 \n\t" -+ "strh %[bits], [%[cc], %[bits_off]] \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[tmp1], %[tmp1], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[val], %[n] \n\t" -+ "sub %[val], %[tmp1] \n\t" -+#else -+ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" -+#endif -+ "lsl %[ptr], %[ptr], %[tmp2] \n\t" -+ "orr %[val], %[val], %[ptr], lsr #9 \n\t" -+ "str %[val], [%[cc], %[low_off]] \n\t" -+ : // Outputs -+ [val]"+r"(val), -+ [bits]"=&r"(bits), -+ [ptr]"=&r"(ptr), -+ [tmp1]"=&r"(tmp1), -+ [tmp2]"=&r"(tmp2) -+ : // Inputs -+ [cc]"r"(c), -+ [n]"r"(n), -+ [bits_off]"J"(offsetof(CABACContext, by22.bits)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)), -+ [range_off]"J"(offsetof(CABACContext, by22.range)), -+ [low_off]"J"(offsetof(CABACContext, low)) -+ : // Clobbers -+ "memory" -+ ); -+} -+ -+#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm -+static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) -+{ -+ uint32_t last_coeff_abs_level_remaining; -+ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; -+ __asm__ volatile ( -+ "ldr %[remain], [%[cc], %[low_off]] \n\t" -+ "ldr %[prefix], [%[cc], %[range_off]] \n\t" -+ "bic %[remain], %[remain], #1 \n\t" -+ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "cmp %[prefix], #0 \n\t" -+ "it ne \n\t" -+ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" -+ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" -+ "lsl %[remain], %[remain], #1 \n\t" -+ "mvn %[prefix], %[remain] \n\t" -+ "clz %[prefix], %[prefix] \n\t" -+ "rsbs %[n1], %[prefix], #2 \n\t" -+ "bcc 1f \n\t" -+ "adc %[n1], %[rice], %[prefix] \n\t" -+ "add %[tmp2], %[tmp2], %[n1] \n\t" -+ "rsb %[n2], %[n1], #32 \n\t" -+ "and %[tmp1], %[tmp2], #7 \n\t" -+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "lsr %[tmp2], %[tmp2], #3 \n\t" -+ "lsr %[n2], %[remain], %[n2] \n\t" -+ "mul %[n2], %[range], %[n2] \n\t" -+ "ldr %[range], [%[cc], %[low_off]] \n\t" -+ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" -+ "rsb %[tmp2], %[rice], #31 \n\t" -+ "lsl %[remain], %[remain], %[prefix] \n\t" 
-+ "lsl %[n2], %[n2], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[range], %[n1] \n\t" -+ "sub %[range], %[n2] \n\t" -+#else -+ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t" -+#endif -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[n2], %[prefix], %[rice] \n\t" -+#if CONFIG_THUMB -+ "lsr %[remain], %[tmp2] \n\t" -+ "add %[remain], %[n2] \n\t" -+#else -+ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" -+#endif -+ "b 3f \n\t" -+ "1: \n\t" -+ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" -+ "cmp %[n2], %[peek_bits_plus_2] \n\t" -+ "bhi 2f \n\t" -+ "sub %[n1], %[n2], #2 \n\t" -+ "add %[tmp2], %[tmp2], %[n1] \n\t" -+ "rsb %[n2], %[n1], #32 \n\t" -+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" -+ "lsr %[tmp1], %[tmp2], #3 \n\t" -+ "lsr %[n2], %[remain], %[n2] \n\t" -+ "mul %[n2], %[range], %[n2] \n\t" -+ "rsb %[range], %[rice], #34 \n\t" -+ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t" -+ "and %[tmp1], %[tmp2], #7 \n\t" -+ "lsl %[remain], %[remain], %[prefix] \n\t" -+ "ldr %[tmp2], [%[cc], %[low_off]] \n\t" -+ "rsb %[prefix], %[prefix], %[range] \n\t" -+ "orr %[remain], %[remain], #0x80000000 \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "lsl %[n2], %[n2], #23 \n\t" -+ "mov %[range], #2 \n\t" -+#if CONFIG_THUMB -+ "lsl %[tmp2], %[n1] \n\t" -+ "sub %[tmp2], %[n2] \n\t" -+#else -+ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t" -+#endif -+ "lsl %[ptr], %[ptr], %[tmp1] \n\t" -+ "lsl %[rice], %[range], %[rice] \n\t" -+ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" -+#if CONFIG_THUMB -+ "lsr %[remain], %[prefix] \n\t" -+ "add %[remain], %[rice] \n\t" -+#else -+ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" -+#endif -+ "b 4f \n\t" -+ "2: \n\t" -+ "add %[n1], %[tmp2], %[prefix] \n\t" -+#if CONFIG_THUMB -+ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t" -+ "ldr %[tmp2], [%[tmp2]] \n\t" -+#else -+ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t" -+#endif -+ "rsb %[tmp1], %[prefix], #32 \n\t" -+ "push {%[rice]} \n\t" -+ "and %[rice], %[n1], #7 \n\t" -+ "lsr %[tmp1], %[remain], %[tmp1] \n\t" -+ "ldr %[ptr], [%[cc], %[low_off]] \n\t" -+ "mul %[remain], %[range], %[tmp1] \n\t" -+ "rev %[tmp2], %[tmp2] \n\t" -+ "rsb %[n2], %[prefix], %[n2] \n\t" -+ "ldr %[tmp1], [%[cc], %[range_off]] \n\t" -+ "lsl %[rice], %[tmp2], %[rice] \n\t" -+ "sub %[tmp2], %[n2], #2 \n\t" -+ "lsl %[remain], %[remain], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[ptr], %[prefix] \n\t" -+ "rsb %[remain], %[ptr] \n\t" -+#else -+ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t" -+#endif -+ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" -+ "add %[prefix], %[n1], %[tmp2] \n\t" -+ "bic %[n1], %[remain], #1 \n\t" -+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" -+ "cmp %[tmp1], #0 \n\t" -+ "rsb %[rice], %[tmp2], #32 \n\t" -+ "it ne \n\t" -+ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" -+ "and %[tmp1], %[prefix], #7 \n\t" -+#if CONFIG_THUMB -+ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t" -+ "ldr %[ptr], [%[ptr]] \n\t" -+#else -+ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t" -+#endif -+ "lsl %[n1], %[n1], #1 \n\t" -+ "lsr %[rice], %[n1], %[rice] \n\t" -+ "rsb %[n2], %[n2], #34 \n\t" -+ "mul %[range], %[range], %[rice] \n\t" -+ "pop {%[rice]} \n\t" -+ "rev %[ptr], %[ptr] \n\t" -+ "orr %[n1], %[n1], #0x80000000 \n\t" -+ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" -+ "mov %[prefix], #2 \n\t" -+ "lsl %[range], %[range], #23 \n\t" -+#if CONFIG_THUMB -+ "lsl %[remain], %[tmp2] \n\t" -+ "rsb %[range], %[remain] \n\t" -+#else -+ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t" -+#endif -+ "lsl %[remain], %[prefix], %[rice] \n\t" -+#if CONFIG_THUMB -+ "lsr %[n1], %[n2] \n\t" -+ 
"add %[remain], %[n1] \n\t" -+#else -+ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" -+#endif -+ "3: \n\t" -+ "lsl %[ptr], %[ptr], %[tmp1] \n\t" -+ "orr %[range], %[range], %[ptr], lsr #9 \n\t" -+ "4: \n\t" -+ "str %[range], [%[cc], %[low_off]] \n\t" -+ : // Outputs -+ [remain]"=&r"(last_coeff_abs_level_remaining), -+ [rice]"+r"(rice_param), -+ [prefix]"=&r"(prefix), -+ [n1]"=&r"(n1), -+ [range]"=&r"(range), -+ [n2]"=&r"(n2), -+ [ptr]"=&r"(ptr), -+ [tmp1]"=&r"(tmp1), -+ [tmp2]"=&r"(tmp2) -+ : // Inputs -+ [cc]"r"(c), -+ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2), -+ [low_off]"J"(offsetof(CABACContext, low)), -+ [range_off]"J"(offsetof(CABACContext, range)), -+ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)), -+ [by22_range_off]"J"(offsetof(CABACContext, by22.range)), -+ [ptr_off]"J"(offsetof(CABACContext, bytestream)) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ return last_coeff_abs_level_remaining; -+} -+ -+#endif /* HAVE_ARMV6T2_INLINE */ -+ -+#endif /* AVCODEC_ARM_HEVC_CABAC_H */ -diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S -new file mode 100644 -index 0000000000..978b7b6947 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S -@@ -0,0 +1,183 @@ -+/* -+ * ARM NEON optimised IDCT functions for HEVC decoding -+ * Copyright (c) 2014 Seppo Tomperi -+ * Copyright (C) 2018 John Cox, ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+@ Included multiple times from hevc_idct_neon.S -+@ Macros defined there -+ -+#define DC_SHIFT (15 - BIT_DEPTH) -+#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) -+#define TRN_SHIFT (20 - BIT_DEPTH) -+ -+function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r1, #DC_ADD -+ asr r1, #DC_SHIFT -+ vdup.16 q0, r1 -+ vdup.16 q1, r1 -+ vst1.16 {q0, q1}, [r0] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+ vst1.16 {q8, q9}, [r0], r3 -+ vst1.16 {q8, q9}, [r2], r3 -+ vst1.16 {q8, q9}, [r0] -+ vst1.16 {q8, q9}, [r2] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ mov ip, #16*16 -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+1: vst1.16 {q8, q9}, [r0], r3 -+ subs ip, ip, #32 -+ vst1.16 {q8, q9}, [r2], r3 -+ bhi 1b -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 -+ ldrsh r1, [r0] -+ add r2, r0, #32 -+ mov r3, #64 -+ add r1, #DC_ADD -+ mov ip, #32*32 -+ asr r1, #DC_SHIFT -+ vdup.16 q8, r1 -+ vdup.16 q9, r1 -+1: vst1.16 {q8, q9}, [r0], r3 -+ subs ip, ip, #32 -+ vst1.16 {q8, q9}, [r2], r3 -+ bhi 1b -+ bx lr -+endfunc -+ -+ -+function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 -+ vldr.i32 s0, =0x00240053 // 36 and 83 -+ vld1.16 {q14, q15}, [r0 :256] // coeffs -+ -+ tr4_shift #7 -+ -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q14, q15 -+ -+ tr4_shift #TRN_SHIFT -+ -+ vst4.16 {q14, q15}, [r0 :256] -+ bx lr -+ -+ .ltorg -+endfunc -+ -+ -+ -+function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 -+ vmov.i32 d0, #0x4a // 74 -+ vld1.16 {q14, q15}, [r0 :256] // coeffs -+ vmov.i32 d1, #0x1d // 29 -+ vmov.i32 d2, #0x37 // 55 -+ -+ tr4_luma_shift #7 -+ -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q14, q15 -+ -+ tr4_luma_shift #TRN_SHIFT -+ -+ vst4.16 {q14, q15}, [r0 :256] -+ bx lr -+endfunc -+ -+function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 -+ add r2, r0, #16 -+ adr r3, tr4f -+ vpush {d8-d15} -+ vld1.16 {d0, d1}, [r3] -+ mov r3, #32 -+ -+ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \ -+ "sub r0, r0, #128-8", \ -+ "sub r2, r2, #128-8", \ -+ "cmp r1, #4" -+ ble 2f -+ -+ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \ -+ "sub r0, r0, #128+8", \ -+ "sub r2, r2, #128+8+16-32", \ -+ "mov r3, #64" -+ -+ vzip.16 d16, d17 -+ vzip.16 d18, d19 -+ -+ vzip.16 d20, d21 -+ vzip.16 d22, d23 -+ vzip.16 d28, d29 -+ vzip.16 d30, d31 -+ vzip.32 q10, q11 -+ vzip.32 q14, q15 -+1: -+ vzip.16 d24, d25 -+ vzip.16 d26, d27 -+ vzip.32 q8, q9 -+ vzip.32 q12, q13 -+ -+ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT -+ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT -+ -+ vpop {d8-d15} -+ bx lr -+ -+2: vmov.i64 q10, #0 -+ sub r0, r0, #8 -+ vmov.i64 q11, #0 -+ sub r2, r2, #8+16-32 -+ vmov.i64 q14, #0 -+ mov r3, #64 -+ vmov.i64 q15, #0 -+ -+ vzip.16 d16, d17 -+ vzip.16 d18, d19 -+ -+ b 1b -+ -+endfunc -+ -+#undef DC_SHIFT -+#undef DC_ADD -+#undef TRN_SHIFT -+ -diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S 
b/libavcodec/arm/rpi_hevc_misc_neon.S -new file mode 100644 -index 0000000000..161bb0d7c9 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_misc_neon.S -@@ -0,0 +1,267 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Written by John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ rpi_zap_coeff_vals_neon( -+@ uint16_t * buf, [r0] -+@ unsigned int log_n_m2) [r1] -+ -+function rpi_zap_coeff_vals_neon, export=1 -+ mov ip, #1 -+ vmov.i64 q0, #0 -+ teq r1, #0 -+ vmov.i64 q1, #0 -+ beq 2f -+ -+ lsl ip, r1 @ 2, 4 or 8 -+ add r2, r0, #32 -+ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero -+ mov r3, #64 -+1: vst1.8 {q0,q1}, [r0:256], r3 -+ subs ip, #2 -+ vst1.8 {q0,q1}, [r2:256], r3 -+ bne 1b -+ bx lr -+ -+2: vst1.8 {q0,q1}, [r0:256] -+ bx lr -+endfunc -+ -+@ PIC jump tables are more expensive than absolute for A32 code -+.set jent_pic, CONFIG_PIC || CONFIG_THUMB -+ -+@ Jump table entry - if in neon mode the bottom bit must be set -+@ ? There is probably a real asm instruction to do this but I haven't found it -+.macro jent lab -+.if jent_pic -+T .short ((0 + \lab) - (0 + 98b)) / 2 -+A .short (0 + \lab) - (4 + 98b) -+.else -+T .word 1 + \lab -+A .word \lab -+.endif -+.endm -+ -+.set expected_next, 0 -+ -+.macro cpy_compound val, p1, p2, drop_thru=0 -+.if \p1 + \p2 != \val -+.error "Bad addition! 
\p1 + \p2 != \val" -+.endif -+.if expected_next != 0 && expected_next != \val -+.error "Drop thru failure" -+.endif -+\val\(): -+ push {r0-r3} -+ bl 100\p1\()b -+ pop {r0-r3} -+ add r0, #\p1 -+ add r2, #\p1 -+.if \drop_thru == 0 -+ b \p2\()b -+.set expected_next, 0 -+.else -+.set expected_next, \p2 -+.endif -+.endm -+ -+@ ff_hevc_cpy_blks8x4_neon( -+@ dst [r0] -+@ dst_stride [r1] -+@ src [r2] -+@ src_stride [r3] -+@ width [sp, #0] (bytes) -+@ height) [sp, #4] -+@ -+@ Power of 2 widths are directly coded, all others are done in stripes -+@ We expect the vast majority of calls to be power of 2 -+@ -+@ Currently has min width of 8, but we could make that 4 without issue -+@ Min height is 4 -+ -+function ff_hevc_rpi_cpy_blks8x4_neon, export=1 -+ ldr r12, [sp, #0] -+ push {r11, lr} -+.if jent_pic -+A adr lr, 98f - 2 -+.else -+A adr lr, 98f - 4 -+.endif -+ lsr r12, #3 -+ ldr r11, [sp, #(8 + 4)] -+.if jent_pic -+A lsl r12, #1 -+A ldrsh lr, [lr, r12] -+A add pc, lr -+T tbh [pc, r12, lsl #1] -+.else -+ @ A32 only, Thumb is always PIC -+ ldr pc, [lr, r12, lsl #2] -+.endif -+ -+98: -+T .short 0 @ unused -+ jent 8f -+ jent 16f -+ jent 24f -+ jent 32f -+ jent 40f -+ jent 48f -+ jent 56f -+ jent 64f -+ jent 72f -+ jent 80f -+ jent 88f -+ jent 96f -+ jent 104f -+ jent 112f -+ jent 120f -+ jent 128f -+ -+1008: -+ push {r11, lr} -+8: -+ add lr, r2, r3 -+ lsl r3, #1 -+ add r12, r0, r1 -+ lsl r1, #1 -+1: -+ vld1.32 {d0 }, [r2], r3 -+ vld1.32 {d1 }, [lr], r3 -+ vld1.32 {d2 }, [r2], r3 -+ vld1.32 {d3 }, [lr], r3 -+ subs r11, #4 -+ vst1.32 {d0 }, [r0], r1 -+ vst1.32 {d1 }, [r12], r1 -+ vst1.32 {d2 }, [r0], r1 -+ vst1.32 {d3 }, [r12], r1 -+ bgt 1b -+ pop {r11, pc} -+ -+10016: -+ push {r11, lr} -+16: -+ add lr, r2, r3 -+ lsl r3, #1 -+ add r12, r0, r1 -+ lsl r1, #1 -+1: -+ vld1.32 {q0 }, [r2], r3 -+ vld1.32 {q1 }, [lr], r3 -+ vld1.32 {q2 }, [r2], r3 -+ vld1.32 {q3 }, [lr], r3 -+ subs r11, #4 -+ vst1.32 {q0 }, [r0], r1 -+ vst1.32 {q1 }, [r12], r1 -+ vst1.32 {q2 }, [r0], r1 -+ vst1.32 {q3 }, [r12], r1 -+ bgt 1b -+ pop {r11, pc} -+ -+10032: -+ push {r11, lr} -+32: -+ add lr, r2, r3 -+ lsl r3, #1 -+ add r12, r0, r1 -+ lsl r1, #1 -+1: -+ vld1.32 {q8, q9 }, [r2], r3 -+ vld1.32 {q10, q11}, [lr], r3 -+ vld1.32 {q12, q13}, [r2], r3 -+ vld1.32 {q14, q15}, [lr], r3 -+ subs r11, #4 -+ vst1.32 {q8, q9 }, [r0], r1 -+ vst1.32 {q10, q11}, [r12], r1 -+ vst1.32 {q12, q13}, [r0], r1 -+ vst1.32 {q14, q15}, [r12], r1 -+ bgt 1b -+ pop {r11, pc} -+ -+10064: -+ push {r11, lr} -+64: -+ add lr, r2, #32 -+ add r12, r0, #32 -+1: -+ vld1.32 {q8, q9 }, [r2], r3 -+ vld1.32 {q10, q11}, [lr], r3 -+ vld1.32 {q12, q13}, [r2], r3 -+ vld1.32 {q14, q15}, [lr], r3 -+ subs r11, #2 -+ vst1.32 {q8, q9 }, [r0], r1 -+ vst1.32 {q10, q11}, [r12], r1 -+ vst1.32 {q12, q13}, [r0], r1 -+ vst1.32 {q14, q15}, [r12], r1 -+ bgt 1b -+ pop {r11, pc} -+ -+128: -+ push {r4, r5} -+ @ We could do this with fewer registers if we jump around but I -+ @ have a primative urge to load sequentially -+ mov r4, #64 -+ add lr, r2, #32 -+ add r12, r0, #32 -+ sub r3, r4 -+ sub r1, r4 -+1: -+ vld1.32 {q8, q9 }, [r2], r4 -+ vld1.32 {q10, q11}, [lr], r4 -+ vld1.32 {q12, q13}, [r2], r3 -+ vld1.32 {q14, q15}, [lr], r3 -+ subs r11, #1 -+ vst1.32 {q8, q9 }, [r0], r4 -+ vst1.32 {q10, q11}, [r12], r4 -+ vst1.32 {q12, q13}, [r0], r1 -+ vst1.32 {q14, q15}, [r12], r1 -+ bgt 1b -+ pop {r4, r5, r11, pc} -+ -+@ Use drop_thru where we can -+cpy_compound 104, 64, 40, 1 -+cpy_compound 40, 32, 8 -+ -+cpy_compound 112, 64, 48, 1 -+cpy_compound 48, 32, 16 -+ -+cpy_compound 120, 64, 56, 1 -+cpy_compound 
56, 32, 24, 1 -+cpy_compound 24, 16, 8 -+ -+cpy_compound 72, 64, 8 -+cpy_compound 80, 64, 16 -+cpy_compound 88, 64, 24 -+cpy_compound 96, 64, 32 -+ -+ -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h -new file mode 100644 -index 0000000000..9d21f6a882 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_misc_neon.h -@@ -0,0 +1,438 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H -+#define AVCODEC_ARM_RPI_HEVC_MISC_H -+ -+#include "config.h" -+#if HAVE_NEON_INLINE && !CONFIG_THUMB -+ -+static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_src) -+{ -+ const uint8_t *src2 = src + stride_src; -+ stride_src <<= 1; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {q0}, [%[dst]]! \n\t" -+ "beq 3f \n\t" -+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {q1}, [%[dst]]! \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.32 {q0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.32 {q1}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #4 \n\t" -+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.16 d0, d1 \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d0}, [%[dst]]! 
\n\t" -+ "beq 3f \n\t" -+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.16 d2, d3 \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d2}, [%[dst]]! \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vzip.16 d0, d1 \n\t" -+ "vst1.16 {d0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vzip.16 d2, d3 \n\t" -+ "vst1.16 {d2}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" -+ "subs %[height], #8 \n\t" -+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.8 d0, d1 \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d0}, [%[dst]]! \n\t" -+ "beq 3f \n\t" -+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" -+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" -+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" -+ "vzip.8 d2, d3 \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d2}, [%[dst]]! \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vzip.8 d0, d1 \n\t" -+ "vst1.8 {d0}, [%[dst]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vzip.8 d2, d3 \n\t" -+ "vst1.8 {d2}, [%[dst]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [src]"+r"(src), -+ [src2]"+r"(src2), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst) -+{ -+ uint8_t *dst2 = dst + stride_dst; -+ stride_dst <<= 1; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "subs %[height], #4 \n\t" -+ "vld1.32 {q0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.32 {q1}, [%[src]]! \n\t" -+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.32 {q0}, [%[src]]! 
\n\t" -+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d1[0]}, [%[dst]] \n\t" -+ "vst1.32 {d1[1]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.32 {d3[0]}, [%[dst]] \n\t" -+ "vst1.32 {d3[1]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "subs %[height], #4 \n\t" -+ "vld1.16 {d0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.16 {d2}, [%[src]]! \n\t" -+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.16 {d0}, [%[src]]! \n\t" -+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #4 \n\t" -+ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d0[2]}, [%[dst]] \n\t" -+ "vst1.16 {d0[3]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.16 {d2[2]}, [%[dst]] \n\t" -+ "vst1.16 {d2[3]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "subs %[height], #8 \n\t" -+ "vld1.8 {d0}, [%[src]]! \n\t" -+ "beq 2f \n\t" -+ "1: \n\t" -+ "vld1.8 {d2}, [%[src]]! \n\t" -+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t" -+ "beq 3f \n\t" -+ "vld1.8 {d0}, [%[src]]! 
\n\t" -+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t" -+ "subs %[height], #8 \n\t" -+ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "2: \n\t" -+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d0[6]}, [%[dst]] \n\t" -+ "vst1.8 {d0[7]}, [%[dst2]] \n\t" -+ "b 4f \n\t" -+ "3: \n\t" -+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" -+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" -+ "vst1.8 {d2[6]}, [%[dst]] \n\t" -+ "vst1.8 {d2[7]}, [%[dst2]] \n\t" -+ "4: \n\t" -+ : // Outputs -+ [dst]"+r"(dst), -+ [dst2]"+r"(dst2), -+ [src]"+r"(src), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ int x, y; -+ switch (pixel_shift) -+ { -+ case 2: -+ __asm__ volatile ( -+ "ldr %[x], [%[src]], %[stride_src] \n\t" -+ "ldr %[y], [%[src]], %[stride_src] \n\t" -+ "str %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldr %[x], [%[src]], %[stride_src] \n\t" -+ "str %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldr %[y], [%[src]], %[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "str %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "str %[y], [%[dst]] \n\t" -+ : // Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ case 1: -+ __asm__ volatile ( -+ "ldrh %[x], [%[src]], %[stride_src] \n\t" -+ "ldrh %[y], [%[src]], %[stride_src] \n\t" -+ "strh %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldrh %[x], [%[src]], %[stride_src] \n\t" -+ "strh %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldrh %[y], [%[src]], %[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "strh %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "strh %[y], [%[dst]] \n\t" -+ : // Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ default: -+ __asm__ volatile ( -+ "ldrb %[x], [%[src]], %[stride_src] \n\t" -+ "ldrb %[y], [%[src]], %[stride_src] \n\t" -+ "strb %[x], [%[dst]], %[stride_dst] \n\t" -+ "sub %[height], #2 \n\t" -+ "1: \n\t" -+ "ldrb %[x], [%[src]], %[stride_src] \n\t" -+ "strb %[y], [%[dst]], %[stride_dst] \n\t" -+ "ldrb %[y], [%[src]], %[stride_src] \n\t" -+ "subs %[height], #2 \n\t" -+ "strb %[x], [%[dst]], %[stride_dst] \n\t" -+ "bne 1b \n\t" -+ "strb %[y], [%[dst]] \n\t" -+ : 
// Outputs -+ [x]"=&r"(x), -+ [y]"=&r"(y), -+ [src]"+r"(src), -+ [dst]"+r"(dst), -+ [height]"+r"(height) -+ : // Inputs -+ [stride_src]"r"(stride_src), -+ [stride_dst]"r"(stride_dst) -+ : // Clobbers -+ "cc", "memory" -+ ); -+ break; -+ } -+} -+ -+#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon -+static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ if (stride_dst == 1 << pixel_shift) -+ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src); -+ else if (stride_src == 1 << pixel_shift) -+ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); -+ else -+ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src); -+} -+ -+#endif /* HAVE_NEON_INLINE */ -+ -+#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */ -diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h -new file mode 100644 -index 0000000000..325c26a49b ---- /dev/null -+++ b/libavcodec/arm/rpi_hevc_mv_arm.h -@@ -0,0 +1,93 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Written by John Cox, Ben Avison -+*/ -+ -+#ifndef AVCODEC_ARM_RPI_HEVC_MV_H -+#define AVCODEC_ARM_RPI_HEVC_MV_H -+ -+#if HAVE_ARMV6T2_INLINE -+static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) -+{ -+ MvXY r; -+ __asm__ ( -+ "sadd16 %[r], %[a], %[b] \n\t" -+ : [r]"=r"(r) -+ : [a]"r"(a), -+ [b]"r"(b) -+ : -+ ); -+ return r; -+} -+#define mvxy_add mvxy_add_arm -+#endif -+ -+#if HAVE_ARMV6T2_INLINE -+#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV)) -+static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) -+{ -+ int t; -+ __asm__ ( -+ "ssat %[td], #8, %[td] \n\t" -+ "ssat %[tb], #8, %[tb] \n\t" -+ "eor %[t], %[td], %[td], asr #31 \n\t" -+ "adds %[t], %[t], %[td], lsr #31 \n\t" -+ "asr %[t], #1 \n\t" -+ "add %[t], #0x4000 \n\t" -+ "it ne \n\t" -+ "sdivne %[t], %[t], %[td] \n\t" -+ "mov %[td], #32 \n\t" -+ "smlabb %[td], %[t], %[tb], %[td] \n\t" -+ "ssat %[td], #13, %[td], asr #6 \n\t" -+ "mov %[tb], #127 \n\t" -+ "smlatb %[t], %[xy], %[td], %[tb] \n\t" -+ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" -+// This takes the sign of x & y for rounding at the "wrong" point -+// (i.e. after adding 127) but for the range of values (-1,-127) -+// where it does the wrong thing you get the right answer (0) anyway -+ "add %[t], %[t], %[t], lsr #31 \n\t" -+ "add %[xy], %[tb], %[tb], lsr #31 \n\t" -+ "ssat %[t], #16, %[t], asr #8 \n\t" -+ "ssat %[xy], #16, %[xy], asr #8 \n\t" -+ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" -+ : -+ [t]"=&r"(t), -+ [xy]"+r"(xy), -+ [td]"+r"(td), -+ [tb]"+r"(tb) -+ : -+ : -+ "cc" -+ ); -+ return xy; -+} -+#define mv_scale_xy mv_scale_xy_arm -+#endif -+#endif -+ -+#endif // AVCODEC_ARM_RPI_HEVC_MV_H -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h -new file mode 100644 -index 0000000000..62b9326532 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_arm.h -@@ -0,0 +1,26 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H -+#define AVCODEC_ARM_HEVCDSP_ARM_H -+ -+#include "libavcodec/rpi_hevcdsp.h" -+ -+void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth); -+ -+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ -diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -new file mode 100644 -index 0000000000..88a3b4e5e7 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -@@ -0,0 +1,1634 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. 
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1 -+ */ -+ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8 -+ vsubl.u8 q0, \Q0a, \P0a -+ vsubl.u8 q1, \P1a, \Q1a -+ vdup.16 d4, r2 -+ \I1 -+ vshl.i16 q0, #2 -+ \I2 -+ vadd.i16 q0, q1 -+ \I3 -+ vmovl.u8 q2, d4 -+ \I4 -+ vneg.s16 q1, q2 -+ \I5 -+ vrshr.s16 q0, #3 -+ \I6 -+ \I7 -+ \I8 -+ vmin.s16 q0, q2 -+ vmovl.u8 q2, \Q0a -+ vmax.s16 q0, q1 -+ vaddw.u8 q1, q0, \P0a -+ vsub.i16 q0, q2, q0 -+ vqmovun.s16 \P0a, q1 -+ vqmovun.s16 \Q0a, q0 -+.endm -+ -+ -+.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7 -+ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a -+ lsr r12, r2, #16 -+ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b -+ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a -+ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b -+ vshl.i16 q0, #2 @ (q0a - p0a) * 4 -+ vshl.i16 q1, #2 @ (q0b - p0b) * 4 -+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a -+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b -+ vdup.16 d4, r2 @ tc0a, tc0b -+ vdup.16 d6, r12 @ tc1a, tc1b -+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 -+ \I1 -+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 -+ \I2 -+ vmovl.u8 q2, d4 @ tc0a, tc0b -+ \I3 -+ vmovl.u8 q3, d6 @ tc1a, tc1b -+ \I4 -+ vmin.s16 q0, q2 -+ \I5 -+ vneg.s16 q2, q2 @ -tc0a, -tc0b -+ \I6 -+ vmin.s16 q1, q3 -+ \I7 -+ vneg.s16 q3, q3 @ -tc1a, -tc1b -+ vmax.s16 q0, q2 @ delta0a -+ vmovl.u8 q2, \Q0a -+ vmax.s16 q1, q3 @ delta0b -+ vaddw.u8 q3, q0, \P0a @ p0a + delta0a -+ vsub.i16 q0, q2, q0 @ q0a - delta0a -+ vmovl.u8 q2, \Q0b -+ vsub.i16 q2, q1 @ q0b - delta0b -+ vaddw.u8 q1, \P0b @ p0b + delta0b -+ vqmovun.s16 \Q0a, q0 -+ vqmovun.s16 \P0a, q3 -+ vqmovun.s16 \Q0b, q2 -+ vqmovun.s16 \P0b, q1 -+.endm -+ -+ -+@ Preserves r12 -+@ Clobbers r2 -+@ P0a et al all contain UVUVUVUV -+@ r2 (tc4) contains -+@ [0..7] tc U a -+@ [8..15] tc V a -+ -+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8 -+ vsub.i16 q0, \Q0a, \P0a -+ vsub.i16 q1, \P1a, \Q1a -+ vdup.16 d4, r2 -+ \I1 -+ vshl.i16 q0, #2 -+ \I2 -+ vadd.i16 q0, q1 -+ \I3 -+ vshll.u8 q2, d4, #\bit_depth - 8 -+ \I4 -+ vneg.s16 q1, q2 -+ \I5 -+ vrshr.s16 q0, #3 -+ \I6 -+ \I7 -+ \I8 -+ vmin.s16 q0, q2 -+ vmov.i16 q2, #0 -+ vmax.s16 q0, q1 -+ vadd.i16 \P0a, q0 -+ vsub.i16 \Q0a, q0 -+ vmov.i16 q1, #(1 << \bit_depth) - 1 -+ vmax.s16 \P0a, q2 -+ vmax.s16 \Q0a, q2 -+ vmin.s16 \P0a, q1 -+ vmin.s16 \Q0a, q1 -+.endm -+ -+@ Clobbers r2, r12 -+@ P0a et al all contain UVUVUVUV -+@ r2 (tc4) contains -+@ [0..7] tc U a -+@ [8..15] tc V a -+@ [16..23] tc U b -+@ [24..31] tc V b -+ -+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7 -+ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a -+ lsr r12, r2, #16 
-+ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b -+ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a -+ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b -+ vshl.i16 q0, #2 @ (q0a - p0a) * 4 -+ vshl.i16 q1, #2 @ (q0b - p0b) * 4 -+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a -+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b -+ vdup.16 d4, r2 @ tc0a, tc0b -+ vdup.16 d6, r12 @ tc1a, tc1b -+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 -+ \I1 -+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 -+ \I2 -+ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b -+ \I3 -+ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b -+ \I4 -+ vmin.s16 q0, q2 -+ \I5 -+ vneg.s16 q2, q2 @ -tc0a, -tc0b -+ \I6 -+ vmin.s16 q1, q3 -+ \I7 -+ vneg.s16 q3, q3 @ -tc1a, -tc1b -+ vmax.s16 q0, q2 @ delta0a -+ vadd.i16 \P0a, q0 @ p0a + delta0a -+ vsub.i16 \Q0a, q0 @ q0a - delta0a -+ vmax.s16 q1, q3 @ delta0b -+ vadd.i16 \P0b, q1 @ p0b + delta0b -+ vsub.i16 \Q0b, q1 @ q0b - delta0b -+ vmov.i16 q2, #0 -+ vmov.i16 q3, #(1 << \bit_depth) - 1 -+ vmax.s16 \P0a, q2 -+ vmax.s16 \Q0a, q2 -+ vmax.s16 \P0b, q2 -+ vmax.s16 \Q0b, q2 -+ vmin.s16 \P0a, q3 -+ vmin.s16 \Q0a, q3 -+ vmin.s16 \P0b, q3 -+ vmin.s16 \Q0b, q3 -+.endm -+ -+ -+ -+@ uint8_t *_no_p, [sp+0] -+@ uint8_t *_no_q) [sp+4] -+ -+.macro hevc_loop_filter_luma_start -+ ldr r12, [r3] -+ ldr r3, [r3, #4] -+ orrs r3, r12, r3, lsl #16 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldrd r4, r5, [sp, #32] @ &_no_p -+ ldrb r4, [r4] -+ ldrb r5, [r5] -+ movs r10, r4 -+ it ne -+ movne r10, #1 -+ cmp r5, #0 -+ it ne -+ orrne r10, #2 -+.endm -+ -+@ Input: -+@ r2 beta (raw: needs shift for bitdepth > 8) -+@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) -+@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) -+@ -+@ Input & output -+@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3) -+@ 16-bit: q8-q15 -+@ -+@ r1 -r1 -+@ r10 b1->C, b0->N (r10 junk) -+@ -+@ Junks: -+@ r5, r6, r7, r8, r9 -+ -+.macro m_filter_luma bit_depth, Q11, Q15 -+.if \bit_depth == 8 -+ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 -+ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1 -+ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 -+ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 -+ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 -+ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 -+.endif -+ vadd.i16 q0, q9, \Q11 @ P2 + P0 -+.if \bit_depth > 8 -+ lsl r3, r3, #(\bit_depth - 8) -+.endif -+ vadd.i16 q1, q14, q12 @ Q2 + Q0 -+.if \bit_depth > 8 -+ lsl r2, r2, #(\bit_depth - 8) -+.endif -+ vsub.i16 q0, q10 @ P2 - P1 + P0 -+ lsr r5, r3, #16 -+ vsub.i16 q1, q13 @ Q2 - Q1 + Q0 -+.if \bit_depth == 8 -+ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3 -+ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3 -+.endif -+ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) -+ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) -+ vmov.i64 q2, #0xffffffff0000 -+ vbic q0, q2 @ only dp0(') and dp3(') -+ vbic q1, q2 @ only dq0(') and dq3(') -+ vsra.u64 q0, #16 -+ vsra.u64 q1, #16 -+ vdup.16 q3, r2 @ beta -+ vdup.16 d14, r3 @ tC[0] -+ vdup.16 d15, r5 @ tC[1] -+ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0) -+ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 -+ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 -+ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 -+ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) -+ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... 
Q3-Q0) -+ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 -+ vshl.s16 q6, q7, #2 @ tC[] * 4 -+ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 -+ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) -+ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) -+ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 -+ cmp r7, #0 -+ beq .Lbypasswrite -+ -+ vcgt.s16 q5, q6, q5 @ if < tc25 -+ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) -+ vand q4, q5 -+ vbic d8, d4 -+ vbic d9, d4 -+ vshr.s16 q3, #2 @ beta_2 = beta >> 2 -+ vsra.u64 q4, #16 -+ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 -+ vshl.i16 q7, #1 @ tc2 = tC[] << 1 -+ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc -+ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half -+ vand d6, d8 @ && beta_2 tests, prime in ms half -+ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 -+ vneg.s16 q6, q7 @ -tc2 -+ vmovn.i32 d8, q3 -+ vshrn.i32 d6, q3, #16 -+ vand d6, d8 -+ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 -+ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) -+ vadd.i16 q0, \Q11, q12 @ p0 + q0 -+ ands r9, r7, r8 -+ beq 1f -+ -+ vadd.i16 q2, q0, q10 @ p1 + p0 + q0 -+ vadd.i16 q3, q0, q13 @ p0 + q0 + q1 -+ lsr r3, r9, #16 -+ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) -+ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) -+ vadd.i16 q0, q8, q9 @ p3 + p2 -+ vadd.i16 q5, \Q15, q14 @ q2 + q3 -+ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 -+ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 -+ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 -+ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 -+ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping) -+ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) -+ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) -+ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) -+ vrshr.s16 q0, #3 @ scale, with rounding -+ vrshr.s16 q5, #3 -+ vrshr.s16 q1, #2 -+ vrshr.s16 q4, #2 -+ vrshr.s16 q2, #3 -+ vrshr.s16 q3, #3 -+ vsub.i16 q0, q9 @ find difference -+ vsub.i16 q5, q14 -+ vsub.i16 q1, q10 -+ vsub.i16 q4, q13 -+ vsub.i16 q2, \Q11 -+ vsub.i16 q3, q12 -+ vmax.s16 q0, q6 @ clip difference to -tc2 .. 
tc2 -+ vmax.s16 q5, q6 -+ vmax.s16 q1, q6 -+ vmax.s16 q4, q6 -+ vmax.s16 q2, q6 -+ vmax.s16 q3, q6 -+ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure -+ vdup.16 d13, r3 -+ vmin.s16 q0, q7 -+ vmin.s16 q5, q7 -+ vmin.s16 q1, q7 -+ vmin.s16 q4, q7 -+ vmin.s16 q2, q7 -+ vmin.s16 q3, q7 -+ vadd.i16 q0, q9 @ apply difference -+ vadd.i16 q5, q14 -+ vadd.i16 q1, q10 -+ vadd.i16 q4, q13 -+ vadd.i16 q2, \Q11 -+ vadd.i16 q3, q12 -+ vbit q9, q0, q6 @ apply filtered values according to mask -+ vbit q14, q5, q6 -+ vbit q10, q1, q6 -+ vbit q13, q4, q6 -+ vbit \Q11, q2, q6 -+ vbit q12, q3, q6 -+ vneg.s16 q6, q7 @ restore -tc2 -+ -+1: -+ bics r9, r7, r8 -+ beq 2f -+ -+ vsub.i16 q0, q12, \Q11 @ q0 - p0 -+ vsub.i16 q1, q13, q10 @ q1 - p1 -+ lsr r3, r9, #16 -+ vshl.i16 q2, q0, #3 -+ lsr r7, r5, #16 -+ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) -+ lsr r8, r6, #16 -+ vshl.i16 q2, q1, #1 -+ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) -+ vshr.s16 q6, #1 @ -tc = -tc2 >> 1 -+ vsub.i16 q5, q3, q4 -+ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 -+ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 -+ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 -+ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 -+ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 -+ vmax.s16 q6, q5 @ -+ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 -+ vdup.16 q0, r2 @ beta -+ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] -+ vshr.s16 q4, #1 @ tc_2 = tc >> 1 -+ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1 -+ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 -+ vshr.s16 q2, q0, #1 @ beta >> 1 -+ vadd.i16 q2, q0 @ beta + (beta >> 1) -+ vneg.s16 q0, q4 @ -tc_2 -+ vabs.s16 q5, q5 @ abs(original delta0) -+ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 -+ vmax.s16 q1, q0 -+ vmax.s16 q3, q0 -+ vshl.s16 q0, q7, #2 @ 8 * tc -+ vadd.i16 q7, q0 @ 10 * tc -+ vdup.16 d0, r9 -+ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering -+ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2) -+ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 + delta0) >> 1, -tc_2, tc_2) -+ vdup.16 d8, r5 @ dp0 + dp3 -+ vdup.16 d9, r7 @ dp0' + dp3' -+ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) -+ vdup.16 d10, r6 @ dq0 + dq3 -+ vdup.16 d11, r8 @ dq0' + dq3' -+ vand q7, q0 @ AND block and line masks -+ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) -+ vadd.i16 q0, q1, q10 @ p1 + deltap1 -+ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. 
if (nd_q > 1) -+ vadd.i16 q3, q3, q13 @ q1 + deltaq1 -+ vadd.i16 q1, \Q11, q6 @ p0 + delta0 -+ vsub.i16 q2, q12, q6 @ q0 - delta0 -+ vand q4, q7 @ AND nd_p test with block/line masks -+ vand q5, q7 @ AND nd_q test with block/line masks -+ vbit q10, q0, q4 -+ vbit \Q11, q1, q7 -+ vbit q12, q2, q7 -+ vbit q13, q3, q5 -+ -+2: -+.if \bit_depth == 8 -+ vmovn.i16 d16, q8 -+ vmovn.i16 d23, \Q15 -+ neg r1, r1 -+ vqmovun.s16 d17, q9 -+ vqmovun.s16 d18, q10 -+ vqmovun.s16 d19, \Q11 -+ lsls r10, #31 -+ vqmovun.s16 d20, q12 -+ vqmovun.s16 d21, q13 -+ vqmovun.s16 d22, q14 -+.else -+ vmov.i16 q0, #0 -+ vmov.i16 q1, #(1 << \bit_depth - 1) -+ @ q8 & q15 should be unaltered and so don't require clipping -+ neg r1, r1 -+ vmax.s16 q9, q0 -+ vmax.s16 q10, q0 -+ vmax.s16 q11, q0 -+ vmax.s16 q12, q0 -+ vmax.s16 q13, q0 -+ vmax.s16 q14, q0 -+ lsls r10, #31 -+ vmin.s16 q9, q1 -+ vmin.s16 q10, q1 -+ vmin.s16 q11, q1 -+ vmin.s16 q12, q1 -+ vmin.s16 q13, q1 -+ vmin.s16 q14, q1 -+.endif -+ bx lr -+.endm -+ -+function hevc_loop_filter_luma_body -+ m_filter_luma 8, q15, q11 -+endfunc -+ -+@ void ff_hevc_rpi_v_loop_filter_luma_neon_8( -+@ uint8_t *_pix, [r0] -+@ ptrdiff_t _stride, [r1] -+@ int _beta, [r2] -+@ int *_tc, [r3] -+@ uint8_t *_no_p, [sp+0] -+@ uint8_t *_no_q) [sp+4] -+ -+function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1 -+ hevc_loop_filter_luma_start -+ -+ sub r4, r0, #4 -+ b .Lv_loop_luma_common -+endfunc -+ -+@ void ff_hevc_rpi_v_loop_filter2_luma_neon( -+@ uint8_t * pix_r, [r0] -+@ ptrdiff_t _stride, [r1] -+@ int _beta, [r2] -+@ int tc2, [r3] -+@ int no_f, [sp+0] -+@ uint8_t * pix_l) [sp+4] -+ -+function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r4, [sp, #36] -+ ldr r10, [sp, #32] -+ -+.Lv_loop_luma_common: -+ vpush {d8-d15} -+ -+ @ It's slightly faster to do unlaned loads and transpose in the -+ @ 8-bit case, even though it needs more instructions, because -+ @ VLD4.8 is a really slow way to read from memory. 
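-+ @ Each vld1.32 below picks up 4 pixels from one row, alternating
-+ @ between the left-of-edge pointer (r4) and the right-of-edge
-+ @ pointer (r0); the vuzp.16/vuzp.8/vswp sequence then transposes
-+ @ them so that d16-d23 hold the p3..p0,q0..q3 columns that
-+ @ hevc_loop_filter_luma_body expects.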
-+ vld1.32 {d16[0]}, [r4:32], r1 -+ vld1.32 {d20[0]}, [r0:32], r1 -+ vld1.32 {d16[1]}, [r4:32], r1 -+ vld1.32 {d20[1]}, [r0:32], r1 -+ vld1.32 {d17[0]}, [r4:32], r1 -+ vld1.32 {d21[0]}, [r0:32], r1 -+ vld1.32 {d17[1]}, [r4:32], r1 -+ vld1.32 {d21[1]}, [r0:32], r1 -+ vld1.32 {d18[0]}, [r4:32], r1 -+ vld1.32 {d22[0]}, [r0:32], r1 -+ vld1.32 {d18[1]}, [r4:32], r1 -+ vld1.32 {d22[1]}, [r0:32], r1 -+ vld1.32 {d19[0]}, [r4:32], r1 -+ vld1.32 {d23[0]}, [r0:32], r1 -+ vld1.32 {d19[1]}, [r4:32] -+ vld1.32 {d23[1]}, [r0:32] -+ vuzp.16 q8, q9 -+ vuzp.16 q10, q11 -+ vuzp.8 q8, q9 -+ vuzp.8 q10, q11 -+ vswp d17, d18 -+ vswp d21, d22 -+ -+ bl hevc_loop_filter_luma_body -+ -+ add r6, r4, r1 -+ add r2, r0, r1 -+ lsl r1, #1 -+ -+ vpop {d8-d15} -+ -+ @ no_p[1] -+ bmi 1f -+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 -+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 -+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 -+ -+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 -+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] -+1: -+ @ no_q[1] -+ bcs 1f -+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 -+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 -+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 -+ -+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 -+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] -+1: -+ pop {r4-r10,pc} -+ -+.Lbypasswrite: -+ vpop {d8-d15} -+ pop {r4-r10,pc} -+endfunc -+ -+.macro m_filter_v_luma_16 bit_depth -+ vpush {d8-d15} -+ -+ @ Uses slightly fewer instructions to do laned loads than unlaned -+ @ and transpose. 
This also means that we can use the same code for -+ @ both split & unsplit deblock -+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 -+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 -+ -+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 -+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 -+ -+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 -+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 -+ -+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 -+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 -+ -+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 -+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 -+ -+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 -+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 -+ -+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 -+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 -+ -+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] -+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] -+ -+ bl hevc_loop_filter_luma_body_\bit_depth -+ -+ add r6, r4, r1 -+ add r2, r0, r1 -+ lsl r1, #1 -+ -+ vpop {d8-d15} -+ -+ @ p[1] -+ bmi 1f -+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 -+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1 -+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 -+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1 -+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 -+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1 -+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 -+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6] -+1: -+ @ q[1] -+ bcs 1f -+ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 -+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1 -+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 -+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1 -+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 -+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 -+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 -+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] -+1: -+ pop {r4-r10,pc} -+.endm -+ -+ -+ -+ -+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] -+@ ptrdiff_t stride, [r1] -+@ int beta, [r2] -+@ int32_t *tc, [r3] -+@ uint8_t *no_p, sp[0] -+@ uint8_t *no_q); sp[4] -+@ -+@ Src should always be on 8 byte boundry & all in the same slice -+ -+function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1 -+ hevc_loop_filter_luma_start -+ b .Lh_loop_filter_luma_common_8 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r10, [sp, #32] -+ -+.Lh_loop_filter_luma_common_8: -+ sub r4, r0, r1, lsl #2 -+ add r0, r4, r1 -+ lsl r1, #1 -+ vpush {d8-d15} -+ -+ vld1.8 {d16}, [r4], r1 -+ vld1.8 {d17}, [r0], r1 -+ vld1.8 {d18}, [r4], r1 -+ vld1.8 {d19}, [r0], r1 -+ vld1.8 {d20}, [r4], r1 -+ vld1.8 {d21}, [r0], r1 -+ vld1.8 {d22}, [r4] -+ vld1.8 {d23}, [r0] -+ -+ bl hevc_loop_filter_luma_body -+ -+ add r0, r0, r1, lsl #1 -+ add r2, r4, r1, lsl #1 -+ add r6, r4, r1, asr #1 -+ vpop {d8-d15} -+ -+ @ P2-P0 -+ bcs 1f -+ vst1.8 {d22}, [r4], r1 -+ vst1.8 {d21}, [r6] -+ vst1.8 {d20}, [r4] -+1: -+ @ Q0-Q2 -+ bmi 1f -+ vst1.8 {d19}, [r0], r1 -+ vst1.8 {d18}, [r2] -+ vst1.8 {d17}, [r0] -+1: -+ pop {r4-r10,pc} -+endfunc -+ -+ -+.macro m_filter_h_luma_16 bit_depth -+ sub r4, r0, r1, lsl #2 -+ add r0, r4, r1 -+ lsl r1, #1 -+ vpush {d8-d15} -+ -+ vld1.16 { q8}, [r4], r1 -+ vld1.16 { q9}, [r0], r1 -+ vld1.16 {q10}, [r4], r1 -+ vld1.16 {q11}, [r0], r1 -+ vld1.16 {q12}, [r4], r1 -+ vld1.16 {q13}, [r0], r1 -+ vld1.16 {q14}, [r4] -+ vld1.16 
{q15}, [r0] -+ -+ bl hevc_loop_filter_luma_body_\bit_depth -+ -+ add r0, r0, r1, lsl #1 -+ add r2, r4, r1, lsl #1 -+ add r6, r4, r1, asr #1 -+ vpop {d8-d15} -+ -+ @ P2-P0 -+ bcs 1f -+ vst1.16 {q14}, [r4], r1 -+ vst1.16 {q13}, [r6] -+ vst1.16 {q12}, [r4] -+1: -+ bmi 1f -+ vst1.16 {q11}, [r0], r1 -+ vst1.16 {q10}, [r2] -+ vst1.16 { q9}, [r0] -+1: -+ pop {r4-r10,pc} -+.endm -+ -+ -+@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ unsigned int no_f); // r3 -+@ -+@ no_f -+@ 0 tl P0 -+@ 1 tr P1 -+@ 2 bl Q0 -+@ 3 br Q1 -+@ -+@ Probably not worth having the P/Qa only special case in this direction -+@ Given layout we won't save any memory reads or avoid any cache dirtying -+@ We would save a bit of computation but I expect the partials to be less -+@ common in the H direction than V due to how we arrange deblock. -+ -+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 -+ sub r12, r0, r1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ vld1.8 {d26,d27}, [r0] -+ lsl r1, #1 -+ sub r0, r1 -+ vld1.8 {d18,d19}, [r12], r1 -+ vld1.8 {d16,d17}, [r0], r1 -+ vld1.8 {d28,d29}, [r12] -+ -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ -+ "sub r12, r0, r1, asr #1" -+ -+ lsls r3, #29 @ b2 -> N, b3 -> C -+ it pl -+ vstrpl d26, [r0, #0] -+ it cc -+ vstrcc d27, [r0, #8] -+ lsls r3, #2 @ b0 -> N, b1 -> C -+ it pl -+ vstrpl d18, [r12, #0] -+ it cc -+ vstrcc d19, [r12, #8] -+ bx lr -+ -+endfunc -+ -+ -+@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ unsigned int no_f); // r3 -+@ -+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] -+@ -+@ Macro here actual function near bottom -+ -+.macro m_filter_h_uv_16 bit_depth -+ sub r12, r0, r1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ vld1.16 {q12, q13}, [r0] -+ lsl r1, #1 -+ sub r0, r1 -+ vld1.16 {q10, q11}, [r12], r1 -+ vld1.16 {q8, q9 }, [r0], r1 -+ vld1.16 {q14, q15}, [r12] -+ -+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ -+ "sub r12, r0, r1, asr #1", \ -+ "cmp r3, #0" -+ -+ bne 1f -+ vst1.16 {q10, q11}, [r12] -+ vst1.16 {q12, q13}, [r0] -+ bx lr -+ -+ @ At least one no_f bit is set -+ @ Which means we need to break this apart in an ugly fashion -+1: -+ lsls r3, #29 @ b2 -> N, b3 -> C -+ itt pl -+ vstrpl d24, [r0, #0] -+ vstrpl d25, [r0, #8] -+ itt cc -+ vstrcc d26, [r0, #16] -+ vstrcc d27, [r0, #24] -+ lsls r3, #2 @ b0 -> N, b1 -> C -+ itt pl -+ vstrpl d20, [r12, #0] -+ vstrpl d21, [r12, #8] -+ itt cc -+ vstrcc d22, [r12, #16] -+ vstrcc d23, [r12, #24] -+ bx lr -+.endm -+ -+ -+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ uint8_t * src_l, // r3 -+@ unsigned int no_f); // sp[0] -+@ -+@ no_f: -+@ 0 tl P0 -+@ 1 tr Q0 -+@ 2 bl P1 -+@ 3 br Q1 -+ -+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ push {lr} -+ vld2.16 {d16[0], d18[0]}, [r3], r1 -+ vld2.16 {d20[0], d22[0]}, [r0], r1 -+ -+ cmp r2, #0x10000 -+ vld2.16 {d16[1], d18[1]}, [r3], r1 -+ vld2.16 {d20[1], d22[1]}, [r0], r1 -+ -+ vld2.16 {d16[2], d18[2]}, [r3], r1 -+ vld2.16 {d20[2], d22[2]}, [r0], r1 -+ -+ vld2.16 {d16[3], d18[3]}, [r3], r1 -+ vld2.16 {d20[3], d22[3]}, [r0], r1 -+ blo 10f -+ -+ vld2.16 {d17[0], d19[0]}, [r3], r1 -+ vld2.16 {d21[0], d23[0]}, [r0], r1 -+ -+ sub ip, r0, r3 -+ vld2.16 {d17[1], d19[1]}, [r3], r1 -+ vld2.16 {d21[1], d23[1]}, [r0], r1 -+ -+ cmp ip, #4 -+ vld2.16 {d17[2], 
d19[2]}, [r3], r1 -+ vld2.16 {d21[2], d23[2]}, [r0], r1 -+ -+ vld2.16 {d17[3], d19[3]}, [r3] -+ vld2.16 {d21[3], d23[3]}, [r0] -+ -+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ -+ "ldr lr, [sp, #4]", \ -+ "neg r1, r1", \ -+ "it eq; cmpeq lr, #0", \ -+ "add r3, #2", \ -+ "add ip, r3, r1", \ -+ "add r2, r0, r1", \ -+ "lsl r1, #1" -+ -+ bne 1f -+ -+@ Much/most of the time r0 == r3 + 4 and no_f == 0 -+@ so it is worth having this special case -+ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b -+ vst2.16 {d19[2], d21[2]}, [ip], r1 -+ vst2.16 {d19[1], d21[1]}, [r3], r1 -+ vst2.16 {d19[0], d21[0]}, [ip], r1 -+ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a -+ vst2.16 {d18[2], d20[2]}, [ip], r1 -+ vst2.16 {d18[1], d20[1]}, [r3] -+ vst2.16 {d18[0], d20[0]}, [ip] -+ pop {pc} -+ -+@ Either split or partial -+1: -+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 -+ ittt cs -+ addcs r0, r0, r1, lsl #1 -+ addcs r2, r2, r1, lsl #1 -+ bcs 1f -+ @ Q0b -+ vst1.16 {d21[3]}, [r0], r1 -+ vst1.16 {d21[2]}, [r2], r1 -+ vst1.16 {d21[1]}, [r0], r1 -+ vst1.16 {d21[0]}, [r2], r1 -+1: -+ ittt mi -+ addmi r3, r3, r1, lsl #1 -+ addmi ip, ip, r1, lsl #1 -+ bmi 1f -+ @ P0b -+ vst1.16 {d19[3]}, [r3], r1 -+ vst1.16 {d19[2]}, [ip], r1 -+ vst1.16 {d19[1]}, [r3], r1 -+ vst1.16 {d19[0]}, [ip], r1 -+1: -+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 -+ bcs 1f -+ @ Q0a -+ vst1.16 {d20[3]}, [r0], r1 -+ vst1.16 {d20[2]}, [r2], r1 -+ vst1.16 {d20[1]}, [r0] -+ vst1.16 {d20[0]}, [r2] -+1: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.16 {d18[3]}, [r3], r1 -+ vst1.16 {d18[2]}, [ip], r1 -+ vst1.16 {d18[1]}, [r3] -+ vst1.16 {d18[0]}, [ip] -+ pop {pc} -+ -+@ Single lump (rather than double) -+10: -+ @ As we have post inced r0/r3 in the load the easiest thing to do is -+ @ to subtract and write forwards, rather than backwards (as above) -+ @ b0 (P0a) -> N, b1 (Q0a) -> C -+ -+ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ -+ "ldr lr, [sp, #4]", \ -+ "add r3, #2", \ -+ "sub r0, r0, r1, lsl #2", \ -+ "sub r3, r3, r1, lsl #2", \ -+ "lsls lr, #31", \ -+ "add r2, r0, r1", \ -+ "add ip, r3, r1", \ -+ "lsl r1, #1" -+ -+ bcs 3f -+ @ Q0a -+ vst1.16 {d20[0]}, [r0], r1 -+ vst1.16 {d20[1]}, [r2], r1 -+ vst1.16 {d20[2]}, [r0] -+ vst1.16 {d20[3]}, [r2] -+3: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.16 {d18[0]}, [r3], r1 -+ vst1.16 {d18[1]}, [ip], r1 -+ vst1.16 {d18[2]}, [r3] -+ vst1.16 {d18[3]}, [ip] -+ pop {pc} -+ -+endfunc -+ -+ -+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 -+@ unsigned int stride, // r1 -+@ uint32_t tc4, // r2 -+@ uint8_t * src_l, // r3 -+@ unsigned int no_f); // sp[0] -+@ -+ -+@ no_f -+@ 0 tl P0a -+@ 1 tr Q0a -+@ 2 bl P0b -+@ 3 br Q0b -+ -+@ P1: q8, q12 -+@ P0: q9, q13 -+@ Q0: q10, q14 -+@ Q1: q11, q15 -+ -+.macro m_filter_v_uv2_16 bit_depth -+ cmp r2, #0 -+ it eq -+ bxeq lr -+ push {lr} -+ vld2.32 {d16[0], d18[0]}, [r3], r1 -+ vld2.32 {d20[0], d22[0]}, [r0], r1 -+ -+ cmp r2, #0x10000 -+ vld2.32 {d16[1], d18[1]}, [r3], r1 -+ vld2.32 {d20[1], d22[1]}, [r0], r1 -+ -+ vld2.32 {d17[0], d19[0]}, [r3], r1 -+ vld2.32 {d21[0], d23[0]}, [r0], r1 -+ -+ vld2.32 {d17[1], d19[1]}, [r3], r1 -+ vld2.32 {d21[1], d23[1]}, [r0], r1 -+ blo 10f -+ -+ vld2.32 {d24[0], d26[0]}, [r3], r1 -+ vld2.32 {d28[0], d30[0]}, [r0], r1 -+ -+ sub ip, r0, r3 -+ vld2.32 {d24[1], d26[1]}, [r3], r1 -+ vld2.32 {d28[1], d30[1]}, [r0], r1 -+ -+ cmp ip, #8 -+ vld2.32 {d25[0], d27[0]}, [r3], r1 -+ vld2.32 {d29[0], d31[0]}, [r0], r1 -+ -+ vld2.32 {d25[1], d27[1]}, [r3] -+ vld2.32 {d29[1], d31[1]}, [r0] 
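-+ @ All eight rows are now loaded: q8-q11 hold P1,P0,Q0,Q1 for the
-+ @ first four rows (the "a" half) and q12-q15 the same for the last
-+ @ four (the "b" half), each 32-bit lane still an interleaved U,V pair
-+ @ as per the register map above.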
-+ -+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ -+ "ldr lr, [sp, #4]", \ -+ "neg r1, r1", \ -+ "it eq; cmpeq lr, #0", \ -+ "add r3, #4", \ -+ "add ip, r3, r1", \ -+ "add r2, r0, r1", \ -+ "lsl r1, #1" -+ -+ bne 1f -+ -+@ Much/most of the time r0 == r3 + 8 and no_f == 0 -+@ so it is worth having this special case -+ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b -+ vst2.32 {d27[0], d29[0]}, [ip], r1 -+ vst2.32 {d26[1], d28[1]}, [r3], r1 -+ vst2.32 {d26[0], d28[0]}, [ip], r1 -+ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a -+ vst2.32 {d19[0], d21[0]}, [ip], r1 -+ vst2.32 {d18[1], d20[1]}, [r3] -+ vst2.32 {d18[0], d20[0]}, [ip] -+ pop {pc} -+ -+@ Either split or partial -+1: -+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 -+ ittt cs -+ addcs r0, r0, r1, lsl #1 -+ addcs r2, r2, r1, lsl #1 -+ bcs 1f -+ @ Q0b -+ vst1.32 {d29[1]}, [r0], r1 -+ vst1.32 {d29[0]}, [r2], r1 -+ vst1.32 {d28[1]}, [r0], r1 -+ vst1.32 {d28[0]}, [r2], r1 -+1: -+ ittt mi -+ addmi r3, r3, r1, lsl #1 -+ addmi ip, ip, r1, lsl #1 -+ bmi 1f -+ @ P0b -+ vst1.32 {d27[1]}, [r3], r1 -+ vst1.32 {d27[0]}, [ip], r1 -+ vst1.32 {d26[1]}, [r3], r1 -+ vst1.32 {d26[0]}, [ip], r1 -+1: -+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 -+ bcs 1f -+ @ Q0a -+ vst1.32 {d21[1]}, [r0], r1 -+ vst1.32 {d21[0]}, [r2], r1 -+ vst1.32 {d20[1]}, [r0] -+ vst1.32 {d20[0]}, [r2] -+1: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.32 {d19[1]}, [r3], r1 -+ vst1.32 {d19[0]}, [ip], r1 -+ vst1.32 {d18[1]}, [r3] -+ vst1.32 {d18[0]}, [ip] -+ pop {pc} -+ -+@ Single lump (rather than double) -+10: -+ @ As we have post inced r0/r3 in the load the easiest thing to do is -+ @ to subtract and write forwards, rather than backwards (as above) -+ @ b0 (P0a) -> N, b1 (Q0a) -> C -+ -+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ -+ "ldr lr, [sp, #4]", \ -+ "add r3, #4", \ -+ "sub r0, r0, r1, lsl #2", \ -+ "sub r3, r3, r1, lsl #2", \ -+ "lsls lr, #31", \ -+ "add r2, r0, r1", \ -+ "add ip, r3, r1", \ -+ "lsl r1, #1" -+ -+ bcs 3f -+ @ Q0a -+ vst1.32 {d20[0]}, [r0], r1 -+ vst1.32 {d20[1]}, [r2], r1 -+ vst1.32 {d21[0]}, [r0] -+ vst1.32 {d21[1]}, [r2] -+3: -+ it mi -+ popmi {pc} -+ @ P0a -+ vst1.32 {d18[0]}, [r3], r1 -+ vst1.32 {d18[1]}, [ip], r1 -+ vst1.32 {d19[0]}, [r3] -+ vst1.32 {d19[1]}, [ip] -+ pop {pc} -+.endm -+ -+ -+@ The NEON version is faster under ideal circumstances (i.e. everything in L1) -+@ But in real world testing it is ~20% slower, presumably due to code size -+ -+#if 0 // NEON version -+ -+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, -+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ * int in_inc0, int in_inc1) -+ */ -+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 -+ mov ip, sp -+ push {a1-a3,v1-v8,lr} -+ ldm ip, {v1-v6} -+ cmp a1, #2 -+ bls 2f -+ vpush {d8-d13} -+ sub v5, v5, #10 -+ sub v6, v6, #10 -+1: -+ vld2.32 {d0[0], d2[0]}, [a3]! -+ vld2.32 {d4[0], d6[0]}, [a4]! -+ vmov.u8 q12, #0 -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb v8, [a3], #1 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[0]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[0]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d16[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d20[0]}, [ip] -+ vld1.32 {d18[0]}, [v8] -+ vld1.32 {d22[0]}, [lr] -+ -+ vld2.32 {d0[1], d2[1]}, [a3]! -+ vld2.32 {d4[1], d6[1]}, [a4]! 
-+ ldrb a2, [a3], #1 -+ vmov.u16 d12, #1 -+ ldrb ip, [a4], #1 -+ vmov.u16 d13, #2 -+ ldrb v8, [a3], #1 -+ vmov.u16 d27, #4 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[2]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[2]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d16[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d20[1]}, [ip] -+ vld1.32 {d18[1]}, [v8] -+ vld1.32 {d22[1]}, [lr] -+ -+ vld2.32 {d1[0], d3[0]}, [a3]! -+ vld2.32 {d5[0], d7[0]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb lr, [a4], #1 -+ ldrb v8, [a3], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[4]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[4]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d17[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d21[0]}, [ip] -+ vld1.32 {d19[0]}, [v8] -+ vld1.32 {d23[0]}, [lr] -+ -+ vld2.32 {d1[1], d3[1]}, [a3]! -+ vld2.32 {d5[1], d7[1]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb v8, [a3], #1 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d24[6]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d25[6]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d17[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d21[1]}, [ip] -+ vld1.32 {d19[1]}, [v8] -+ vld1.32 {d23[1]}, [lr] -+ -+ @ So now we have: -+ @ q0.32[i] = curr[i].mv[0] -+ @ q1.32[i] = curr[i].mv[1] -+ @ q2.32[i] = neigh[i].mv[0] -+ @ q3.32[i] = neigh[i].mv[1] -+ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] -+ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] -+ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] -+ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] -+ @ d24.16[i] = curr[i].pred_flag -+ @ d25.16[i] = neigh[i].pred_flag -+ -+ vtst.16 d28, d24, d12 -+ vtst.16 d29, d24, d13 -+ vadd.i16 d8, d24, d12 -+ vadd.i16 d9, d25, d12 -+ vtst.16 d30, d25, d12 -+ vtst.16 d31, d25, d13 -+ veor d26, d8, d9 -+ ldr lr, [sp, 6*8 + 1*4] -+ vmovl.s16 q4, d28 -+ vmovl.s16 q5, d29 -+ teq lr, #1 -+ vmovl.s16 q14, d30 -+ it ne -+ lslne v1, lr, #1 -+ vmovl.s16 q15, d31 -+ it ne -+ rsbne v2, v1, #32 -+ vbif q0, q1, q4 -+ vbif q2, q3, q14 -+ vbif q1, q0, q5 -+ vbif q3, q2, q15 -+ vabd.s16 q12, q0, q2 -+ vabd.s16 q2, q1 -+ vabd.s16 q0, q3 -+ vabd.s16 q1, q3 -+ vbif q8, q9, q4 -+ vbif q10, q11, q14 -+ vbif q9, q8, q5 -+ vbif q11, q10, q15 -+ vclt.u16 d6, d24, d27 -+ vclt.u16 d8, d2, d27 -+ vclt.u16 d7, d25, d27 -+ vclt.u16 d9, d3, d27 -+ vclt.u16 d2, d0, d27 -+ vclt.u16 d0, d4, d27 -+ vclt.u16 d3, d1, d27 -+ vclt.u16 d1, d5, d27 -+ vceq.i32 q12, q10, q8 -+ vceq.i32 q10, q9 -+ vceq.i32 q8, q11 -+ vceq.i32 q9, q11 -+ vshrn.i32 d6, q3, #8 -+ vshrn.i32 d7, q4, #8 -+ vshrn.i32 d8, q1, #8 -+ vshrn.i32 d9, q0, #8 -+ vmovn.i32 d4, q12 -+ vmovn.i32 d2, q10 -+ vmovn.i32 d3, q8 -+ vmovn.i32 d5, q9 -+ vand q2, q3 -+ vrev16.8 q3, q3 -+ vand q2, q3 -+ vand q1, q4 -+ vrev16.8 q4, q4 -+ vand q1, q4 -+ vand d4, d5 -+ vand d2, d3 -+ vbic d0, d12, d4 -+ vshr.u16 d26, #2 -+ vbic d0, d2 -+ vmov.i16 d1, #0x5555 -+ vorr d0, d26 -+ bne 10f -+ -+ @ Merge results into result word, no duplicates -+ vmov a2, s0 -+ vmov v8, s1 -+ vmov.u16 ip, d0[1] -+ vmov.u16 lr, d0[3] -+ lsl a2, #30 -+ lsl v8, #30 -+ lsl ip, #30 -+ lsl lr, #30 -+ orr a2, ip, a2, lsr #2 -+ orr v8, lr, v8, lsr #2 -+ orr a2, v8, a2, lsr #4 -+ subs a1, #4 -+ orr v7, a2, v7, lsr #8 -+ bhi 1b -+ -+ mov a1, #32 -+ ldr a3, [sp, #6*8] -+ vpop {d8-d13} -+ sub a1, a1, a3, lsl #1 -+ mov a1, v7, lsr a1 -+ pop {a2-a4,v1-v8,pc} -+10: -+ @ Merge results into result word, with duplicates -+ vmul.i16 d0, d1 -+ vmov a2, s0 -+ vmov v8, s1 -+ vmov.u16 ip, d0[1] -+ vmov.u16 lr, d0[3] -+ lsl a2, 
v2 -+ subs a1, #4 -+ lsl v8, v2 -+ lsl ip, v2 -+ lsl lr, v2 -+ ldr v2, [sp, #6*8 + 12*4 + 1*4] -+T lsr a2, v1 -+T orr a2, ip, a2 -+A orr a2, ip, a2, lsr v1 -+ lsl ip, v1, #1 -+T lsr v8, v1 -+T orr v8, lr, v8 -+A orr v8, lr, v8, lsr v1 -+ lsl lr, v1, #2 -+T lsr a2, ip -+T orr a2, v8, a2 -+A orr a2, v8, a2, lsr ip -+ ldr v1, [sp, #6*8 + 12*4] -+T lsr v7, lr -+T orr v7, a2, v7 -+A orr v7, a2, v7, lsr lr -+ bhi 1b -+ -+ mov a1, #32 -+ ldrd a3, a4, [sp, #6*8] -+ vpop {d8-d13} -+ mls a1, a3, a4, a1 -+ mls a1, a3, a4, a1 -+ mov a1, v7, lsr a1 -+ pop {a2-a4,v1-v8,pc} -+ -+ -+2: -+ sub v5, v5, #10 -+ sub v6, v6, #10 -+ vmov.u8 d16, #0 -+ blo 3f -+ vld2.32 {d0[0], d1[0]}, [a3]! -+ vld2.32 {d2[0], d3[0]}, [a4]! -+ ldrb a2, [a3], #1 -+ ldrb ip, [a4], #1 -+ ldrb lr, [a4], #1 -+ ldrb v8, [a3], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d16[0]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d16[4]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d4[0]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d5[0]}, [ip] -+ vld1.32 {d6[0]}, [v8] -+ vld1.32 {d7[0]}, [lr] -+ -+3: -+ vld2.32 {d0[1], d1[1]}, [a3]! -+ vld2.32 {d2[1], d3[1]}, [a4]! -+ ldrb a2, [a3], #1 -+ vmov.u16 d17, #1 -+ ldrb ip, [a4], #1 -+ vmov.u16 d18, #2 -+ ldrb v8, [a3], #1 -+ vmov.u16 d19, #4 -+ ldrb lr, [a4], #1 -+ add a2, v1, a2, lsl #2 -+ vld1.8 {d16[2]}, [a3], v5 -+ add ip, v3, ip, lsl #2 -+ vld1.8 {d16[6]}, [a4], v6 -+ add v8, v2, v8, lsl #2 -+ vld1.32 {d4[1]}, [a2] -+ add lr, v4, lr, lsl #2 -+ vld1.32 {d5[1]}, [ip] -+ vld1.32 {d6[1]}, [v8] -+ vld1.32 {d7[1]}, [lr] -+ -+ @ So now we have: -+ @ d0.32[i] = curr[i].mv[0] -+ @ d1.32[i] = curr[i].mv[1] -+ @ d2.32[i] = neigh[i].mv[0] -+ @ d3.32[i] = neigh[i].mv[1] -+ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] -+ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] -+ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] -+ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] -+ @ d16.16[i] = curr[i].pred_flag -+ @ d16.16[2+i] = neigh[i].pred_flag -+ -+ vtst.16 d20, d16, d17 -+ vtst.16 d22, d16, d18 -+ vadd.i16 d30, d16, d17 -+ vswp d2, d3 -+ ldr lr, [sp, #1*4] -+ vmovl.s16 q10, d20 -+ teq lr, #1 -+ vmovl.s16 q11, d22 -+ it ne -+ lslne v1, lr, #1 -+ vbif d0, d1, d20 -+ vbif d4, d6, d20 -+ vbif d3, d2, d21 -+ vbif d5, d7, d21 -+ vbif d1, d0, d22 -+ vbif d6, d4, d22 -+ vbif d2, d3, d23 -+ vbif d7, d5, d23 -+ vshr.u16 d30, #2 -+ vabd.s16 d24, d0, d3 -+ vabd.s16 d25, d1, d2 -+ vabd.s16 q0, q0, q1 -+ vceq.i32 d2, d4, d5 -+ vceq.i32 d20, d5, d6 -+ vceq.i32 d21, d4, d7 -+ vceq.i32 d3, d6, d7 -+ vclt.u16 d6, d24, d19 -+ vclt.u16 d7, d25, d19 -+ vclt.u16 d22, d1, d19 -+ vclt.u16 d23, d0, d19 -+ vshrn.i32 d6, q3, #8 -+ vmovn.i32 d2, q1 -+ vshrn.i32 d7, q11, #8 -+ vmovn.i32 d3, q10 -+ vand q0, q3, q1 -+ it ne -+ rsbne v2, v1, #32 -+ vrev16.8 q3, q3 -+ vand q0, q3 -+ vsra.u64 d30, #32 -+ vshr.u64 q1, q0, #32 -+ vand q0, q1 -+ vbic d0, d17, d0 -+ vand d30, d30, d17 -+ vbic d0, d1 -+ vmov.i16 d1, #0x5555 -+ vorr d0, d30 -+ bne 10f -+ -+ @ Construct result word, no duplicates -+ cmp a1, #2 -+ vmov.u16 a1, d0[1] -+ vmov.u16 a2, d0[0] -+ it eq -+ orreq a1, a2, a1, lsl #2 -+ pop {a2-a4,v1-v8,pc} -+10: -+ @ Construct result word, with duplicates -+ cmp a1, #2 -+ vmul.i16 d0, d1 -+ vmov a2, s0 -+ vmov.u16 a1, d0[1] -+ lsl a2, #16 -+ pkhbt a1, a1, a1, lsl #16 -+ lsr a2, v2 -+ lsr a1, v2 -+T itt eq -+T lsleq a1, v1 -+T orreq a1, a2, a1 -+A orreq a1, a2, a1, lsl v1 -+ pop {a2-a4,v1-v8,pc} -+endfunc -+ -+ -+ -+#else // non-NEON version -+ -+ -+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, 
const HEVCRpiMvField *neigh, -+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ * int in_inc0, in_inc1) -+ */ -+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 -+ add ip, sp, #4*4 -+ push {a2-a4,v1-v8,lr} -+ mov v6, #32 -+1: ldmdb ip, {v1-v4} -+ ldrsb v5, [a3, #8] @ curr->ref_idx -+ ldrsb v8, [a3, #9] -+ ldrsb ip, [a4, #8] @ neigh->ref_idx -+ ldrsb lr, [a4, #9] -+ ldr v1, [v1, v5, lsl #2] -+ ldrb v5, [a3, #10] @ curr->pred_flag -+ ldr v2, [v2, v8, lsl #2] -+ ldrb v8, [a4, #10] @ neigh->pred_flag -+ ldr v3, [v3, ip, lsl #2] -+ ldr v4, [v4, lr, lsl #2] -+ teq v5, #3 -+ beq 20f -+ teq v8, #3 -+ beq 90f -+ -+ tst v5, #1 -+ itee ne -+ ldrne v5, [a3, #0] @ curr->mv[0] -+ moveq v1, v2 -+ ldreq v5, [a3, #4] @ curr->mv[1] -+ tst v8, #1 -+ itee ne -+ ldrne v8, [a4, #0] @ neigh->mv[0] -+ moveq v3, v4 -+ ldreq v8, [a4, #4] @ neigh->mv[1] -+ teq v1, v3 -+ bne 10f -+ ldr lr, =0xFFFCFFFC -+ ssub16 ip, v8, v5 -+ ssub16 v5, v5, v8 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ @ drop through -+10: it ne -+ movne v5, #1<<30 -+11: -+ sub v6, v6, #2 -+T mov v7, v7, lsr #2 -+ subs a2, a2, #1 -+A orr v7, v5, v7, lsr #2 -+T orr v7, v5, v7 -+ bhi 11b -+ -+ ldrd v3, v4, [sp, #16*4] -+ ldr a2, [sp] -+ add ip, sp, #16*4 -+ subs a1, a1, #1 -+ add a3, a3, v3 -+ add a4, a4, v4 -+ bhi 1b -+ mov a1, v7, lsr v6 -+ pop {a2-a4,v1-v8,pc} -+ -+20: teq v8, #3 -+ bne 10b -+ -+ teq v1, v3 -+ it eq -+ teqeq v2, v4 -+ bne 40f -+ teq v1, v2 -+ bne 30f -+ -+ ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, =0xFFFCFFFC -+ ssub16 ip, v3, v1 -+ ssub16 v5, v1, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 25f -+ ssub16 ip, v4, v2 -+ ssub16 v5, v2, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ beq 11b -+ @ drop through -+25: ssub16 ip, v4, v1 -+ ssub16 v5, v1, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 10b -+ ssub16 ip, v3, v2 -+ ssub16 v5, v2, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ b 10b -+ -+30: ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, =0xFFFCFFFC -+ ssub16 ip, v3, v1 -+ ssub16 v5, v1, v3 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ bne 10b -+ ssub16 ip, v4, v2 -+ ssub16 v5, v2, v4 -+ sel v5, v5, ip -+ ands v5, v5, lr -+ b 10b -+ -+40: teq v1, v4 -+ ite eq -+ teqeq v2, v3 -+ bne 10b -+ -+ ldrd v1, v2, [a3] @ curr->mv -+ ldrd v3, v4, [a4] @ neigh->mv -+ ldr lr, =0xFFFCFFFC -+ b 25b -+ -+90: -+ mov v5, #1<<30 -+ b 11b -+endfunc -+ -+ -+#endif -+ -+ -+@ ============================================================================= -+@ -+@ 10 bit -+ -+function hevc_loop_filter_luma_body_10 -+ m_filter_luma 10, q11, q15 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 -+ hevc_loop_filter_luma_start -+ b .Lh_loop_luma_common_10 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r10, [sp, #32] -+.Lh_loop_luma_common_10: -+ m_filter_h_luma_16 10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1 -+ hevc_loop_filter_luma_start -+ sub r4, r0, #8 -+ b .Lv_loop_luma_common_10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1 -+ cmp r3, #0 -+ it eq -+ bxeq lr -+ push {r4-r10,lr} @ 32 bytes -+ ldr r4, [sp, #36] -+ ldr r10, [sp, #32] -+ -+.Lv_loop_luma_common_10: -+ m_filter_v_luma_16 10 -+endfunc -+ -+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 -+ m_filter_h_uv_16 10 -+endfunc -+ -+function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1 -+ m_filter_v_uv2_16 10 
-+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S -new file mode 100644 -index 0000000000..7ed5c7dc52 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S -@@ -0,0 +1,184 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+/* uses registers q8 - q13 for temp values */ -+.macro tr4_luma_shift shift -+ vaddl.s16 q8, d28, d30 // c0 = src0 + src2 -+ vaddl.s16 q9, d30, d31 // c1 = src2 + src3 -+ vsubl.s16 q10, d28, d31 // c2 = src0 - src3 -+ vaddl.s16 q11, d28, d31 // src0 + src3 -+ -+ vmul.i32 q12, q8, d1[0] // 29 * c0 -+ vmul.i32 q13, q10, d2[0] // 55 * c2 -+ vmul.i32 q8, q8, d2[0] // 55 * c0 -+ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1 -+ -+ vsubw.s16 q11, q11, d30 // src0 - src2 + src3 -+ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1 -+ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1 -+ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2 -+ -+ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3) -+ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3 -+ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3 -+ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3 -+ -+ vqrshrn.s32 d28, q12, \shift -+ vqrshrn.s32 d29, q13, \shift -+ vqrshrn.s32 d30, q11, \shift -+ vqrshrn.s32 d31, q8, \shift -+.endm -+ -+/* uses registers q8 - q11 for temp values */ -+.macro tr4_shift shift -+ vmull.s16 q9, d29, d0[0] // 83 * src1 -+ vmull.s16 q8, d29, d0[1] // 36 * src1 -+ vshll.s16 q14, d28, #6 // 64 * src0 -+ vshll.s16 q10, d30, #6 // 64 * src2 -+ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0 -+ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1 -+ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0 -+ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1 -+ vadd.s32 q14, q11, q9 // e0 + o0 -+ vadd.s32 q15, q10, q8 // e1 + o1 -+ vsub.s32 q8, q10, q8 // e1 - o1 -+ vsub.s32 q9, q11, q9 // e0 - o0 -+ -+ vqrshrn.s32 d28, q14, \shift -+ vqrshrn.s32 d29, q15, \shift -+ vqrshrn.s32 d30, q8, \shift -+ vqrshrn.s32 d31, q9, \shift -+.endm -+ -+.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \ -+ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \ -+ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \ -+ shift, I1, I2, I3 -+ -+ vmull.s16 q4, \d1, d1[1] // 89 * src1 -+ \I1 -+ vmull.s16 q5, \d1, d1[0] // 75 * src1 -+ \I2 -+ vmull.s16 q6, \d1, d1[3] // 50 * src1 -+ \I3 -+ vmull.s16 q7, \d1, d1[2] // 18 * src1 -+ vmlal.s16 q4, \d3, d1[0] // 75 * src3 -+ vmlsl.s16 q5, \d3, d1[2] //-18 * src3 -+ vmlsl.s16 q6, \d3, d1[1] //-89 * src3 -+ vmlsl.s16 q7, \d3, d1[3] //-50 * src3 -+ -+ // tr4 -+ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2) -+ 
vmull.s16 q2, \d2, d0[1] // 36 * src(1*2) -+ -+ vmlal.s16 q4, \d5, d1[3] // 50 * src5 -+ vmlsl.s16 q5, \d5, d1[1] //-89 * src5 -+ vmlal.s16 q6, \d5, d1[2] // 18 * src5 -+ vmlal.s16 q7, \d5, d1[0] // 75 * src5 -+ -+ vshll.s16 q3, \d0, #6 // 64 * src(0*2) -+ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2) -+ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0 -+ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1 -+ vadd.i32 \tmp1, q3, \tmp0 // 64 * (src(0*2) + src(2*2)) e0 -+ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1 -+ -+ vmlal.s16 q4, \d7, d1[2] // 18 * src7 -+ vmlsl.s16 q5, \d7, d1[3] //-50 * src7 -+ vmlal.s16 q6, \d7, d1[0] // 75 * src7 -+ vmlsl.s16 q7, \d7, d1[1] //-89 * src7 -+ -+ vsub.i32 q3, \tmp1, q1 // e0 - o0 -+ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0 -+ vadd.i32 q1, \tmp0, q2 // e1 + o1 -+ vsub.i32 q2, \tmp0, q2 // e1 - o1 -+ -+ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0] -+ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7] -+ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4] -+ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3] -+ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1] -+ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6] -+ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5] -+ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2] -+ vqrshrn.s32 \d0, \tmp0, #\shift -+ vqrshrn.s32 \d4, \tmp1, #\shift -+ vqrshrn.s32 \d1, q3, #\shift -+ vqrshrn.s32 \d5, q1, #\shift -+ vqrshrn.s32 \d2, q6, #\shift -+ vqrshrn.s32 \d6, q5, #\shift -+ vqrshrn.s32 \d3, q7, #\shift -+ vqrshrn.s32 \d7, q4, #\shift -+.endm -+ -+.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3 -+ vld1.16 {\d0}, [r0 :64], r3 -+ vld1.16 {\d1}, [r2 :64], r3 -+ vld1.16 {\d2}, [r0 :64], r3 -+ vld1.16 {\d3}, [r2 :64], r3 -+ vld1.16 {\d4}, [r0 :64], r3 -+ vld1.16 {\d5}, [r2 :64], r3 -+ vld1.16 {\d6}, [r0 :64], r3 -+ vld1.16 {\d7}, [r2 :64], r3 -+ -+ tr8_process \ -+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ -+ \q01, \q23, 7, "\I1", "\I2", "\I3" -+.endm -+ -+.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift -+ tr8_process \ -+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ -+ \q01, \q23, \shift -+ -+ vzip.16 \d0, \d4 -+ vzip.16 \d1, \d5 -+ vzip.16 \d2, \d6 -+ vzip.16 \d3, \d7 -+ vst4.16 {\d0-\d3}, [r0 :128], r3 -+ vst4.16 {\d4-\d7}, [r2 :128], r3 -+.endm -+ -+#define BIT_DEPTH 8 -+#include "rpi_hevc_idct_fn_neon.S" -+ -+.text -+ -+.align 4 -+tr4f: -+.word 0x00240053 // 36 and d1[0] = 83 -+.word 0x00000000 -+tr8f: -+.word 0x0059004b // 89, d0[0] = 75 -+.word 0x00320012 // 50, d0[2] = 18 -+tr16: -+.word 0x005a0057 // 90, d2[0] = 87 -+.word 0x00500046 // 80, d2[2] = 70 -+.word 0x0039002b // 57, d2[0] = 43 -+.word 0x00190009 // 25, d2[2] = 9 -+ -+#undef BIT_DEPTH -+#define BIT_DEPTH 10 -+#include "rpi_hevc_idct_fn_neon.S" -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c -new file mode 100644 -index 0000000000..109fa98c29 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c -@@ -0,0 +1,32 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/arm/cpu.h" -+#include "libavcodec/rpi_hevcdsp.h" -+#include "rpi_hevcdsp_arm.h" -+ -+av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth) -+{ -+ int cpu_flags = av_get_cpu_flags(); -+ -+ if (have_neon(cpu_flags)) -+ ff_hevcdsp_rpi_init_neon(c, bit_depth); -+} -diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c -new file mode 100644 -index 0000000000..9294ab8010 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c -@@ -0,0 +1,467 @@ -+/* -+ * Copyright (c) 2014 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "config.h" -+#include "libavutil/attributes.h" -+#include "libavutil/arm/cpu.h" -+#include "libavcodec/rpi_hevcdsp.h" -+#include "rpi_hevcdsp_arm.h" -+#include "libavcodec/avcodec.h" -+#include "libavcodec/bit_depth_template.c" -+ -+// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but -+// have been removed from head as we never use them. 
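-+// Illustrative note on the dispatch tables set up below: idct[] and the
-+// add_residual[] groups are indexed by log2(block side) - 2, so
-+// 4x4..32x32 map to slots 0..3, while the SAO tables run 8,16,32,48,64
-+// through slots 0..4 (24 sits in slot 5 when SAO_FILTER_N == 6).
-+// After ff_hevcdsp_rpi_init_neon(c, 8) a caller would add an 8x8 luma
-+// residual block with roughly: c->add_residual[1](dst, coeffs, stride);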
-+ -+void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+ -+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+ -+void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, -+ uint8_t * _pix_l); -+void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, -+ unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f); -+ -+void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, -+ uint8_t * _pix_l); -+void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, -+ unsigned int no_f); -+void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f); -+ -+void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs); -+void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs); -+ -+void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit); -+void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs); -+void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs); -+ -+void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+ -+void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void 
ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+ -+void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+ -+ -+void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+ -+void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void 
ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height); -+ -+void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+ -+void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t 
stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height); -+ -+void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ -+void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ -+ -+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh, -+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc0, int in_inc1); -+void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); -+ -+ -+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); -+ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); -+} -+static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); -+ ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); -+} -+ -+static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); -+ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+} -+static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, 
height); -+ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+} -+ -+#if SAO_FILTER_N == 6 -+static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); -+ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); -+} -+static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); -+ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); -+} -+static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) -+{ -+ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); -+ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); -+} -+static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); -+ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); -+} -+ -+static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); -+ ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); -+} -+static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int 
height) -+{ -+ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); -+ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, -+ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); -+} -+#endif -+ -+ -+ -+#if RPI_HEVC_SAO_BUF_STRIDE != 160 -+#error SAO edge src stride not 160 - value used in .S -+#endif -+ -+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) -+{ -+ if (bit_depth == 8) { -+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; -+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8; -+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; -+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; -+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8; -+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8; -+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8; -+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8; -+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8; -+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8; -+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8; -+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8; -+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_8; -+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8; -+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8; -+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8; -+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8; -+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8; -+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8; -+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8; -+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8; -+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8; -+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8; -+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8; -+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8; -+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8; -+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8; -+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8; -+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8; -+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8; -+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8; -+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8; -+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; -+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8; -+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8; -+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8; -+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8; -+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8; -+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8; -+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8; -+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8; -+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8; -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8; -+ c->sao_edge_filter[5] = 
ff_hevc_rpi_sao_edge_24_neon_8; -+#endif -+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8; -+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8; -+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8; -+ -+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8; -+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8; -+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8; -+ -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8; -+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8; -+#endif -+ } -+ else if (bit_depth == 10) { -+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10; -+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10; -+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10; -+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10; -+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10; -+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10; -+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10; -+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10; -+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10; -+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10; -+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10; -+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10; -+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10; -+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10; -+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10; -+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10; -+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10; -+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10; -+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10; -+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10; -+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10; -+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10; -+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10; -+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10; -+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10; -+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10; -+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10; -+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10; -+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10; -+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10; -+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10; -+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10; -+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10; -+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10; -+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10; -+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10; -+ -+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10; -+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10; -+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10; -+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10; -+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10; -+#if SAO_FILTER_N == 6 -+ 
c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10; -+ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10; -+#endif -+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10; -+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10; -+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10; -+ -+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10; -+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_10; -+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10; -+ -+#if SAO_FILTER_N == 6 -+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10; -+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10; -+#endif -+ } -+ -+ assert(offsetof(HEVCRpiMvField, mv) == 0); -+ assert(offsetof(HEVCRpiMvField, ref_idx) == 8); -+ assert(offsetof(HEVCRpiMvField, pred_flag) == 10); -+ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; -+ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; -+} -diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S -new file mode 100644 -index 0000000000..93876d14c0 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S -@@ -0,0 +1,620 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ .arch_extension mp @ enable PLDW -+ -+#define BIT_DEPTH 10 -+ -+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmax.s16 \Q2, \Q_MIN -+ vmax.s16 \Q3, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+ vmin.s16 \Q2, \Q_MAX -+ vmin.s16 \Q3, \Q_MAX -+.endm -+ -+@ add_residual4x4( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1] -+ lsl r2, #1 -+ vld1.16 {d0}, [r0 :64], r2 -+ vld1.16 {d1}, [ip :64], r2 -+ vld1.16 {d2}, [r0 :64] -+ vld1.16 {d3}, [ip :64] -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q11 -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {d0}, [r0 :64], r2 -+ vst1.16 {d1}, [ip :64], r2 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [ip :64] -+ bx lr -+ -+endfunc -+ -+@ add_residual4x4_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r1 -+ vdup.16 q15, r2 -+ lsl r1, #1 -+ vld1.16 {d0}, [r0 :64], r1 -+ vld1.16 {d1}, [ip :64], r1 -+ vld1.16 {d2}, [r0 :64] -+ vld1.16 {d3}, [ip :64] -+ sub r0, r1 -+ vqadd.s16 q0, q15 -+ sub ip, r1 -+ vqadd.s16 q1, q15 -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {d0}, [r0 :64], r1 -+ vst1.16 {d1}, [ip :64], r1 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [ip :64] -+ bx lr -+ -+endfunc -+ -+ -+@ add_residual8x8( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1 -+ mov r3, #8 -+ vmov.i64 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vldm r1!, {q10-q13} -+ vld1.16 {q0}, [r0 :128], r2 -+ vld1.16 {q1}, [ip :128], r2 -+ vld1.16 {q2}, [r0 :128] -+ vld1.16 {q3}, [ip :128] -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q11 -+ subs r3, #4 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0}, [r0 :128], r2 -+ vst1.16 {q1}, [ip :128], r2 -+ vst1.16 {q2}, [r0 :128], r2 -+ vst1.16 {q3}, [ip :128], r2 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ add_residual4x4_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #4 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual8x8_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r2 -+ mov r3, #8 -+9: -+ vmov.i16 q8, #0 -+ add ip, r0, r1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r1, #1 -+1: -+ vld1.16 {q0}, [r0 :128], r1 -+ vld1.16 {q1}, [ip :128], r1 -+ vld1.16 {q2}, [r0 :128] -+ vld1.16 {q3}, [ip :128] -+ sub r0, r1 -+ vqadd.s16 q0, q15 -+ sub ip, r1 -+ vqadd.s16 q1, q15 -+ subs r3, #4 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0}, [r0 :128], r1 -+ vst1.16 {q1}, [ip :128], r1 -+ vst1.16 {q2}, [r0 :128], r1 -+ vst1.16 {q3}, [ip :128], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ add_residual16x16( -+@ 
uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1 -+ add ip, r0, r2 -+ vmov.i16 q8, #0 -+ lsl r2, #1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ mov r3, #16 -+1: -+ vldm r1!, {q10-q13} -+ @ For RPI Sand we could guarantee :256 but not for general -+ @ non-RPI allocation. :128 is as good as we can claim -+ vld1.16 {q0, q1}, [r0 :128] -+ subs r3, #2 -+ vld1.16 {q2, q3}, [ip :128] -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q11 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0, q1}, [r0 :128], r2 -+ vst1.16 {q2, q3}, [ip :128], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual8x8_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #8 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual16x16_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 -+ vdup.i16 q15, r2 -+ mov r3, #16 -+9: -+ vmov.i16 q8, #0 -+ add ip, r0, r1 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r1, #1 -+1: -+ @ For RPI Sand we could guarantee :256 but not for general -+ @ non-RPI allocation. :128 is as good as we can claim -+ vld1.16 {q0, q1}, [r0 :128] -+ subs r3, #2 -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q15 -+ vld1.16 {q2, q3}, [ip :128] -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0, q1}, [r0 :128], r1 -+ vst1.16 {q2, q3}, [ip :128], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+ -+@ add_residual32x32( -+@ uint16_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ mov r3, #32 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vldm r1!, {q10-q13} -+ vldm r0, {q0-q3} -+ vqadd.s16 q0, q10 -+ pldw [lr] -+ vqadd.s16 q1, q11 -+ add lr, r2 -+ vqadd.s16 q2, q12 -+ subs r3, #1 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0-q1}, [r0], r2 -+ vst1.16 {q2-q3}, [ip], r2 -+ bne 1b -+ pop {pc} -+ -+endfunc -+ -+@ add_residual16x16_dc_c( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r3, #16 -+ vdup.32 q15, r2 -+ b 9f -+endfunc -+ -+@ add_residual32x32_dc( -+@ uint16_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r2 -+ mov r3, #32 -+9: -+ vmov.i16 q8, #0 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vldm r0, {q0-q3} -+ vqadd.s16 q0, q15 -+ subs r3, #1 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0-q1}, [r0], r1 -+ vst1.16 {q2-q3}, [ip], r1 -+ bne 1b -+ bx lr -+ -+endfunc -+ -+@ ============================================================================ -+@ U add -+ -+@ add_residual4x4_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1 :256] -+ lsl r2, #1 -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip 
:128] -+ sub r0, r2 -+ vmov.i16 q8, #0 -+ sub ip, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ mov r3, #8 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ subs r3, #2 -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual16x16_u( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ vdup.16 q15, r3 -+ mov r3, #16 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vqadd.s16 q0, q10 -+ pldw [lr] -+ vqadd.s16 q1, q15 -+ add lr, r2 -+ vqadd.s16 q2, q11 -+ subs r3, #1 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ ============================================================================ -+@ V add -+ -+@ add_residual4x4_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld1.16 {q10, q11}, [r1 :256] -+ lsl r2, #1 -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip :128] -+ sub r0, r2 -+ vmov.i16 q8, #0 -+ sub ip, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ mov r3, #8 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ subs r3, #2 -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! 
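-+ @ vld2.16 has de-interleaved the CbCr pairs: q0/q2 hold the U lanes
-+ @ and q1/q3 the V lanes, so below the residual (q10/q11) is added to
-+ @ V while U only gets the dc_u bias broadcast into q15.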
-+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ bx lr -+endfunc -+ -+@ add_residual16x16_v( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ vdup.16 q15, r3 -+ mov r3, #16 -+ vmov.i16 q8, #0 -+ add lr, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vqadd.s16 q0, q15 -+ pldw [lr] -+ vqadd.s16 q1, q10 -+ add lr, r2 -+ vqadd.s16 q2, q15 -+ subs r3, #1 -+ vqadd.s16 q3, q11 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ ============================================================================ -+@ U & V add -+ -+@ add_residual4x4_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 -+ vmov.i16 q8, #0 -+ add ip, r0, r2 -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ lsl r2, #1 -+ vldm r1, {q10-q13} -+ vld2.16 {d0, d2}, [r0 :128], r2 -+ vld2.16 {d1, d3}, [ip :128], r2 -+ vld2.16 {d4, d6}, [r0 :128] -+ vld2.16 {d5, d7}, [ip :128] -+ -+ sub r0, r2 -+ vqadd.s16 q0, q10 -+ sub ip, r2 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ -+ vst2.16 {d0, d2}, [r0 :128], r2 -+ vst2.16 {d1, d3}, [ip :128], r2 -+ vst2.16 {d4, d6}, [r0 :128] -+ vst2.16 {d5, d7}, [ip :128] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 -+ push {lr} -+ add ip, r0, r2 -+ lsl r2, #1 -+ vmov.i16 q8, #0 -+ add r3, r1, #(8*8*2) @ Offset to V -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ mov lr, #8 -+1: -+ vld1.16 {q10, q11}, [r1 :256]! -+ subs lr, #2 -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q12, q13}, [r3 :256]! -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {pc} -+endfunc -+ -+@ add_residual16x16_c( -+@ uint16_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 -+ push {r4, lr} -+ vmov.i16 q8, #0 -+ add r3, r1, #(16*16*2) @ Offset to V -+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 -+ add ip, r0, #32 -+ add r4, r0, r2 -+ mov lr, #16 -+1: -+ vld2.16 {q0, q1}, [r0 :256] -+ vld2.16 {q2, q3}, [ip :256] -+ vld1.16 {q10, q11}, [r1 :256]! -+ vld1.16 {q12, q13}, [r3 :256]! -+ vqadd.s16 q0, q10 -+ pldw [r4] -+ vqadd.s16 q1, q12 -+ add r4, r2 -+ vqadd.s16 q2, q11 -+ subs lr, #1 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0 :256], r2 -+ vst2.16 {q2, q3}, [ip :256], r2 -+ bne 1b -+ pop {r4,pc} -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S -new file mode 100644 -index 0000000000..d9a1d7d98c ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S -@@ -0,0 +1,741 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. 
-+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ .arch_extension mp @ enable PLDW -+ -+@ General notes: -+@ -+@ Residual is generally only guaranteed to be clipped to 16 bits. -+@ This means that we do need to do vmovl, vqadd, vqmovun -+@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away -+@ with this). -+@ -+@ There is an exception for the DC case because its transform is guaranteed -+@ to be small enough that overflow cannot occur during the first add. -+ -+@ ============================================================================ -+@ Y add -+ -+function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1] -+ lsl r2, #1 -+ vld1.32 d4[0], [r0], r2 -+ rsb r3, r2, #0 -+ vld1.32 d4[1], [ip], r2 -+ vld1.32 d5[0], [r0], r3 -+ vld1.32 d5[1], [ip], r3 -+ vmovl.u8 q8, d4 -+ vmovl.u8 q9, d5 -+ vqadd.s16 q0, q8 -+ vqadd.s16 q1, q9 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q1 -+ vst1.32 d0[0], [r0], r2 -+ vst1.32 d0[1], [ip], r2 -+ vst1.32 d1[0], [r0] -+ vst1.32 d1[1], [ip] -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 -+ push {r4, lr} -+ vld1.16 {q0, q1}, [r1]! -+ add ip, r0, r2 -+ vld1.8 {d6}, [r0] -+ add r4, r0, r2, lsl #1 -+ vld1.8 {d7}, [ip] -+ add lr, ip, r2, lsl #1 -+ lsl r2, #1 -+ mov r3, #8-2 -+ vmovl.u8 q2, d6 -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+1: -+ vld1.16 {q0, q1}, [r1]! -+ subs r3, #2 -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vld1.8 {d6}, [r4], r2 -+ vld1.8 {d7}, [lr], r2 -+ vst1.8 {d4}, [r0], r2 -+ vst1.8 {d5}, [ip], r2 -+ vmovl.u8 q2, d6 -+ pldw [r4] -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+ bne 1b -+ -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vst1.8 {d4}, [r0] -+ vst1.8 {d5}, [ip] -+ pop {r4, pc} -+endfunc -+ -+function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 -+ vld1.16 {q0, q1}, [r1]! 
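-+ @ Worked example of the General notes above: the residual just loaded
-+ @ into q0/q1 is full 16-bit, so the pixels are widened with vmovl.u8
-+ @ and added with vqadd.s16.  For pixel 255 + residual 32767 that
-+ @ saturates to 32767 and vqmovun.s16 clamps to 255; a plain vaddw.u8
-+ @ would wrap the s16 sum to -32514 and vqmovun.s16 would then give 0.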
-+ add ip, r0, r2 -+ vld1.8 {q3}, [r0] -+ mov r3, #16-1 -+ vmovl.u8 q2, d6 -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+1: -+ vld1.16 {q0, q1}, [r1]! -+ subs r3, #1 -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vld1.8 {q3}, [ip], r2 -+ vst1.8 {q2}, [r0], r2 -+ vmovl.u8 q2, d6 -+ pldw [ip] -+ vmovl.u8 q3, d7 -+ vqadd.s16 q2, q0 -+ vqadd.s16 q3, q1 -+ bne 1b -+ -+ vqmovun.s16 d4, q2 -+ vqmovun.s16 d5, q3 -+ vst1.8 {q2}, [r0] -+ bx lr -+endfunc -+ -+function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 -+ vldm r1!, {q0-q3} -+ vld1.8 {q8, q9}, [r0] -+ add ip, r0, r2 -+ vmovl.u8 q10, d16 -+ mov r3, #32-1 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q11, q1 -+ vqadd.s16 q12, q2 -+ vqadd.s16 q13, q3 -+1: -+ vldm r1!, {q0-q3} -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q11 -+ vqmovun.s16 d22, q12 -+ vqmovun.s16 d23, q13 -+ vld1.8 {q8, q9}, [ip], r2 -+ subs r3, #1 -+ vst1.8 {q10, q11}, [r0], r2 -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q11, q1 -+ vqadd.s16 q12, q2 -+ vqadd.s16 q13, q3 -+ bne 1b -+ -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q11 -+ vqmovun.s16 d22, q12 -+ vqmovun.s16 d23, q13 -+ vst1.8 {q10, q11}, [r0] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_4x4_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 -+ add ip, r0, r1 -+ vdup.16 q15, r2 -+ lsl r1, #1 -+ vld1.32 d4[0], [r0], r1 -+ rsb r3, r1, #0 -+ vld1.32 d4[1], [ip], r1 -+ vld1.32 d5[0], [r0], r3 -+ vld1.32 d5[1], [ip], r3 -+ vaddw.u8 q0, q15, d4 -+ vaddw.u8 q1, q15, d5 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q1 -+ vst1.32 d0[0], [r0], r1 -+ vst1.32 d0[1], [ip], r1 -+ vst1.32 d1[0], [r0] -+ vst1.32 d1[1], [ip] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ DC Y or C add -+ -+@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 -+ mov r3, #4-2 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_8x8_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #8-2 -+1: vld1.8 d16, [r0] -+ add ip, r0, r1 -+ push {r4, lr} -+ vld1.8 d17, [ip] -+ add r4, r0, r1, lsl #1 -+ vaddw.u8 q0, q15, d16 -+ lsl r1, #1 -+ vaddw.u8 q1, q15, d17 -+ add lr, ip, r1 -+1: -+ vld1.8 {d16}, [r4], r1 -+ vld1.8 {d17}, [lr], r1 -+ subs r3, #2 -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vst1.8 {d4}, [r0], r1 -+ vst1.8 {d5}, [ip], r1 -+ bne 1b -+ -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vst1.8 {d4}, [r0] -+ vst1.8 {d5}, [ip] -+ pop {r4, pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 -+ mov r3, #8-1 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_16x16_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #16-1 -+1: vld1.8 {q8}, [r0] -+ add ip, r0, r1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, 
d17 -+1: -+ vld1.8 {q8}, [ip], r1 -+ subs r3, #1 -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vst1.8 {q2}, [r0], r1 -+ bne 1b -+ -+ vqmovun.s16 d4, q0 -+ vqmovun.s16 d5, q1 -+ vst1.8 {q2}, [r0] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 -+ mov r3, #16-1 -+ vdup.32 q15, r2 -+ b 1f -+endfunc -+ -+@ ff_hevc_rpi_add_residual_32x32_dc_neon_8( -+@ uint8_t * dst, // [r0] -+@ unsigned int stride, // [r1] -+@ int dc) // [r2] -+ -+function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 -+ vdup.16 q15, r2 -+ mov r3, #32-1 -+1: vld1.8 {q8, q9}, [r0] -+ add ip, r0, r1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vaddw.u8 q2, q15, d18 -+ vaddw.u8 q3, q15, d19 -+1: -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d23, q3 -+ vld1.8 {q8, q9}, [ip], r1 -+ subs r3, #1 -+ vaddw.u8 q0, q15, d16 -+ vaddw.u8 q1, q15, d17 -+ vaddw.u8 q2, q15, d18 -+ vaddw.u8 q3, q15, d19 -+ vst1.8 {q10, q11}, [r0], r1 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d23, q3 -+ vst1.8 {q10, q11}, [r0] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ U add -+ -+@ add_residual4x4_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1] -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.8 {d18}, [r0 :64] -+ sub r0, r2 -+ vld1.8 {d19}, [ip :64] -+ sub ip, r2 -+ vdup.16 q2, r3 -+ vdup.16 q3, r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ push {r4, lr} -+ vld2.8 {d16, d17}, [r0 :128] -+ lsl r2, #1 -+ vld2.8 {d18, d19}, [ip :128] -+ mov r3, #8-2 -+ vld1.16 {q0, q1}, [r1 :256]! -+ add r4, r0, r2 -+ vmovl.u8 q10, d16 -+ add lr, ip, r2 -+ vmovl.u8 q11, d18 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d17 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d19 -+1: -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q2 -+ vld2.8 {d16, d17}, [r4 :128], r2 -+ subs r3, #2 -+ vqmovun.s16 d22, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vld2.8 {d18, d19}, [lr :128], r2 -+ vst2.8 {d22, d23}, [ip :128], r2 -+ vld1.16 {q0, q1}, [r1 :256]! 
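-+ @ U takes the real residual, so it needs the vmovl + vqadd route; the
-+ @ V lanes (d17/d19) only receive the small dc_v value, for which the
-+ @ non-saturating vaddw.u8 against q15 cannot overflow (see General
-+ @ notes above).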
-+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d18 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d17 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d19 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d21, q2 -+ vqmovun.s16 d22, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20, d21}, [r0 :128] -+ vst2.8 {d22, d23}, [ip :128] -+ pop {r4, pc} -+endfunc -+ -+@ add_residual16x16_u( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+@ int dc_v) [r3] -+ -+function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld2.8 {q8, q9}, [r0 :256] -+ mov r3, #16-1 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q11, d16 -+ vmovl.u8 q12, d17 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d18 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d19 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs r3, #1 -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q11 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q12 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vst2.8 {q10, q11}, [r0 :256], r2 -+ vmovl.u8 q11, d16 -+ pldw [ip] -+ vmovl.u8 q12, d17 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d18 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d19 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q11 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q12 -+ vst2.8 {q10, q11}, [r0 :256] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ V add -+ -+@ add_residual4x4_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q2, q3}, [r1] -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.8 {d18}, [r0 :64] -+ sub r0, r2 -+ vld1.8 {d19}, [ip :64] -+ sub ip, r2 -+ vdup.16 q0, r3 -+ vdup.16 q1, r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ push {r4, lr} -+ vld2.8 {d16, d17}, [r0 :128] -+ lsl r2, #1 -+ vld2.8 {d18, d19}, [ip :128] -+ mov r3, #8-2 -+ vld1.16 {q0, q1}, [r1 :256]! -+ add r4, r0, r2 -+ vmovl.u8 q10, d17 -+ add lr, ip, r2 -+ vmovl.u8 q11, d19 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d16 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d18 -+1: -+ vqmovun.s16 d20, q2 -+ vqmovun.s16 d21, q0 -+ vld2.8 {d16, d17}, [r4 :128], r2 -+ subs r3, #2 -+ vqmovun.s16 d22, q3 -+ vqmovun.s16 d23, q1 -+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vld2.8 {d18, d19}, [lr :128], r2 -+ vst2.8 {d22, d23}, [ip :128], r2 -+ vld1.16 {q0, q1}, [r1 :256]! 
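-+        @ V variant: vld2 leaves V in d17/d19, so the residual is added there
-+        @ and the dc (q15) goes to the U lanes in d16/d18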
-+ vmovl.u8 q10, d17 -+ vmovl.u8 q11, d19 -+ vqadd.s16 q0, q10 -+ vaddw.u8 q2, q15, d16 -+ vqadd.s16 q1, q11 -+ vaddw.u8 q3, q15, d18 -+ bne 1b -+ -+ vqmovun.s16 d20, q2 -+ vqmovun.s16 d21, q0 -+ vqmovun.s16 d22, q3 -+ vqmovun.s16 d23, q1 -+ vst2.8 {d20, d21}, [r0 :128] -+ vst2.8 {d22, d23}, [ip :128] -+ pop {r4, pc} -+endfunc -+ -+@ add_residual16x16_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 -+ vdup.16 q15, r3 -+ add ip, r0, r2 -+ vld2.8 {q8, q9}, [r0 :256] -+ mov r3, #16-1 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vmovl.u8 q11, d18 -+ vmovl.u8 q12, d19 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d16 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d17 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs r3, #1 -+ vqmovun.s16 d20, q11 -+ vqmovun.s16 d22, q0 -+ vqmovun.s16 d21, q12 -+ vqmovun.s16 d23, q1 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vst2.8 {q10, q11}, [r0 :256], r2 -+ vmovl.u8 q11, d18 -+ pldw [ip] -+ vmovl.u8 q12, d19 -+ vqadd.s16 q0, q11 -+ vaddw.u8 q11, q15, d16 -+ vqadd.s16 q1, q12 -+ vaddw.u8 q12, q15, d17 -+ bne 1b -+ -+ vqmovun.s16 d20, q11 -+ vqmovun.s16 d22, q0 -+ vqmovun.s16 d21, q12 -+ vqmovun.s16 d23, q1 -+ vst2.8 {q10, q11}, [r0 :256] -+ bx lr -+endfunc -+ -+@ ============================================================================ -+@ U & V add -+ -+@ add_residual4x4_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 -+ add ip, r0, r2 -+ vld1.16 {q0, q1}, [r1]! @ all of U -+ lsl r2, #1 -+ vld1.8 {d16}, [r0 :64], r2 -+ rsb r3, r2, #0 -+ vld1.8 {d17}, [ip :64], r2 -+ vld1.16 {q2, q3}, [r1] @ all of V -+ vld1.8 {d18}, [r0 :64], r3 -+ vld1.8 {d19}, [ip :64], r3 -+ vmovl.u8 q10, d16 -+ vmovl.u8 q11, d17 -+ vmovl.u8 q12, d18 -+ vmovl.u8 q13, d19 -+ vzip.16 q0, q2 -+ vzip.16 q1, q3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 -+ vqmovun.s16 d0, q0 -+ vqmovun.s16 d1, q2 -+ vqmovun.s16 d2, q1 -+ vqmovun.s16 d3, q3 -+ vst1.8 {d0}, [r0 :64], r2 -+ vst1.8 {d1}, [ip :64], r2 -+ vst1.8 {d2}, [r0 :64] -+ vst1.8 {d3}, [ip :64] -+ bx lr -+endfunc -+ -+@ add_residual8x8_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 -+ vld2.8 {d16, d17}, [r0 :128] -+ add r3, r1, #(8*8*2) @ Offset to V -+ vld1.16 {q0}, [r1 :128]! -+ add ip, r0, r2 -+ vld1.16 {q1}, [r3 :128]! -+ vmovl.u8 q10, d16 -+ push {lr} -+ vmovl.u8 q8, d17 -+ mov lr, #8-1 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q1, q8 -+1: -+ vld2.8 {d16, d17}, [ip :128], r2 -+ subs lr, #1 -+ vld1.16 {q0}, [r1 :128]! -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q1 -+ vld1.16 {q1}, [r3 :128]! -+ vst2.8 {d20, d21}, [r0 :128], r2 -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q8, d17 -+ vqadd.s16 q10, q0 -+ vqadd.s16 q1, q8 -+ bne 1b -+ -+ vqmovun.s16 d20, q10 -+ vqmovun.s16 d21, q1 -+ vst2.8 {d20, d21}, [r0 :128] -+ pop {pc} -+endfunc -+ -+@ add_residual16x16_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 -+ vld2.8 {q8, q9}, [r0 :256] -+ add r3, r1, #(16*16*2) @ Offset to V -+ vld1.16 {q0, q1}, [r1 :256]! -+ add ip, r0, r2 -+ vld1.16 {q2, q3}, [r3 :256]! 
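-+        @ U residuals stream from r1, V from r3 (r1 + 16*16*2 as set above);
-+        @ vld2/vst2 keep U in the even bytes and V in the odd ones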
-+ vmovl.u8 q10, d16 -+ push {lr} -+ vmovl.u8 q8, d17 -+ mov lr, #16-1 -+ vmovl.u8 q11, d18 -+ vmovl.u8 q9, d19 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q8 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q9 -+1: -+ vld2.8 {q8, q9}, [ip :256], r2 -+ subs lr, #1 -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q3 -+ vld1.16 {q0, q1}, [r1 :256]! -+ vst2.8 {d20-d23}, [r0 :256], r2 -+ vld1.16 {q2, q3}, [r3 :256]! -+ vmovl.u8 q10, d16 -+ pldw [ip] -+ vmovl.u8 q8, d17 -+ vmovl.u8 q11, d18 -+ vmovl.u8 q9, d19 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q8 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q9 -+ bne 1b -+ -+ vqmovun.s16 d20, q0 -+ vqmovun.s16 d22, q2 -+ vqmovun.s16 d21, q1 -+ vqmovun.s16 d23, q3 -+ vst2.8 {d20-d23}, [r0 :256] -+ pop {pc} -+endfunc -+ -+@ 32x32 chroma never occurs so NIF -+ -+@ ============================================================================ -diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S -new file mode 100644 -index 0000000000..b56e0f9644 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S -@@ -0,0 +1,2245 @@ -+/* -+ * Copyright (c) 2014 - 2015 Seppo Tomperi -+ * 2017 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+.set EDGE_SRC_STRIDE, 160 -+ -+@ PIC jump tables are fractionally more expensive than absolute in our code -+.set jent_pic, CONFIG_PIC -+ -+ -+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 -+ vshr.u8 q12, q8, #3 -+ \I1 -+ vadd.i8 q8, \Q_K128 -+ \I2 -+ vshr.u8 q13, q9, #3 -+ \I3 -+ vadd.i8 q9, \Q_K128 -+ \I4 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ -+ vqadd.s8 q8, q12 -+ vshr.u8 q12, q10, #3 -+ vadd.i8 q10, \Q_K128 -+ vqadd.s8 q9, q13 -+ vshr.u8 q13, q11, #3 -+ vadd.i8 q11, \Q_K128 -+ -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vqadd.s8 q10, q12 -+ vsub.i8 q8, \Q_K128 -+ vqadd.s8 q11, q13 -+ vsub.i8 q9, \Q_K128 -+ vsub.i8 q10, \Q_K128 -+ vsub.i8 q11, \Q_K128 -+.endm -+ -+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 -+ \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vadd.i8 q12, q8, \Q_K128 -+ vshr.u8 q8, #3 -+ vtbl.8 d16, \XLAT0, d16 -+ vtbl.8 d17, \XLAT1, d17 -+ vqadd.s8 q12, q8 -+ bmi 2f -+1: \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vsub.i8 q13, q12, \Q_K128 -+ vadd.i8 q12, q8, \Q_K128 -+ vshr.u8 q8, #3 -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+ vtbl.8 d16, \XLAT0, d16 -+ vtbl.8 d17, \XLAT1, d17 -+ vqadd.s8 q12, q8 -+ bpl 1b -+2: vsub.i8 q13, q12, \Q_K128 -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+.endm -+ -+ -+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmax.s16 \Q2, 
\Q_MIN -+ vmax.s16 \Q3, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+ vmin.s16 \Q2, \Q_MAX -+ vmin.s16 \Q3, \Q_MAX -+.endm -+ -+@ Clobbers q12, q13 -+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 -+ vshrn.i16 d24, \Q0, #(\bit_depth - 5) -+ vshrn.i16 d25, \Q1, #(\bit_depth - 5) -+ vshrn.i16 d26, \Q2, #(\bit_depth - 5) -+ \I1 -+ vtbl.8 d24, \XLAT0, d24 -+ vshrn.i16 d27, \Q3, #(\bit_depth - 5) -+ vtbl.8 d25, \XLAT1, d25 -+ \I2 -+ vtbl.8 d26, \XLAT0, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vaddw.s8 \Q0, d24 -+ vaddw.s8 \Q1, d25 -+ vaddw.s8 \Q2, d26 -+ vaddw.s8 \Q3, d27 -+ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX -+.endm -+ -+@ Clobbers q10, q11, q12 -+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 -+ \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vshrn.i16 d24, \Q0, #\bit_depth - 5 -+ vshrn.i16 d25, \Q1, #\bit_depth - 5 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ vaddw.s8 q10, \Q0, d24 -+ vaddw.s8 q11, \Q1, d25 -+ bmi 2f -+1: \L1 -+ \L2 -+ \L3 -+ \L4 -+ \L5 -+ vmax.s16 q10, \Q_MIN -+ vmax.s16 q11, \Q_MIN -+ vshrn.i16 d24, \Q0, #\bit_depth - 5 -+ vshrn.i16 d25, \Q1, #\bit_depth - 5 -+ vmin.s16 q10, \Q_MAX -+ vmin.s16 q11, \Q_MAX -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ vaddw.s8 q10, \Q0, d24 -+ vaddw.s8 q11, \Q1, d25 -+ bpl 1b -+2: vmax.s16 q10, \Q_MIN -+ vmax.s16 q11, \Q_MIN -+ vmin.s16 q10, \Q_MAX -+ vmin.s16 q11, \Q_MAX -+ \S1 -+ \S2 -+ \S3 -+ \S4 -+.endm -+ -+ -+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) -+@ so we are quite safe stuffing it into a byte array -+@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma -+@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of -+@ precision -+ -+@ This, somewhat nasty, bit of code builds the {d0-d3} translation -+@ array via the stack -+@ Given that sao_left_class > 28 can cause wrap we can't just poke -+@ all 4 bytes in at once -+@ -+@ It also loads other common regs -+ -+@ Beware that the offset read here overrreads by 6 bytes so source must be sized appropriately -+function band_load_y -+ ldr ip, [sp, #16] @ &sao_offset_val[0] -+ ldr r4, [sp, #20] @ sao_left_class -+ vmov.i64 d4, #0 -+ vmov.i64 q0, #0 -+ pld [r1] -+ vld2.8 {q8}, [ip] -+ sub ip, sp, #8*5 -+ vmov.i64 q1, #0 -+ add r4, ip, r4 -+ vpush {d0-d4} @ Put zero array on stack -+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] -+ ldr ip, [ip, #8*5 + 28] @ height -+ vst1.32 {d16[0]}, [r4] -+ add r4, r1, r3 -+ vpop {d0-d4} @ Pop modified array -+ sub ip, ip, #1 -+ vorr d0, d0, d4 -+ bx lr -+endfunc -+ -+@ Beware that offset reads here overrread by 6 bytes so source must be sized appropriately -+function band_load_c -+ ldr ip, [sp, #16] @ &sao_offset_val1[0] -+ ldr r4, [sp, #20] @ sao_left_class1 -+ vmov.i64 d24, #0 -+ vmov.i64 q10, #0 -+ pld [r1] -+ vld2.8 {q8}, [ip] -+ sub ip, sp, #8*5 -+ vmov.i64 q11, #0 -+ add r4, ip, r4 -+ ldr ip, [sp, #24] @ &sao_offset_val2[0] -+ vpush {d20-d24} @ Put zero array on stack -+ vld2.8 {q9}, [ip] -+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] -+ ldr ip, [sp, #8*5 + 28] @ sao_left_class2 -+ vst1.32 {d16[0]}, [r4] -+ add ip, sp, ip -+ vshr.u64 d18, d18, #8 @ 1st interesting val is [1] -+ vldmia sp, {d0-d3} @ Load modified array -+ vldr d16, [sp, #8*4] -+ add r4, r1, r3 -+ vstmia sp, {d20-d24} @ Put zero array on stack (again) -+ vst1.32 {d18[0]}, [ip] -+ vorr d0, d0, d16 -+ vldmia sp, {d4-d7} @ Load modified array -+ vldr d18, [sp, #8*4] -+ 
ldr ip, [sp, #8*5 + 36] @ height -+ add sp, sp, #8*5 -+ vorr d4, d4, d18 -+ sub ip, ip, #1 -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_64_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_64_neon_8, export=1 -+ push {r4-r6, lr} -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vldmia r1, {q8-q11} -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \ -+ "pld [r4]", \ -+ "subs ip, #1", \ -+ "it ne; addne r4, r3", \ -+ "add r1, r3" -+ vstmia r0, {q8-q11} -+ add r0, r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_32_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_32_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vld1.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #2 -+ vld1.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 -+ -+ vst1.8 { q8, q9 }, [r0, :128], r2 -+ vst1.8 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_16_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_16_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_y -+ -+1: vld1.8 { q8}, [r1, :128], r3 -+ subs ip, #4 -+ vld1.8 { q9}, [r6, :128], r3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vld1.8 {q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 -+ -+ vst1.8 { q8}, [r0, :128], r2 -+ vst1.8 { q9}, [r5, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_8_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_band_8_neon_8, export=1 -+ ldr ip, [sp, #8] @ width -+ push {r4-r6, lr} -+ vmov.u8 q15, #128 -+ cmp ip, #8 -+ bl band_load_y -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ blt 4f -+ -+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ -+ "vld1.8 {d16}, [r1, :64], r3", \ -+ "subs ip, #2", \ -+ "vld1.8 {d17}, [r6, :64], r3", \ -+ "", \ -+ "", \ -+ "vst1.8 {d26}, [r0, :64], r2", \ -+ "vst1.8 {d27}, [r5, :64], r2" -+ pop {r4-r6, pc} -+4: -+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ -+ "vld1.32 {d16[0]}, [r1, :32], r3", \ -+ "subs ip, #4", \ -+ "vld1.32 {d16[1]}, [r6, :32], r3", \ -+ "vld1.32 {d17[0]}, [r1, :32], r3", \ -+ "vld1.32 {d17[1]}, [r6, :32], r3", \ -+ "vst1.32 {d26[0]}, [r0, :32], r2", \ -+ "vst1.32 {d26[1]}, [r5, :32], r2", \ -+ "vst1.32 {d27[0]}, [r0, :32], r2", \ -+ "vst1.32 {d27[1]}, [r5, :32], r2" -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_32_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ 
const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ vmov.u8 q15, #128 -+ bl band_load_c -+ -+1: vld2.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #1 -+ vld2.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \ -+ "pld [r4]", \ -+ "it ne; addne r4, r3" -+ -+ vst2.8 { q8, q9 }, [r0, :128], r2 -+ vst2.8 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_16_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.u8 q15, #128 -+ bl band_load_c -+ -+1: vld2.8 { q8, q9 }, [r1, :128], r3 -+ subs ip, #2 -+ vld2.8 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15 -+ -+ vst2.8 { q8, q9 }, [r0, :128], r2 -+ vst2.8 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+endfunc -+ -+@ ff_hevc_rpi_sao_band_c_8_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 -+ ldr ip, [sp, #16] @ width -+ push {r4-r6, lr} -+ vmov.u8 q15, #128 -+ cmp ip, #8 -+ bl band_load_c -+ blt 4f -+ -+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ -+ "vld2.8 {d16-d17}, [r1, :128], r3", \ -+ "subs ip, #1", \ -+ "", \ -+ "", \ -+ "", \ -+ "vst2.8 {d26-d27}, [r0, :128], r2" -+ pop {r4-r6, pc} -+4: -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ -+ "vld1.8 {d16}, [r1, :64], r3", \ -+ "subs ip, #2", \ -+ "vld1.8 {d17}, [r6, :64], r3", \ -+ "vuzp.8 d16, d17", \ -+ "", \ -+ "vzip.8 d26, d27", \ -+ "vst1.8 {d26}, [r0, :64], r2", \ -+ "vst1.8 {d27}, [r5, :64], r2" -+ pop {r4-r6, pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_64_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_64_16 bit_depth -+ push {r4-r6, lr} -+ vmov.i64 q2, #0 -+ vmov.i16 q3, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ vpush {q4-q7} -+ -+1: vldm r1, {q4-q11} -+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ -+ "subs ip, #1", \ -+ "add r1, r3" -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth -+ vstm r0, {q4-q11} -+ add r0, r2 -+ bpl 1b -+ -+ vpop {q4-q7} -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_64_neon_10, export=1 -+ band_64_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_32_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_32_16 bit_depth -+ push {r4-r6, lr} -+ vmov.i64 q2, #0 -+ vmov.i16 q3, 
#(1 << \bit_depth) - 1 -+ bl band_load_y -+ -+1: vldm r1, {q8-q11} -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ -+ "subs ip, #1", \ -+ "add r1, r3" -+ vstm r0, {q8-q11} -+ add r0, r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_32_neon_10, export=1 -+ band_32_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_16_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_16_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ -+1: vld1.16 { q8, q9 }, [r1, :128], r3 -+ subs r12, #2 -+ vld1.16 {q10, q11}, [r6, :128], r3 -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth -+ vst1.16 { q8, q9 }, [r0, :128], r2 -+ vst1.16 {q10, q11}, [r5, :128], r2 -+ bpl 1b -+ -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_16_neon_10, export=1 -+ band_16_16 10 -+endfunc -+ -+@ ff_hevc_rpi_sao_band_8_neon_10 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+.macro band_8_16 bit_depth -+ ldr ip, [sp, #8] @ width -+ push {r4-r6, lr} -+ vmov.i64 q14, #0 -+ cmp ip, #8 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_y -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ blt 4f -+ -+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ -+ "vld1.16 {q8}, [r1, :128], r3", \ -+ "subs ip, #2", \ -+ "vld1.16 {q9}, [r6, :128], r3", \ -+ "", \ -+ "", \ -+ "vst1.16 {q10}, [r0, :128], r2", \ -+ "vst1.16 {q11}, [r5, :128], r2" -+ pop {r4-r6, pc} -+4: -+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ -+ "vld1.16 {d16}, [r1, :64], r3", \ -+ "subs ip, #4", \ -+ "vld1.16 {d17}, [r6, :64], r3", \ -+ "vld1.16 {d18}, [r1, :64], r3", \ -+ "vld1.16 {d19}, [r6, :64], r3", \ -+ "vst1.16 {d20}, [r0, :64], r2", \ -+ "vst1.16 {d21}, [r5, :64], r2", \ -+ "vst1.16 {d22}, [r0, :64], r2", \ -+ "vst1.16 {d23}, [r5, :64], r2" -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_8_neon_10, export=1 -+ band_8_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_32_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_32_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ sub r2, #64 -+ sub r3, #64 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ mov lr, #64 -+ vpush {q4-q7} -+ -+1: vld2.16 { q4, q5 }, [r1, :128], lr -+ subs ip, #1 -+ vld2.16 { q6, q7 }, [r6, :128], lr -+ vld2.16 { q8, q9 }, [r1, :128], r3 -+ vld2.16 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "pld [r4]", \ -+ "it ne; addne r4, r3" -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth -+ -+ vst2.16 { q4, q5 }, [r0, :128], lr -+ vst2.16 { q6, q7 }, [r5, :128], lr -+ vst2.16 { q8, q9 }, [r0, :128], r2 -+ vst2.16 {q10, q11}, [r5, :128], r2 -+ -+ bpl 1b -+ -+ 
vpop {q4-q7} -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_32_neon_10, export=1 -+ band_c_32_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_16_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_16_16 bit_depth -+ push {r4-r6, lr} -+ add r5, r0, #32 -+ add r6, r1, #32 -+ vmov.i64 q14, #0 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ -+1: vld2.16 { q8, q9 }, [r1, :128], r3 -+ subs ip, #1 -+ vld2.16 {q10, q11}, [r6, :128], r3 -+ -+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth -+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth -+ -+ vst2.16 { q8, q9 }, [r0, :128], r2 -+ vst2.16 {q10, q11}, [r5, :128], r2 -+ -+ bpl 1b -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_16_neon_10, export=1 -+ band_c_16_16 10 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_band_c_8_neon_10( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] -+ -+.macro band_c_8_16 bit_depth -+ ldr ip, [sp, #16] @ width -+ push {r4-r6, lr} -+ vmov.i64 q14, #0 -+ cmp ip, #8 -+ vmov.i16 q15, #(1 << \bit_depth) - 1 -+ bl band_load_c -+ blt 4f -+ -+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "vld2.16 {q8,q9}, [r1, :128], r3", \ -+ "subs ip, #1", \ -+ "", \ -+ "", \ -+ "", \ -+ "vst2.16 {q10,q11}, [r0, :128], r2" -+ pop {r4-r6, pc} -+4: -+ add r5, r0, r2 -+ add r6, r1, r3 -+ lsl r2, #1 -+ lsl r3, #1 -+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ -+ "vld2.16 {d16,d18}, [r1, :128], r3", \ -+ "subs ip, #2", \ -+ "vld2.16 {d17,d19}, [r6, :128], r3", \ -+ "", \ -+ "", \ -+ "vst2.16 {d20,d22}, [r0, :128], r2", \ -+ "vst2.16 {d21,d23}, [r5, :128], r2" -+ pop {r4-r6, pc} -+.endm -+ -+function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 -+ band_c_8_16 10 -+endfunc -+ -+ -+@ ============================================================================= -+@ SAO EDGE -+ -+@ r0 destination address -+@ r2 stride to post-increment r0 with -+@ [r5] translate values -+@ -+@ a <- c <- b -+@ a in q0 - q3 -+@ c in q4 - q7 -+@ b in q8 - q11 -+@ -+@ q12-15 used as temp -+@ -+@ Can be used for both Y & C as we unzip/zip the deltas and -+@ transform "u/v" separately via d26/d27. 
For Y d26=d27 -+ -+function edge_64b_body_8 -+ -+ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q1 -+ vcgt.u8 q14, q6, q2 -+ vcgt.u8 q15, q7, q3 -+ -+ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 -+ vcgt.u8 q1, q5 -+ vcgt.u8 q2, q6 -+ vcgt.u8 q3, q7 -+ -+ vsub.s8 q0, q12 @ a = sign(c-a) -+ vsub.s8 q1, q13 -+ vsub.s8 q2, q14 -+ vsub.s8 q3, q15 -+ -+ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q9 -+ vcgt.u8 q14, q6, q10 -+ vcgt.u8 q15, q7, q11 -+ -+ vsub.s8 q0, q12 -+ vsub.s8 q1, q13 -+ vsub.s8 q2, q14 -+ vsub.s8 q3, q15 -+ -+ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 -+ vcgt.u8 q13, q9, q5 -+ vcgt.u8 q14, q10, q6 -+ vcgt.u8 q15, q11, q7 -+ -+ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) -+ vadd.s8 q1, q13 -+ vmov.u8 q12, #2 -+ vadd.s8 q2, q14 -+ vadd.s8 q3, q15 -+ -+ vadd.s8 q0, q12 -+ vadd.s8 q1, q12 -+ -+ vld1.8 {d26, d27}, [r5] -+ -+ vadd.s8 q2, q12 -+ vuzp.8 q0, q1 -+ vmov.u8 q15, #128 -+ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) -+ -+ vtbl.8 d0, {d26}, d0 -+ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add -+ -+ vtbl.8 d1, {d26}, d1 -+ vadd.s8 q14, q5, q15 -+ -+ vtbl.8 d2, {d27}, d2 -+ vuzp.8 q2, q3 -+ -+ vtbl.8 d3, {d27}, d3 -+ -+ vtbl.8 d4, {d26}, d4 -+ vzip.8 q0, q1 -+ -+ vtbl.8 d5, {d26}, d5 -+ vqadd.s8 q0, q12 -+ vqadd.s8 q1, q14 -+ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add -+ -+ vtbl.8 d6, {d27}, d6 -+ vtbl.8 d7, {d27}, d7 -+ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add -+ vzip.8 q2, q3 -+ -+ vsub.s8 q0, q15 -+ vqadd.s8 q2, q12 -+ vqadd.s8 q3, q14 -+ vsub.s8 q1, q15 -+ vsub.s8 q2, q15 -+ vsub.s8 q3, q15 -+ -+ bx lr -+endfunc -+ -+@ r0 destination address -+@ r2 stride to post-increment r0 with -+@ r4 upper clip value -+@ [r5] translate values -+@ -+@ a <- c <- b -+@ a in q0 - q3 -+@ c in q4 - q7 -+@ b in q8 - q11 -+@ -+@ q12-15 used as temp -+@ -+@ Can be used for both Y & C as we unzip/zip the deltas and -+@ transform "u/v" separately via d26/d27. 
For Y d26=d27 -+ -+function edge_64b_body_16 -+ -+ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 -+ vcgt.u16 q13, q5, q1 -+ vcgt.u16 q14, q6, q2 -+ vcgt.u16 q15, q7, q3 -+ -+ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 -+ vcgt.u16 q1, q1, q5 -+ vcgt.u16 q2, q2, q6 -+ vcgt.u16 q3, q3, q7 -+ -+ vsub.s16 q0, q0, q12 // a = sign(c-a) -+ vsub.s16 q1, q1, q13 -+ vsub.s16 q2, q2, q14 -+ vsub.s16 q3, q3, q15 -+ -+ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 -+ vcgt.u16 q13, q5, q9 -+ vcgt.u16 q14, q6, q10 -+ vcgt.u16 q15, q7, q11 -+ -+ vsub.s16 q0, q0, q12 -+ vsub.s16 q1, q1, q13 -+ vsub.s16 q2, q2, q14 -+ vsub.s16 q3, q3, q15 -+ -+ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 -+ vcgt.u16 q13, q9, q5 -+ vcgt.u16 q14, q10, q6 -+ vcgt.u16 q15, q11, q7 -+ -+ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) -+ vadd.s16 q1, q1, q13 -+ vadd.s16 q2, q2, q14 -+ vadd.s16 q3, q3, q15 -+ -+ vmov.u8 q12, #2 -+ -+ vmovn.s16 d0, q0 -+ vmovn.s16 d1, q1 -+ vmovn.s16 d2, q2 -+ vmovn.s16 d3, q3 -+ -+ vldr d26, [r5] -+ -+ vuzp.8 q0, q1 -+ -+ vldr d27, [r5, #8] -+ -+ vadd.s8 q0, q0, q12 -+ vadd.s8 q1, q1, q12 -+ -+ vmov.i64 q12, #0 -+ -+ vtbl.8 d0, {d26}, d0 -+ vtbl.8 d1, {d26}, d1 -+ vtbl.8 d2, {d27}, d2 -+ vtbl.8 d3, {d27}, d3 -+ -+ vdup.i16 q13, r4 -+ -+ vzip.8 q0, q1 -+ -+ @ Avoid overwrite whilst widening -+ vaddw.s8 q2, q6, d2 -+ vaddw.s8 q3, q7, d3 -+ vaddw.s8 q1, q5, d1 -+ vaddw.s8 q0, q4, d0 -+ -+ @ now clip -+ clip16_4 q2, q3, q1, q0, q12, q13 -+ -+ bx lr -+endfunc -+ -+ -+@ a <- c <- b -+@ a in q0 -+@ c in q1 -+@ b in q2 -+@ Temp q3, q9, q10 -+@ -+@ d16, d17 (q8) xlat U, V -+@ q14.u8 #2 -+@ q15.u8 #128 -+ -+function edge_16b_body_8 -+ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0 -+ vadd.u8 q9, q14, q9 -+ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0 -+ vsub.u8 q9, q9, q0 -+ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0 -+ vadd.u8 q9, q9, q0 -+ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0 -+ vsub.u8 q0, q9, q0 -+ -+ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add -+ -+ vuzp.8 d0, d1 -+ -+ vtbl.8 d0, {d16}, d0 -+ vtbl.8 d1, {d17}, d1 -+ -+ vzip.8 d0, d1 -+ vqadd.s8 q0, q3 -+ vsub.s8 q0, q15 -+ -+ bx lr -+endfunc -+ -+@ a <- c <- b -+@ a in q0 -+@ c in q1 -+@ b in q2 -+@ Temp q3 -+@ -+@ q12, #0 -+@ d16, d17 xlat U, V -+@ q14.u8 #2 -+@ q15.u16 max -+function edge_16b_body_16 -+ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0 -+ vadd.u16 q9, q14, q9 -+ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0 -+ vsub.u16 q9, q9, q0 -+ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0 -+ vadd.u16 q9, q9, q0 -+ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0 -+ vsub.u16 q0, q9, q0 -+ -+ vmovn.s16 d0, q0 -+ @ d1 will have random contents that we transform but -+ @ that doesn't matter as we then discard them -+ vuzp.8 d0, d1 -+ -+ vtbl.8 d0, {d16}, d0 -+ vtbl.8 d1, {d17}, d1 -+ -+ vzip.8 d0, d1 -+ -+ vaddw.s8 q0, q1, d0 -+ -+ @ now clip -+ vmax.s16 q0, q12 -+ vmin.s16 q0, q15 -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_edge_[c_]xx_neon( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only -+@ int eo, [sp, #sp_base + 0] -+@ int width, [sp, #sp_base + 4] -+@ int height) [sp, #sp_base + 8] -+ -+@ Jumps via jump_tab with -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ EDGE_SRC_STRIDE [r3] -+@ (1 << \bit_depth) - 1 [r4] -+@ * xlat_table [r5] // setup_64b only -+@ int height [r12] -+@ -+@ 0 [q12] // > 8 bit -+@ 
2 [q14] -+@ 128 [q15] // = 8 bit -+@ r4 [q15] // > 8 bit -+ -+.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0 -+ -+@ Build translate registers -+@ As translate values can only be 0-4 we don't care about junk in the rest -+@ of the register -+.if \is_chroma -+ ldr ip, [sp, #0] -+ push {r4-r6, lr} @ 16 bytes -+ vld1.8 {d16[2]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[2]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[0]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[0]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[1]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[1]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[3]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[3]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[4]}, [r3] -+ vld1.8 {d17[4]}, [ip] -+ movw r3, EDGE_SRC_STRIDE -+.set sp_base, 20 -+.else -+ add ip, r3, #4 -+ vld1.8 {d16[1]}, [r3] -+ add r3, r3, #2 -+ vld1.8 {d17[0]}, [ip] -+ add ip, ip, #2 -+ vld1.8 {d16[0]}, [r3] -+ add r3, r3, #6 -+ vld1.8 {d17[1]}, [ip] -+ vld1.8 {d16[2]}, [r3] -+ movw r3, EDGE_SRC_STRIDE -+ push {r4-r6, lr} @ 16 bytes -+ vzip.8 d16, d17 -+ vmov d17, d16 -+.set sp_base, 16 -+.endif -+ -+@ If setup_64b we need the xlat table on the stack -+.if \setup_64b -+ sub r5, sp, #16 -+.endif -+ -+@ Get jump address -+@ We have a special case for width 4 as the calling code doesn't detect it -+@ If we may have w4 then we add a 2nd jump table after the 1st -+.if \check_w4 -+ ldr r12, [sp, #sp_base + 4] @ width -+ adr r6, \jump_tab -+ ldr lr, [sp, #sp_base + 0] @ e0 -+ cmp r12, #8 -+ it lt -+ addlt r6, #16 -+.else -+ ldr lr, [sp, #sp_base + 0] @ e0 -+ adr r6, \jump_tab -+.endif -+ -+ ldr r12, [sp, #sp_base + 8] @ height -+ -+.if \bit_depth > 8 -+ movw r4, (1 << \bit_depth) - 1 -+.endif -+.if \setup_16b -+.if \bit_depth > 8 -+ vmov.i64 q12, #0 -+ vdup.16 q15, r4 -+ vmov.u16 q14, #2 -+.else -+ vmov.u8 q15, #128 -+ vmov.u8 q14, #2 -+.endif -+.endif -+ -+@ If setup_64b we need q4-q7 saved. 
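-+@ q4-q7 overlap d8-d15, which the AAPCS requires callees to preserve; pushing
-+@ q8 as well lands the xlat values at [r5], the 16 bytes just below the old sp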
-+.if \setup_64b -+ vpush {q4-q8} @ 80 bytes, q8 pushed first -+.set sp_base, sp_base + 80 -+.endif -+ -+ ldr r6, [r6, lr, lsl #2] -+ -+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes -+.if \do2 -+ push {r0, r1, r6, r12} -+.if jent_pic -+ bl 98f -+.else -+ blx r6 -+.endif -+ pop {r0, r1, r6, r12} -+ -+ add r0, #64 -+ add r1, #64 -+.endif -+ -+.if jent_pic -+ bl 98f -+.else -+ blx r6 -+.endif -+ -+@ Tidy up & return -+.if \setup_64b -+ vpop {q4-q8} @ spurious but harmless load of q8 -+.endif -+ pop {r4-r6, pc} -+ -+.if jent_pic && !\xjump -+@ Magic label - used as 98b in jent macro -+98: -+ add pc, r6 -+.endif -+.endm -+ -+ -+.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab -+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 -+.endm -+ -+.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0 -+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump -+.endm -+ -+ -+.macro edge_64b_e0, body_fn, pb -+ sub r1, #8 -+ mov r6, lr -+1: vldm r1, {d7-d16} -+ // load a -+ vext.8 q0, q3, q4, #(16 - \pb) -+ add r1, r3 -+ vext.8 q1, q4, q5, #(16 - \pb) -+ subs r12, #1 -+ vext.8 q2, q5, q6, #(16 - \pb) -+ vext.8 q3, q6, q7, #(16 - \pb) -+ pld [r1] -+ // load b -+ vext.8 q11, q7, q8, #\pb @ Avoid overwrite -+ pld [r1, #64] -+ vext.8 q8, q4, q5, #\pb -+ vext.8 q9, q5, q6, #\pb -+ vext.8 q10, q6, q7, #\pb -+ bl \body_fn -+ vstm r0, {q0-q3} -+ add r0, r0, r2 -+ bgt 1b -+ bx r6 -+.endm -+ -+.macro edge_32bx2_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ sub r1, #8 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: vldmia r1, {d7-d12} -+ // load a -+ vext.8 q0, q3, q4, #16 - \pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q1, q4, q5, #16 - \pb -+ subs r12, #2 -+ // load b -+ vext.8 q8, q4, q5, #\pb -+ vext.8 q9, q5, q6, #\pb -+ vldr d25, [r6, #-8] -+ vldmia r6, {d12-d15} -+ vldr d26, [r6, #32] -+ // load a -+ vext.8 q2, q12, q6, #16 - \pb -+ add r6, r6, r3, lsl #1 -+ vext.8 q3, q6, q7, #16 - \pb -+ // load b -+ vext.8 q10, q6, q7, #\pb -+ vext.8 q11, q7, q13, #\pb -+ bl \body_fn -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ bgt 1b -+ pop {r7,pc} -+.endm -+ -+.macro edge_16b_e0, body_fn, pb -+ sub r1, #8 -+ mov r6, lr -+1: vldmia r1, {d1-d4} -+ add r1, r3 -+ subs r12, #1 -+ vext.8 q0, q0, q1, #16 - \pb -+ vext.8 q2, q1, q2, #\pb -+ -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ bgt 1b -+ bx r6 -+.endm -+ -+.macro edge_8bx2_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ sub r1, #8 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: vldmia r1, {d1-d2} -+ vldmia r6, {d3-d4} -+ vldr d6, [r1, #16] -+ subs r12, #2 -+ vldr d7, [r6, #-8] -+ add r1, r1, r3, lsl #1 -+ vext.8 d0, d1, d2, #8 - \pb -+ add r6, r6, r3, lsl #1 -+ vext.8 d5, d3, d4, #\pb -+ vext.8 d4, d2, d6, #\pb -+ vext.8 d1, d7, d3, #8 - \pb -+ -+ bl \body_fn -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ bgt 1b -+ pop {r7,pc} -+.endm -+ -+.macro edge_4bx4_e0, body_fn, pb -+ add r6, r1, r3 -+ push {r7,lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ -+ tst r1, #4 -+ bne 2f -+1: // r1 (and assumed r6) are 64-bit aligned -+ vldr d2, [r1] -+ vldr d0, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ vldr d20, [r6] -+ subs r12, #4 -+ vldr d18, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d3, [r1] -+ vshr.u64 d4, d2, #\pb * 8 -+ vldr d1, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ vldr d21, [r6] -+ vext.8 d0, d0, d2, #8 - \pb -+ vldr d19, [r6,#-8] -+ add r6, r6, r3, lsl #1 -+ vshr.u64 d22, d20, #\pb * 8 -+ vext.8 d18, d18, d20, #8 - \pb -+ vshr.u64 d5, d3, #\pb * 8 
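-+        @ vext forms the left neighbours, vshr.u64 the right; the vsli.64
-+        @ below packs the two 4-pixel rows of each pair into one d register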
-+ vext.8 d1, d1, d3, #8 - \pb -+ vshr.u64 d23, d21, #\pb * 8 -+ vext.8 d19, d19, d21, #8 - \pb -+ vsli.64 q1, q10, #32 -+ vsli.64 q2, q11, #32 -+ vsli.64 q0, q9, #32 -+ -+ bl \body_fn -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 1b -+ pop {r7,pc} -+ -+2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned -+ vldr d20, [r1, #-4] -+ vldr d22, [r1, #4] -+ add r1, r1, r3, lsl #1 -+ vldr d2, [r6, #-4] -+ subs r12, #4 -+ vldr d4, [r6, #4] -+ add r6, r6, r3, lsl #1 -+ vldr d21, [r1, #-4] -+ vshl.i64 d18, d20, #\pb * 8 -+ vldr d23, [r1, #4] -+ add r1, r1, r3, lsl #1 -+ vldr d3, [r6, #-4] -+ vext.8 d22, d20, d22, #\pb -+ vldr d5, [r6, #4] -+ add r6, r6, r3, lsl #1 -+ vshl.i64 d0, d2, #\pb * 8 -+ vext.8 d4, d2, d4, #\pb -+ vshl.i64 d19, d21, #\pb * 8 -+ vext.8 d23, d21, d23, #\pb -+ vshl.i64 d1, d3, #\pb * 8 -+ vext.8 d5, d3, d5, #\pb -+ vsri.64 q1, q10, #32 -+ vsri.64 q0, q9, #32 -+ vsri.64 q2, q11, #32 -+ -+ bl \body_fn -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 2b -+ pop {r7,pc} -+.endm -+ -+ -+.macro edge_64b_e1, body_fn -+ sub r1, r3 -+ push {lr} -+ add r6, r1, #32 -+ // load a -+ vld1.8 {q0-q1}, [r1, :256], r3 -+ vld1.8 {q2-q3}, [r6, :256], r3 -+ // load c -+ vld1.8 {q4-q5}, [r1, :256], r3 -+ vld1.8 {q6-q7}, [r6, :256], r3 -+1: // load b -+ vld1.8 {q8-q9}, [r1, :256], r3 -+ subs r12, #1 -+ vld1.8 {q10-q11}, [r6, :256], r3 -+ bl \body_fn -+ vstm r0, {q0-q3} -+ // copy c to a -+ vmov.64 q0, q4 -+ pld [r1, r3] -+ vmov.64 q1, q5 -+ it le -+ pople {lr} -+ vmov.64 q2, q6 -+ it le -+ bxle lr -+ vmov.64 q3, q7 -+ add r0, r0, r2 -+ // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ b 1b -+.endm -+ -+.macro edge_32bx2_e1, body_fn -+ sub r6, r1, r3 -+ vld1.8 {q2-q3}, [r1, :256], r3 -+ vld1.8 {q0-q1}, [r6, :256] -+ mov r6, lr -+ -+1: @ Given the data duplication here we could obviously do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ vld1.8 {q8-q9}, [r1, :256], r3 -+ subs r12, #2 -+ vmov q4, q2 -+ vmov q5, q3 -+ vld1.8 {q10-q11}, [r1, :256], r3 -+ vmov q6, q8 -+ vmov q7, q9 -+ -+ bl \body_fn -+ -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ // copy b to a -+ vmov q0, q8 -+ vmov q1, q9 -+ vst1.8 {q2-q3}, [r0, :256], r2 -+ vmov q2, q10 -+ it le -+ bxle r6 -+ vmov q3, q11 -+ b 1b -+.endm -+ -+.macro edge_16b_e1, body_fn -+ sub r6, r1, r3 -+ // load c -+ vld1.8 {q1}, [r1, :128], r3 -+ // load a -+ vld1.8 {q0}, [r6, :128] -+ mov r6, lr -+1: // load b -+ vld1.8 {q2}, [r1, :128], r3 -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ subs r12, #1 -+ // copy c to a -+ vmov.64 q0, q1 -+ it le -+ bxle r6 -+ // copy b to c -+ vmov.64 q1, q2 -+ b 1b -+.endm -+ -+.macro edge_8bx2_e1, body_fn -+ sub r6, r1, r3 -+ lsl r3, #1 -+ push {r7, lr} -+ vld1.8 {d1}, [r1, :64], r3 -+ vld1.8 {d0}, [r6, :64], r3 -+ add r7, r0, r2 -+ lsl r2, #1 -+1: @ Given the data duplication here we could obviously do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ vld1.8 {d4}, [r6, :64], r3 -+ vmov d2, d1 -+ vld1.8 {d5}, [r1, :64], r3 -+ subs r12, #2 -+ vmov d3, d4 -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ -+ // copy b to a -+ vmov q0, q2 -+ bgt 1b -+ pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e1, body_fn -+ sub r6, r1, r3 -+ lsl r3, #1 -+ push {r7, lr} -+ vld1.32 {d0[1]}, [r1, :32], r3 -+ add r7, r0, 
r2 -+ vld1.32 {d0[0]}, [r6, :32], r3 -+ lsl r2, #1 -+ vld1.32 {d4[1]}, [r1, :32], r3 -+ vld1.32 {d4[0]}, [r6, :32], r3 -+ vld1.32 {d5[1]}, [r1, :32], r3 -+ vld1.32 {d5[0]}, [r6, :32], r3 -+ vmov d1, d4 -+ vext.32 d2, d0, d4, #1 -+ subs r12, #4 -+ vmov d22, d5 -+ vext.32 d3, d4, d5, #1 -+ b 2f -+ -+1: vst1.32 {d0[0]}, [r0, :32], r2 -+ vext.32 d2, d22, d4, #1 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vmov d0, d22 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vext.32 d3, d4, d5, #1 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ vmov d1, d4 -+ vmov d22, d5 -+2: @ Given the data duplication here we could probably do better than -+ @ using the generic body_fn but it almost certainly isn't worth it -+ bl \body_fn -+ ble 3f -+ vld1.32 {d4[0]}, [r6, :32], r3 -+ subs r12, #4 -+ vld1.32 {d4[1]}, [r1, :32], r3 -+ vld1.32 {d5[0]}, [r6, :32], r3 -+ vld1.32 {d5[1]}, [r1, :32], r3 -+ b 1b -+ -+3: vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32] -+ vst1.32 {d1[1]}, [r7, :32] -+ pop {r7, pc} -+.endm -+ -+.macro edge_64b_e2, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ // load c and a -+ vld1.8 {q4-q5}, [r1, :128] -+ vldr d25, [r6, #-8] -+ vldmia r6, {d16-d23} -+ vext.8 q0, q12, q8, #16 - \pb -+ add r6, r1, #32 -+ vext.8 q1, q8, q9, #16 - \pb -+ add r1, r1, r3 -+ vext.8 q2, q9, q10, #16 - \pb -+ vld1.8 {q6-q7}, [r6, :128] -+ sub r6, r1, r3 -+ vext.8 q3, q10, q11, #16 - \pb -+ -+1: // load b -+ vldmia r1, {d16-d24} -+ vext.8 q8, q8, q9, #\pb -+ pld [r1, r3] -+ vext.8 q9, q9, q10, #\pb -+ subs r12, #1 -+ vext.8 q10, q10, q11, #\pb -+ vext.8 q11, q11, q12, #\pb -+ bl \body_fn -+ // next a is mostly available in c -+ vldr d25, [r6, #-8] -+ vstmia r0, {q0-q3} -+ vext.8 q3, q6, q7, #16 - \pb -+ it le -+ pople {lr} -+ vext.8 q2, q5, q6, #16 - \pb -+ it le -+ bxle lr -+ vext.8 q1, q4, q5, #16 - \pb -+ add r6, r6, r3 -+ vext.8 q0, q12, q4, #16 - \pb -+ add r0, r0, r2 -+ // next c is mostly available in b -+ vldr d8, [r1] -+ vext.8 d9, d16, d17, #8 - \pb -+ vext.8 q5, q8, q9, #16 - \pb -+ add r1, r1, r3 -+ vext.8 q6, q9, q10, #16 - \pb -+ pld [r6, #-8] -+ vext.8 q7, q10, q11, #16 - \pb -+ b 1b -+.endm -+ -+.macro edge_32bx2_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ // load a and first 32b of c -+ vld1.8 {q4-q5}, [r1, :256] -+ vldr d25, [r6, #-8] -+ vld1.8 {q13-q14}, [r6, :256] -+ vldr d31, [r1, #-8] -+ add r6, r6, r3, lsl #1 -+ vext.8 q0, q12, q13, #16 - \pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q1, q13, q14, #16 - \pb -+ vext.8 q2, q15, q4, #16 - \pb -+ vext.8 q3, q4, q5, #16 - \pb -+1: -+ // load second 32b of c and second 32b of b -+ vldmia r6, {d12-d16} -+ vldmia r1, {d20-d24} -+ // first 32b of b is mostly available in second 32b of c -+ vext.8 q9, q7, q8, #\pb -+ subs r12, #2 -+ vext.8 q8, q6, q7, #\pb -+ vext.8 q10, q10, q11, #\pb -+ vext.8 q11, q11, q12, #\pb -+ -+ bl \body_fn -+ -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ ble 2f -+ -+ vldr d25, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d8, [r1] -+ vext.8 d9, d20, d21, #8 - \pb -+ vldr d31, [r1, #-8] -+ add r1, r1, r3, lsl #1 -+ // first 32b of a is mostly available in second 32b of c -+ vext.8 q1, q6, q7, #16 - \pb -+ vext.8 q0, q12, q6, #16 - \pb -+ // first 32b of c is mostly available in second 32b of b -+ vext.8 q5, q10, q11, #16 - \pb -+ // second 32b of a is mostly available in first 32b of c -+ vext.8 q2, q15, q4, #16 - \pb -+ vext.8 q3, q4, q5, #16 - \pb -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_16b_e2, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 
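-+        @ e2 is the 135 degree diagonal: a is the pixel up and left of c,
-+        @ b the pixel down and right (\pb steps for interleaved chroma);
-+        @ the body fn then forms dst = clip(c + tab[2 + sgn(c-a) + sgn(c-b)])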
-+ vld1.8 {q1}, [r1, :128], r3 -+ vldr d19, [r6, #-8] -+ vld1.8 {q10}, [r6, :128], r3 -+ -+1: vldmia r1, {d4-d6} -+ vext.8 q0, q9, q10, #16 - \pb -+ subs r12, #1 -+ vext.8 q2, q2, q3, #\pb -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ ble 2f -+ vmov q10, q1 -+ vldr d2, [r1] -+ add r1, r1, r3 -+ vldr d19, [r6, #-8] -+ add r6, r6, r3 -+ vext.8 d3, d4, d5, #8 - \pb -+ b 1b -+ -+2: pop {pc} -+.endm -+ -+.macro edge_8bx2_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ vldr d18, [r6, #-8] -+ vldr d19, [r6] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #-8] -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldmia r6, {d3-d4} -+ vld1.8 {d21-d22}, [r1, :128] -+ -+1: vext.8 d0, d18, d19, #8 - \pb -+ vext.8 d4, d3, d4, #\pb -+ vext.8 d1, d20, d2, #8 - \pb -+ subs r12, #2 -+ vext.8 d5, d21, d22, #\pb -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ ble 2f -+ -+ vldr d18, [r6, #-8] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #-8] -+ vmov d19, d3 -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldmia r6, {d3-d4} -+ vld1.8 {d21-d22}, [r1, :128] -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e2, body_fn, pb -+ sub r6, r1, r3 -+ push {r7-r9, lr} -+ add r8, r1, r3 -+ sub r6, r6, #\pb -+ add r8, r8, #\pb -+ add r7, r0, r2 -+ lsl r2, #1 -+ -+1: vld1.32 {d0[0]}, [r6], r3 -+ subs r12, #4 -+ vld1.32 {d2[0]}, [r1], r3 -+ vld1.32 {d4[0]}, [r8], r3 -+ vld1.32 {d0[1]}, [r6], r3 -+ vld1.32 {d2[1]}, [r1], r3 -+ vld1.32 {d4[1]}, [r8], r3 -+ vld1.32 {d1[0]}, [r6], r3 -+ vld1.32 {d3[0]}, [r1], r3 -+ vld1.32 {d5[0]}, [r8], r3 -+ vld1.32 {d1[1]}, [r6], r3 -+ vld1.32 {d3[1]}, [r1], r3 -+ vld1.32 {d5[1]}, [r8], r3 -+ -+ bl \body_fn -+ -+ vst1.32 {d0[0]}, [r0, :32], r2 -+ vst1.32 {d0[1]}, [r7, :32], r2 -+ vst1.32 {d1[0]}, [r0, :32], r2 -+ vst1.32 {d1[1]}, [r7, :32], r2 -+ bgt 1b -+ -+ pop {r7-r9,pc} -+.endm -+ -+.macro edge_64b_e3, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ // load c and a -+ vld1.8 {q4-q5}, [r1, :128] -+ vldmia r6, {d16-d24} -+ vext.8 q0, q8, q9, #\pb -+ add r6, r1, #32 -+ vext.8 q1, q9, q10, #\pb -+ add r1, r1, r3 -+ vext.8 q2, q10, q11, #\pb -+ vld1.8 {q6-q7}, [r6, :128] -+ sub r6, r1, r3 -+ vext.8 q3, q11, q12, #\pb -+ -+1: // load b -+ vldr d17, [r1, #-8] -+ vldmia r1, {d18-d25} -+ vext.8 q8, q8, q9, #16 - \pb -+ pld [r1, r3] -+ vext.8 q9, q9, q10, #16 - \pb -+ subs r12, #1 -+ vext.8 q10, q10, q11, #16 - \pb -+ vext.8 q11, q11, q12, #16 - \pb -+ bl \body_fn -+ // next a is mostly available in c -+ vldr d24, [r6, #64] -+ vstmia r0, {q0-q3} -+ vext.8 q0, q4, q5, #\pb -+ it le -+ pople {lr} -+ vext.8 q1, q5, q6, #\pb -+ it le -+ bxle lr -+ vext.8 q2, q6, q7, #\pb -+ add r6, r6, r3 -+ vext.8 q3, q7, q12, #\pb -+ add r0, r0, r2 -+ // next c is mostly available in b -+ vext.8 d14, d22, d23, #\pb -+ vldr d15, [r1, #56] -+ vext.8 q4, q8, q9, #\pb -+ add r1, r1, r3 -+ vext.8 q5, q9, q10, #\pb -+ vext.8 q6, q10, q11, #\pb -+ b 1b -+.endm -+ -+.macro edge_32bx2_e3, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ // load a and first 32b of c -+ vldmia r1, {d8-d12} -+ vldmia r6, {d24-d28} -+ vext.8 q2, q4, q5, #\pb -+ add r6, r6, r3, lsl #1 -+ vext.8 q3, q5, q6, #\pb -+ add r1, r1, r3, lsl #1 -+ vext.8 q0, q12, q13, #\pb -+ vext.8 q1, q13, q14, #\pb -+1: -+ // load second 32b of c and second 32b of b -+ vldr d25, [r6, #-8] -+ subs r12, #2 -+ vldmia r6, {d12-d15} -+ vldr d27, [r1, #-8] -+ vldmia r1, {d20-d23} -+ // first 32b of b is mostly available in second 32b of c -+ vext.8 q8, q12, q6, #16 - \pb -+ 
vext.8 q9, q6, q7, #16 - \pb -+ vext.8 q11, q10, q11, #16 - \pb -+ vext.8 q10, q13, q10, #16 - \pb -+ -+ bl \body_fn -+ -+ vst1.8 {q0-q1}, [r0, :256], r2 -+ vst1.8 {q2-q3}, [r7, :256], r2 -+ ble 2f -+ -+ vldr d24, [r6, #32] -+ add r6, r6, r3, lsl #1 -+ vldr d11, [r1, #24] -+ vext.8 d10, d22, d23, #\pb -+ vldr d30, [r1, #32] -+ add r1, r1, r3, lsl #1 -+ // first 32b of a is mostly available in second 32b of c -+ vext.8 q0, q6, q7, #\pb -+ vext.8 q1, q7, q12, #\pb -+ // first 32b of c is mostly available in second 32b of b -+ vext.8 q4, q10, q11, #\pb -+ // second 32b of a is mostly available in first 32b of c -+ vext.8 q3, q5, q15, #\pb -+ vext.8 q2, q4, q5, #\pb -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_16b_e3, body_fn, pb -+ push {lr} -+ sub r6, r1, r3 -+ vld1.8 {q1}, [r1, :128], r3 -+ vldmia r6, {d18-d20} -+ add r6, r6, r3 -+ -+1: vldr d5, [r1, #-8] -+ vld1.8 {q3}, [r1, :128] -+ subs r12, #1 -+ vext.8 q0, q9, q10, #\pb -+ vext.8 q2, q2, q3, #16 - \pb -+ bl \body_fn -+ vst1.8 {q0}, [r0, :128], r2 -+ ble 2f -+ vmov q9, q1 -+ vldr d3, [r1, #8] -+ add r1, r1, r3 -+ vldr d20, [r6, #16] -+ add r6, r6, r3 -+ vext.8 d2, d4, d5, #\pb -+ b 1b -+ -+2: pop {pc} -+.endm -+ -+.macro edge_8bx2_e3, body_fn, pb -+ sub r6, r1, r3 -+ push {r7, lr} -+ add r7, r0, r2 -+ lsl r2, #1 -+ vld1.8 {d18-d19}, [r6] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #8] -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldr d4, [r6, #-8] -+ vldr d3, [r6] -+ vldr d21, [r1, #-8] -+ vldr d22, [r1] -+ -+1: vext.8 d0, d18, d19, #\pb -+ vext.8 d4, d4, d3, #8 - \pb -+ vext.8 d1, d2, d20, #\pb -+ subs r12, #2 -+ vext.8 d5, d21, d22, #8 - \pb -+ -+ bl \body_fn -+ -+ vst1.8 {d0}, [r0, :64], r2 -+ vst1.8 {d1}, [r7, :64], r2 -+ ble 2f -+ -+ vldr d19, [r6, #8] -+ add r6, r6, r3, lsl #1 -+ vldr d20, [r1, #8] -+ vmov d18, d3 -+ vldr d2, [r1] -+ add r1, r1, r3, lsl #1 -+ vldr d4, [r6, #-8] -+ vldr d3, [r6] -+ vldr d21, [r1, #-8] -+ vldr d22, [r1] -+ b 1b -+ -+2: pop {r7, pc} -+.endm -+ -+.macro edge_4bx4_e3, body_fn, pb -+ @ e3 is the same as e2 but with the X offset reversed -+ edge_4bx4_e2 \body_fn, (-\pb) -+.endm -+ -+@ Jump table entry - if in neon mode the bottom bit must be set -+@ ? 
There is probably a real asm instruction to do this but I haven't found it -+.macro jent lab -+.if jent_pic -+@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is -+@ simpler and clearer in the code to stick with .word -+T .word (0 + \lab) - (4 + 98b) -+A .word (0 + \lab) - (8 + 98b) -+.else -+T .word 1 + \lab -+A .word \lab -+.endif -+.endm -+ -+.macro edge_64b_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_64b_e0 \body_fn, \pb -+10: edge_64b_e1 \body_fn -+20: edge_64b_e2 \body_fn, \pb -+30: edge_64b_e3 \body_fn, \pb -+.endm -+ -+.macro edge_32bx2_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_32bx2_e0 \body_fn, \pb -+10: edge_32bx2_e1 \body_fn -+20: edge_32bx2_e2 \body_fn, \pb -+30: edge_32bx2_e3 \body_fn, \pb -+.endm -+ -+.macro edge_16b_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ -+0: edge_16b_e0 \body_fn, \pb -+10: edge_16b_e1 \body_fn -+20: edge_16b_e2 \body_fn, \pb -+30: edge_16b_e3 \body_fn, \pb -+.endm -+ -+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_32bx2_e0 \body_fn_64b, \pb -+10: edge_32bx2_e1 \body_fn_64b -+20: edge_32bx2_e2 \body_fn_64b, \pb -+30: edge_32bx2_e3 \body_fn_64b, \pb -+5: edge_16b_e0 \body_fn_16b, \pb -+15: edge_16b_e1 \body_fn_16b -+25: edge_16b_e2 \body_fn_16b, \pb -+35: edge_16b_e3 \body_fn_16b, \pb -+.endm -+ -+.macro edge_16b_8bx2_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_16b_e0 \body_fn, \pb -+10: edge_16b_e1 \body_fn -+20: edge_16b_e2 \body_fn, \pb -+30: edge_16b_e3 \body_fn, \pb -+5: edge_8bx2_e0 \body_fn, \pb -+15: edge_8bx2_e1 \body_fn -+25: edge_8bx2_e2 \body_fn, \pb -+35: edge_8bx2_e3 \body_fn, \pb -+.endm -+ -+.macro edge_8bx2_4bx4_bodies, body_fn, pb -+ jent 0f -+ jent 10f -+ jent 20f -+ jent 30f -+ jent 5f -+ jent 15f -+ jent 25f -+ jent 35f -+ -+0: edge_8bx2_e0 \body_fn, \pb -+10: edge_8bx2_e1 \body_fn -+20: edge_8bx2_e2 \body_fn, \pb -+30: edge_8bx2_e3 \body_fn, \pb -+5: edge_4bx4_e0 \body_fn, \pb -+15: edge_4bx4_e1 \body_fn -+25: edge_4bx4_e2 \body_fn, \pb -+35: edge_4bx4_e3 \body_fn, \pb -+.endm -+ -+@ void ff_hevc_rpi_sao_edge_8_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_8_neon_8, export=1 -+ edge_16b_init 8, 0, 1, 99f -+99: -+ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_16_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_16_neon_8, export=1 -+ edge_16b_init 8, 0, 0, 99f -+99: -+ edge_16b_bodies edge_16b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_32_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_32_neon_8, export=1 -+ edge_64b_init 8, 0, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_8, 1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_64_neon_8( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] 
-+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_64_neon_8, export=1 -+ edge_64b_init 8, 0, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_8, 1 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_8_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1 -+ edge_16b_init 8, 1, 1, 99f -+99: -+ edge_16b_8bx2_bodies edge_16b_body_8, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_16_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1 -+ edge_64b_init 8, 1, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_8, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_32_neon_8( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1 -+ edge_64b_init 8, 1, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_8, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_8_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_8_neon_10, export=1 -+ edge_16b_init 10, 0, 1, 99f -+99: -+ edge_16b_8bx2_bodies edge_16b_body_16, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_16_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_16_neon_10, export=1 -+ edge_64b_init 10, 0, 0, 99f -+99: -+ edge_32bx2_bodies edge_64b_body_16, 2 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_64_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+@ We simply split the 32 case into 2 vertical stripes -+@ and call the fns for w32 -+@ -+@ Calling code will always have src != dst so we don't have to worry -+@ about edge effects -+ -+function ff_hevc_rpi_sao_edge_64_neon_10, export=1 -+ edge_64b_init 10, 0, 1, 99f, xjump=1 -+endfunc -+ -+@ void ff_hevc_rpi_sao_edge_32_neon_10( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ int stride_dst, [r2] -+@ int16_t *_sao_offset_val, [r3] -+@ int eo, [sp, #0] -+@ int width, [sp, #4] -+@ int height) [sp, #8] -+ -+function ff_hevc_rpi_sao_edge_32_neon_10, export=1 -+ edge_64b_init 10, 0, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_16, 2 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_8_neon_10( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1 -+ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 -+99: -+ edge_32bx2_16b_bodies edge_64b_body_16, 
edge_16b_body_16, 4 -+endfunc -+ -+@ ff_hevc_rpi_sao_edge_c_32_neon_10( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1 -+ edge_64b_init 10, 1, 1, 99f, xjump=1 -+endfunc -+ -+ -+@ ff_hevc_rpi_sao_edge_c_16_neon_10( -+@ uint8_t *_dst, [r0] -+@ const uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ const int16_t *_sao_offset_val_u, [r3] -+@ const int16_t *_sao_offset_val_v, [sp, #0] -+@ int eo, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1 -+ edge_64b_init 10, 1, 0, 99f -+99: -+ edge_64b_bodies edge_64b_body_16, 4 -+endfunc -+ -diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h -new file mode 100644 -index 0000000000..36a23a5bf9 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_arm.h -@@ -0,0 +1,28 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_ARM_HEVCPRED_ARM_H -+#define AVCODEC_ARM_HEVCPRED_ARM_H -+ -+#include "libavcodec/rpi_hevcpred.h" -+ -+void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); -+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); -+ -+#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c -new file mode 100644 -index 0000000000..80724d4cf3 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_init_arm.c -@@ -0,0 +1,35 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/cpu.h" -+#include "libavutil/arm/cpu.h" -+ -+#include "libavcodec/rpi_hevcpred.h" -+#include "rpi_hevcpred_arm.h" -+ -+av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) -+{ -+ int cpu_flags = av_get_cpu_flags(); -+ -+ if (have_neon(cpu_flags)) -+ ff_hevc_rpi_pred_init_neon(c, bit_depth); -+} -+ -diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c -new file mode 100644 -index 0000000000..21e7700174 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_init_neon.c -@@ -0,0 +1,210 @@ -+/* -+ * Copyright (c) 2018 John Cox (for Raspberry Pi) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcpred_arm.h" -+ -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; -+ -+void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t 
*left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void 
ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); -+ -+void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+ -+void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, 
const uint8_t *left, ptrdiff_t stride); -+void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); -+ -+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) -+{ -+ switch (bit_depth) -+ { -+ case 8: -+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; -+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; -+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 -+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; -+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; -+ -+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; -+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; -+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; -+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; -+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; -+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; -+ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; -+ -+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; -+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; -+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; -+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; -+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; -+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; -+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; -+ -+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; -+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; -+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; -+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; -+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; -+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; -+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; -+ -+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; -+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; -+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; -+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; -+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; -+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8; -+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; -+ -+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; -+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; -+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; -+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; -+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; -+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; -+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; -+ break; -+ case 10: -+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16; -+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; -+ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16; -+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; -+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; -+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; -+ -+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; -+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; -+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; -+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; -+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; -+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; -+ c->pred_angular_c[2] = 
ff_hevc_rpi_pred_angular_c_16_neon_10; -+ -+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; -+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; -+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; -+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; -+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; -+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; -+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; -+ -+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; -+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; -+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; -+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; -+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; -+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; -+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; -+ -+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; -+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; -+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; -+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; -+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10; -+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; -+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; -+ -+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; -+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; -+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; -+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; -+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; -+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; -+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; -+ break; -+ default: -+ break; -+ } -+} -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -new file mode 100644 -index 0000000000..fa8f67cf03 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -@@ -0,0 +1,2984 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+/* -+ * General angular pred -+ * -+ * Horizontal (10) & Vertical (26) cases have their own file -+ * and are not dealt with properly here (luma filtering is missing) -+ * -+ * The inv_angle calculations are annoying - if it wasn't for the +128 -+ * rounding step then the result would simply be the loop counter :-( -+ */ -+ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+.text -+ -+@ Horizontal Patch functions -+@ These need a transpose before store so exist as smaller patches -+@ Patches can be called repeatedly without any intermediate setup -+@ to generate a horizontal block -+@ -+@ It is almost certainly the case that larger patch fns can be built -+@ and they would be a little faster, but we would still need the small -+@ fns and code size (or at least instruction cache size) is an issue -+@ given how much code we already have here -+ -+@ Generate 8x8 luma 8 patch -+@ -+@ r3 Out stride -+@ r4 Angle add -+@ r7 Inv angle (_up only) -+@ -+@ In/Out (updated) -+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) -+@ r2 Left ptr - updated -+@ r10 Inv angle accumulator (_up only) -+@ r12 32 - angle frac (_down) or angle frac (_up) -+@ d0 Older reference samples -+@ d1=r8+r9 Newer reference samples -+@ d2 32 - angle frac -+@ d3 Angle frac -+@ q2 Partially computed next result (_up only) -+@ -+@ Temps -+@ r5 Loop counter -+@ r6 -+@ r7 (_down only) -+@ r11 (_up only) -+@ q2, q8-q11 -+ -+patch_h_down_8x8_8: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ lsr r8, #8 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #24 -+ ldr r9, [r2, #5]! -+ vmov d1, r8, r9 -+ // drop through... -+patch_h_down_8x8_8_continue: -+ mov r5, #8 -+1: -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.8 q8, q8, q9, #8 -+ itt mi -+ lsrmi r7, r8, #8 -+ vmovmi d0, r8, r9 -+ vdup.8 d2, r12 -+ vext.8 q9, q9, q10, #8 -+ it mi -+ orrmi r8, r7, r9, lsl #24 -+ vext.8 q10, q10, q11, #8 -+ it mi -+ ldrmi r9, [r2, #1]! -+ vmov d22, d23 -+ vrshrn.u16 d23, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_8x8_8: -+ vzip.8 d16, d17 -+ add r6, r0, r3 -+ vzip.8 d18, d19 -+ lsl r3, #1 -+ vzip.8 d20, d21 -+ add r5, r0, r3 -+ vzip.8 d22, d23 -+ vzip.16 q8, q9 -+ vzip.16 q10, q11 -+ vzip.32 q8, q10 -+ vzip.32 q9, q11 -+ vst1.8 {d16}, [r0]! -+ vst1.8 {d17}, [r6], r3 -+ vst1.8 {d20}, [r5], r3 -+ vst1.8 {d21}, [r6], r3 -+ vst1.8 {d18}, [r5], r3 -+ vst1.8 {d19}, [r6], r3 -+ vst1.8 {d22}, [r5] -+ asr r3, #1 -+ vst1.8 {d23}, [r6] -+ -+ bx lr -+ -+patch_h_up_8x8_8: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r4 -+ lsr r11, r8, #24 -+ vdup.8 d2, r6 -+ ldr r8, [r2, #-1]! 
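-+@ Hedged scalar model of the _up reference fetch (illustrative only):
-+@   acc += inv_angle;  ref = acc < 0 ? *--left : top[acc >> 8];
-+@ r10 is acc, seeded with -128 by the callers so the usual +128
-+@ rounding step is pre-folded into it; r7 holds inv_angle and r2
-+@ walks the left column upwards one byte per predicated load.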
-+ orr r9, r11, r9, lsl #8 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+patch_h_up_8x8_8_continue: -+ mov r5, #8 -+1: -+ add r12, r4 -+ mov r11, #0 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ vext.8 q8, q8, q9, #8 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #8 -+ it cs -+ vmovcs d0, r8, r9 -+ vdup.8 d2, r6 -+ it cs -+ lsrcs r6, r8, #24 -+ vext.8 q9, q9, q10, #8 -+ itt cs -+ orrcs r9, r6, r9, lsl #8 -+ ldrbcs r11, [r1, r11] -+ vdup.8 d3, r12 -+ vext.8 q10, q10, q11, #8 -+ it hi -+ ldrbhi r11, [r2, #-1]! -+ vmov d22, d23 -+ vrshrn.u16 d23, q2, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #8 -+ vmovcs d1, r8, r9 -+ vmull.u8 q2, d0, d2 -+ subs r5, #1 -+ vmlal.u8 q2, d1, d3 -+ bne 1b -+ -+ b store_tran_8x8_8 -+ -+ -+.macro ADRT reg, val -+@ adr in T32 has enough range but not in A32 -+A adrl \reg, \val -+T adr \reg, \val -+.endm -+ -+@ ff_hevc_rpi_pred_angular_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_4_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ ldr lr, [r2], #1 @ Top -+ rsb r12, r6, #32 -+ vmov s0, lr -+ vdup.8 d3, r6 -+ ldr lr, [r2], #1 -+ vdup.8 d2, r12 -+ vmov s2, lr -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r2], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ vdup.8 d3, r6 -+ mov r5, #2 -+1: -+ vrshrn.u16 d20, q2, #5 -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.64 q8, q8, q9, #1 -+ it mi -+ vmovmi s0, lr -+ vext.64 q9, q9, q10, #1 -+ it mi -+ ldrmi lr, [r2], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ -+ vrshrn.u16 d20, q2, #5 -+ vmull.u8 q2, d0, d2 -+ add r12, r0, r3 -+ vmlal.u8 q2, d1, d3 -+ lsl r3, #1 -+ vext.64 q8, q8, q9, #1 -+ vext.64 q9, q9, q10, #1 -+ vrshrn.u16 d20, q2, #5 -+ -+98: -+ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3 -+ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3 -+ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0] -+ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12] -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ rsb r12, r6, #32 -+ ldr lr, [r2] @ Left -+ ldrb r2, [r2, #-1] @ Top-left -+ vmov s0, lr -+ vdup.8 d2, r12 -+ vdup.8 d3, r6 -+ orr lr, r2, lr, lsl #8 -+ vmov s2, lr -+ sub r8, r7, #128 -+ mov r5, #3 -+2: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+T it mi -+ addmi r12, #32 -+T asr r6, r8, #8 -+T it mi -+T ldrbmi r2, [r1, r6] -+A ldrbmi r2, [r1, r8, asr #8] -+ rsb r6, r12, #32 -+ vdup.8 d2, r12 -+ ittt mi -+ vmovmi s0, lr -+ orrmi lr, r2, lr, lsl #8 -+ vmovmi s2, lr -+ vrshrn.u16 d20, q2, #5 -+ vdup.8 d3, r6 -+ it mi -+ addmi r8, r7 -+ subs r5, #1 -+ vext.64 q8, q8, q9, #1 -+ vext.64 q9, q9, q10, #1 -+ bne 2b -+ -+ vmull.u8 q2, d0, d2 -+ add r12, r0, r3 -+ vmlal.u8 q2, d1, d3 -+ lsl r3, #1 -+ vrshrn.u16 d20, q2, #5 -+ b 98b -+ -+@ Left of vertical - works down left -+18: -+ ldrh r7, [r7] -+ rsb r12, r6, #32 -+ ldr lr, [r1] @ Top -+ ldrb r1, [r2, #-1] @ Top-left -+ vmov s0, lr -+ 
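-+@ Hedged note: lr doubles as a 4-byte shift register - s0 takes the
-+@ current reference window and, after the  orr lr, r1, lr, lsl #8
-+@ below, s2 takes the same window shifted one sample, i.e. roughly
-+@   window = (window << 8) | next_ref_byte;
-+@ so stepping the reference costs an ldrb rather than a NEON reload.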
vdup.8 d2, r12 -+ vdup.8 d3, r6 -+ orr lr, r1, lr, lsl #8 -+ vmov s2, lr -+ sub r8, r7, #128 -+ mov r5, #3 -+2: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+T it mi -+ addmi r12, #32 -+T asr r6, r8, #8 -+T it mi -+T ldrbmi r1, [r2, r6] -+A ldrbmi r1, [r2, r8, asr #8] -+ rsb r6, r12, #32 -+ vdup.8 d2, r12 -+ ittt mi -+ vmovmi s0, lr -+ orrmi lr, r1, lr, lsl #8 -+ vmovmi s2, lr -+ vrshrn.u16 d4, q2, #5 -+ vdup.8 d3, r6 -+ it mi -+ addmi r8, r7 -+ subs r5, #1 -+ vst1.32 {d4[0]}, [r0], r3 -+ bne 2b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.32 {d4[0]}, [r0] -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldr lr, [r1], #1 @ Top -+ rsb r12, r6, #32 -+ vmov s0, lr -+ vdup.8 d3, r6 -+ ldr lr, [r1], #1 -+ vdup.8 d2, r12 -+ vmov s2, lr -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r1], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ vdup.8 d3, r6 -+ mov r5, #2 -+1: -+ vrshrn.u16 d6, q2, #5 -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vst1.32 {d6[0]}, [r0], r3 -+ itt mi -+ vmovmi s0, lr -+ ldrmi lr, [r1], #1 -+ vdup.8 d2, r12 -+ it mi -+ vmovmi s2, lr -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ -+ vrshrn.u16 d6, q2, #5 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vst1.32 {d6[0]}, [r0], r3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.32 {d6[0]}, [r0] -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+ -+@ ff_hevc_rpi_pred_angular_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_8_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_8x8_8 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_8x8_8 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrb lr, [r2, #-1] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl r9, r9, #8 -+ vdup.8 d2, r12 -+ orr r9, r9, r8, lsr #24 -+ orr r8, lr, r8, lsl #8 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #7 -+1: -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ subs r12, r12, r4 -+ vmlal.u8 q2, d1, d3 -+ ittt mi -+ addmi lr, r2, r1, asr #8 -+ addmi r12, r12, #32 -+ vmovmi d0, r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #8 -+ ldrbmi lr, [lr] -+ vdup.8 d2, r12 -+ vrshrn.u16 d4, q2, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #24 -+ orrmi r8, lr, r8, lsl #8 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.8 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.8 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ mov r5, #7 -+ lsr r8, #8 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #24 -+ ldr r9, [r1, #5]! 
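-+@ Hedged note: d0 holds top[0..7] while r8:r9 assemble top[1..8] in
-+@ GPRs (the lsr/orr pair plus the post-indexed ldr above), so each
-+@ later one-sample advance in the loop is a single  ldr r9, [r1, #1]!
-+@ instead of a NEON permute and reload.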
-+ vmov d1, r8, r9 -+1: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #8 -+ vdup.8 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #24 -+ ldrmi r9, [r1, #1]! -+ vrshrn.u16 d6, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.8 d3, r6 -+ subs r5, #1 -+ vst1.8 {d6}, [r0], r3 -+ bne 1b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.8 {d6}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_16_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ -+ add r2, r1, #8 @ restore r2, but 8 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #3 -+ -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #3 -+ sub r10, r10, r7, lsl #3 -+ -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q9}, [r1] -+ sub r1, r2, #1 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.8 d6, r6 -+ vext.8 q8, q9, q9, #15 -+ sub r8, r7, #128 -+ vld1.8 {d16[0]}, [r1] -+ vdup.8 d7, r12 -+ mov r5, #15 -+1: -+ vmull.u8 q0, d18, d7 -+ subs r12, r4 -+ vmlal.u8 q0, d16, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d19, d7 -+ it cc -+ addcc r1, r2, r8, asr #8 -+ vmlal.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vext.8 q10, q8, q8, #15 -+ sub r5, #1 -+ vld1.8 {d20[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d22, d7 -+ subs r12, r4 -+ vmlal.u8 q0, d20, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d23, d7 -+ it cc -+ addcc r1, r2, r8, asr #8 -+ vmlal.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vext.8 q8, q10, q10, #15 -+ sub r5, #1 -+ vld1.8 {d16[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d22, d7 -+ vmlal.u8 q0, d20, d6 -+ vmull.u8 q1, d23, d7 -+ vmlal.u8 q1, d21, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d18, d7 -+ vmlal.u8 q0, d16, d6 -+ vmull.u8 q1, d19, d7 -+ vmlal.u8 q1, d17, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.8 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vext.8 q8, q9, q9, #1 -+ vld1.8 {d17[7]}, [r1]! 
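-+@ Hedged scalar model of each output row in this path (illustrative):
-+@   dst[x] = (b[x]*f + a[x]*(32 - f) + 16) >> 5
-+@ with a = top[x..] in q9, b = top[x+1..] in q8 and f the per-row
-+@ fraction; vext plus the single lane load above slide both windows
-+@ one sample whenever the fraction steps past 32.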
-+ mov r5, #15 -+1: -+ vmull.u8 q0, d16, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d18, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d19, d7 -+ sub r5, #1 -+ vext.8 q10, q8, q8, #1 -+ teq r5, #0 -+ vld1.8 {d21[7]}, [r1] -+ it cc -+ addcc r1, #1 -+ vmov q11, q8 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d20, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d22, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d23, d7 -+ sub r5, #1 -+ vext.8 q8, q10, q10, #1 -+ teq r5, #0 -+ vld1.8 {d17[7]}, [r1] -+ it cc -+ addcc r1, #1 -+ vmov q9, q10 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d20, d6 -+ vmlal.u8 q0, d22, d7 -+ vmull.u8 q1, d21, d6 -+ vmlal.u8 q1, d23, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d16, d6 -+ vmlal.u8 q0, d18, d7 -+ vmull.u8 q1, d17, d6 -+ vmlal.u8 q1, d19, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_32_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_8x8_8 -+ bl patch_h_down_8x8_8_continue -+ bl patch_h_down_8x8_8_continue -+ bl patch_h_down_8x8_8_continue -+ -+ add r2, r1, #8 @ restore r2, but 8 rows further down left -+ add r1, r1, #8 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #3 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 -+1: -+ push {r2,r10} -+ bl patch_h_up_8x8_8 -+ bl patch_h_up_8x8_8_continue -+ bl patch_h_up_8x8_8_continue -+ bl patch_h_up_8x8_8_continue -+ pop {r2,r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #3 -+ sub r10, r10, r7, lsl #3 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q0-q1}, [r1] -+ sub r9, r2, #1 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ mov r5, #32 -+1: -+ vld1.8 {d17[7]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ add r9, r2, r8, asr #8 -+ vext.8 q1, q0, q1, #15 -+ vext.8 q0, q8, q0, #15 -+2: -+ vmull.u8 q10, d4, d19 -+ subs r12, r4 -+ vmlal.u8 q10, d0, d18 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d5, d19 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d1, d18 -+ sub r5, #1 -+ vmull.u8 q12, d6, d19 -+ teq r5, #0 -+ vmlal.u8 q12, d2, d18 -+ vmull.u8 q13, d7, d19 -+ vmlal.u8 q13, d3, d18 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.8 {q0-q1}, [r1]! 
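-+@ Hedged note: the 32-wide row lives in q0-q1 with the next sample
-+@ staged in d16[0] (loaded just below); the window slide is then a
-+@ chained byte shift, roughly
-+@   for (x = 0; x < 32; x++) win[x] = win[x + 1];
-+@ done with two vext ops feeding through the staging lane.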
-+ rsb r12, r6, #32 -+ vld1.8 {d16[0]}, [r5] -+ mov r5, #32 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+1: -+ vmov q2, q0 -+ add r1, #1 -+ vmov q3, q1 -+ vext.8 q0, q0, q1, #1 -+ vext.8 q1, q1, q8, #1 -+2: -+ vmull.u8 q10, d0, d18 -+ subs r12, r4 -+ vmlal.u8 q10, d4, d19 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d1, d18 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d5, d19 -+ sub r5, #1 -+ vmull.u8 q12, d2, d18 -+ teq r5, #0 -+ vmlal.u8 q12, d6, d19 -+ vmull.u8 q13, d3, d18 -+ vmlal.u8 q13, d7, d19 -+ vld1.8 {d16[0]}, [r1] -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ Chroma 8 bit 4x4 patch fns -+ .text -+ -+patch_h_down_c_4x4_8: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ lsr r8, #16 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r2, #6]! -+ vmov d1, r8, r9 -+ // drop through... -+patch_h_down_c_4x4_8_continue: -+ mov r5, #4 -+1: -+ subs r12, r4 -+ vmull.u8 q2, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmlal.u8 q2, d1, d3 -+ rsb r6, r12, #32 -+ vext.8 q8, q8, q9, #8 -+ it mi -+ lsrmi r7, r8, #16 -+ vmov d18, d19 -+ it mi -+ vmovmi d0, r8, r9 -+ vdup.8 d2, r12 -+ it mi -+ orrmi r8, r7, r9, lsl #16 -+ vrshrn.u16 d19, q2, #5 -+ itt mi -+ ldrmi r9, [r2, #2]! -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.8 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_c_4x4_8: -+ vzip.16 d16, d17 -+ add r6, r0, r3 -+ vzip.16 d18, d19 -+ lsl r3, #1 -+ vzip.32 q8, q9 -+ add r5, r0, r3 -+ vst1.16 {d16}, [r0]! -+ vst1.16 {d17}, [r6], r3 -+ vst1.16 {d18}, [r5] -+ asr r3, #1 -+ vst1.16 {d19}, [r6] -+ -+ bx lr -+ -+patch_h_up_c_4x4_8: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r4 -+ lsr r11, r8, #16 -+ vdup.8 d2, r6 -+ ldr r8, [r2, #-2]! -+ orr r9, r11, r9, lsl #16 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+patch_h_up_c_4x4_8_continue: -+ mov r5, #4 -+1: -+ add r12, r4 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ mov r11, #0 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #7 -+ it cs -+ vmovcs d0, r8, r9 -+ it eq -+ biceq r11, #1 -+ vdup.8 d2, r6 -+ it cs -+ lsrcs r6, r8, #16 -+ vdup.8 d3, r12 -+ vext.8 q8, q8, q9, #8 -+ itt cs -+ orrcs r9, r6, r9, lsl #16 -+ ldrhcs r11, [r1, r11] -+ vmov d18, d19 -+ it hi -+ ldrhhi r11, [r2, #-2]! 
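-+@ Hedged note on the asr #7 / bic #1 pair above: chroma samples are
-+@ interleaved UV byte pairs, so the top-row byte offset must be
-+@ 2*(acc >> 8); computing it as (acc >> 7) & ~1 gives the doubling
-+@ and the truncation in two cheap predicated ops.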
-+ vrshrn.u16 d19, q2, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #16 -+ vmovcs d1, r8, r9 -+ vmull.u8 q2, d0, d2 -+ subs r5, #1 -+ vmlal.u8 q2, d1, d3 -+ bne 1b -+ -+ b store_tran_c_4x4_8 -+ -+ -+@ ff_hevc_rpi_pred_angular_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_c_4x4_8 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_c_4x4_8 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrh lr, [r2, #-2] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl r9, r9, #16 -+ vdup.8 d2, r12 -+ orr r9, r9, r8, lsr #16 -+ orr r8, lr, r8, lsl #16 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #3 -+1: -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ subs r12, r12, r4 -+ vmlal.u8 q2, d1, d3 -+ itttt mi -+ addmi lr, r2, r1, asr #7 -+ bicmi lr, #1 -+ addmi r12, r12, #32 -+ vmovmi d0, r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #16 -+ ldrhmi lr, [lr] -+ vdup.8 d2, r12 -+ vrshrn.u16 d4, q2, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #16 -+ orrmi r8, lr, r8, lsl #16 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.8 d3, r6 -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d4, q2, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.8 d3, r6 -+ mov r5, #3 -+ lsr r8, #16 -+ vdup.8 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r1, #6]! -+ vmov d1, r8, r9 -+1: -+ vmull.u8 q2, d0, d2 -+ subs r12, r4 -+ vmlal.u8 q2, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #16 -+ vdup.8 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #16 -+ ldrmi r9, [r1, #2]! 
-+ vrshrn.u16 d6, q2, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.8 d3, r6 -+ subs r5, #1 -+ vst1.16 {d6}, [r0], r3 -+ bne 1b -+ -+ vmull.u8 q2, d0, d2 -+ vmlal.u8 q2, d1, d3 -+ vrshrn.u16 d6, q2, #5 -+ vst1.16 {d6}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q9}, [r1] -+ sub r1, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.8 d6, r6 -+ vext.8 q8, q9, q9, #14 -+ sub r8, r7, #128 -+ vld1.16 {d16[0]}, [r1] -+ vdup.8 d7, r12 -+ mov r5, #7 -+1: -+ subs r12, r4 -+ vmull.u8 q0, d18, d7 -+ it cc -+ asrcc r1, r8, #8 -+ vmlal.u8 q0, d16, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d19, d7 -+ it cc -+ addcc r1, r2, r1, lsl #1 -+ vmlal.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vext.8 q10, q8, q8, #14 -+ sub r5, #1 -+ vld1.16 {d20[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ subs r12, r4 -+ vmull.u8 q0, d22, d7 -+ it cc -+ asrcc r1, r8, #8 -+ vmlal.u8 q0, d20, d6 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d23, d7 -+ it cc -+ addcc r1, r2, r1, lsl #1 -+ vmlal.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vext.8 q8, q10, q10, #14 -+ sub r5, #1 -+ vld1.16 {d16[0]}, [r1] -+ it cc -+ addcc r8, r7 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d22, d7 -+ vmlal.u8 q0, d20, d6 -+ vmull.u8 q1, d23, d7 -+ vmlal.u8 q1, d21, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d18, d7 -+ vmlal.u8 q0, d16, d6 -+ vmull.u8 q1, d19, d7 -+ vmlal.u8 q1, d17, d6 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.8 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vext.8 q8, q9, q9, #2 -+ vld1.16 {d17[3]}, [r1]! 
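-+@ Hedged note: same filter as luma, but one reference step is a UV
-+@ byte pair - hence the vext by #2 and the 16-bit lane load above;
-+@ per byte this is still  (a*(32 - f) + b*f + 16) >> 5  applied to
-+@ U and V independently.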
-+ mov r5, #7 -+1: -+ vmull.u8 q0, d16, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d18, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d17, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d19, d7 -+ sub r5, #1 -+ vext.8 q10, q8, q8, #2 -+ teq r5, #0 -+ vld1.16 {d21[3]}, [r1] -+ it cc -+ addcc r1, #2 -+ vmov q11, q8 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmull.u8 q0, d20, d6 -+ subs r12, r4 -+ vmlal.u8 q0, d22, d7 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q1, d21, d6 -+ rsb r6, r12, #32 -+ vmlal.u8 q1, d23, d7 -+ sub r5, #1 -+ vext.8 q8, q10, q10, #2 -+ teq r5, #0 -+ vld1.16 {d17[3]}, [r1] -+ it cc -+ addcc r1, #2 -+ vmov q9, q10 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vdup.8 d6, r6 -+ vdup.8 d7, r12 -+ vst1.8 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmull.u8 q0, d20, d6 -+ vmlal.u8 q0, d22, d7 -+ vmull.u8 q1, d21, d6 -+ vmlal.u8 q1, d23, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmull.u8 q0, d16, d6 -+ vmlal.u8 q0, d18, d7 -+ vmull.u8 q1, d17, d6 -+ vmlal.u8 q1, d19, d7 -+ vrshrn.u16 d0, q0, #5 -+ vrshrn.u16 d1, q1, #5 -+ vst1.8 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_c_4x4_8 -+ bl patch_h_down_c_4x4_8_continue -+ bl patch_h_down_c_4x4_8_continue -+ bl patch_h_down_c_4x4_8_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 -+1: -+ push {r2, r10} -+ bl patch_h_up_c_4x4_8 -+ bl patch_h_up_c_4x4_8_continue -+ bl patch_h_up_c_4x4_8_continue -+ bl patch_h_up_c_4x4_8_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.8 {q0-q1}, [r1] -+ sub r9, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ mov r5, #16 -+1: -+ vld1.16 {d17[3]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.8 q1, q0, q1, #14 -+ add r9, r2, r9, lsl #1 -+ vext.8 q0, q8, q0, #14 -+2: -+ vmull.u8 q10, d4, d19 -+ subs r12, r4 -+ vmlal.u8 q10, d0, d18 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d5, d19 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d1, d18 -+ sub r5, #1 -+ vmull.u8 q12, d6, d19 -+ teq r5, #0 -+ vmlal.u8 q12, d2, d18 -+ vmull.u8 q13, d7, d19 -+ vmlal.u8 q13, d3, d18 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - 
left unused -+26: -+ add r5, r1, #32 -+ vld1.8 {q0-q1}, [r1]! -+ rsb r12, r6, #32 -+ vld1.16 {d16[0]}, [r5] -+ mov r5, #16 -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+1: -+ vmov q2, q0 -+ add r1, #2 -+ vmov q3, q1 -+ vext.8 q0, q0, q1, #2 -+ vext.8 q1, q1, q8, #2 -+2: -+ vmull.u8 q10, d0, d18 -+ subs r12, r4 -+ vmlal.u8 q10, d4, d19 -+ it cc -+ addcc r12, #32 -+ vmull.u8 q11, d1, d18 -+ rsb r6, r12, #32 -+ vmlal.u8 q11, d5, d19 -+ sub r5, #1 -+ vmull.u8 q12, d2, d18 -+ teq r5, #0 -+ vmlal.u8 q12, d6, d19 -+ vmull.u8 q13, d3, d18 -+ vmlal.u8 q13, d7, d19 -+ vld1.16 {d16[0]}, [r1] -+ vdup.8 d18, r6 -+ vdup.8 d19, r12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vrshrn.u16 d22, q12, #5 -+ vrshrn.u16 d23, q13, #5 -+ vst1.8 {q10-q11}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+@------------------------------------------------------------------------------ -+@ Data -+ -+ .text -+ .balign 64 -+angle_2: -+ .byte 32 -+ .byte 26, 21, 17, 13, 9, 5, 2, 0 -+ @ Sign inverted from standards table -+ .byte 2, 5, 9, 13, 17, 21, 26, 32 -+ .byte 26, 21, 17, 13, 9, 5, 2, 0 -+ @ Standard sign -+ .byte 2, 5, 9, 13, 17, 21, 26, 32 -+ -+ .balign 2 -+ -+ @ Sign inverted from standards table -+inv_angle: -+ .short 4096, 1638, 910, 630, 482, 390, 315 -+ .short 256 -+ .short 315, 390, 482, 630, 910, 1638, 4096 -+ -+@------------------------------------------------------------------------------ -+@ -+@ 10 bit fns -+@ Should work for 9 & 11 bit as there is no actual bit-depth specific code -+@ but runs out of register width for 12+ bit -+ -+ .text -+ .balign 64 -+ -+patch_h_down_4x4_10: -+ ldrd r8, r9, [r2] @ Left -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r6 -+ lsr r8, #16 -+ vdup.16 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r2, #6]! -+ vmov d1, r8, r9 -+ // drop through... -+patch_h_down_4x4_10_continue: -+ mov r5, #4 -+1: -+ subs r12, r4 -+ vmul.u16 d4, d0, d2 -+ it mi -+ addmi r12, #32 -+ vmla.u16 d4, d1, d3 -+ rsb r6, r12, #32 -+ vext.16 q8, q8, q9, #4 -+ it mi -+ lsrmi r7, r8, #16 -+ vmov d18, d19 -+ it mi -+ vmovmi d0, r8, r9 -+ vdup.16 d2, r12 -+ it mi -+ orrmi r8, r7, r9, lsl #16 -+ vrshr.u16 d19, d4, #5 -+ itt mi -+ ldrmi r9, [r2, #2]! -+ vmovmi d1, r8, r9 -+ subs r5, #1 -+ vdup.16 d3, r6 -+ bne 1b -+ // drop through... -+store_tran_4x4_10: -+ vzip.16 d16, d17 -+ add r6, r0, r3 -+ vzip.16 d18, d19 -+ lsl r3, #1 -+ vzip.32 q8, q9 -+ add r5, r0, r3 -+ vst1.16 {d16}, [r0]! -+ vst1.16 {d17}, [r6], r3 -+ vst1.16 {d18}, [r5] -+ asr r3, #1 -+ vst1.16 {d19}, [r6] -+ -+ bx lr -+ -+patch_h_up_4x4_10: -+ ldrd r8, r9, [r2] -+ rsb r6, r4, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r4 -+ lsr r11, r8, #16 -+ vdup.16 d2, r6 -+ ldr r8, [r2, #-2]! -+ orr r9, r11, r9, lsl #16 -+ vmov d1, r8, r9 -+ mov r12, r4 -+ vmul.u16 d4, d0, d2 -+ vmla.u16 d4, d1, d3 -+patch_h_up_4x4_10_continue: -+ mov r5, #4 -+1: -+ add r12, r4 -+ cmp r12, #33 -+ it cs -+ addcs r10, r7 -+ mov r11, #0 -+ itt cs -+ subcs r12, #32 -+ tstcs r10, #1<<31 -+ rsb r6, r12, #32 -+ it eq -+ asreq r11, r10, #7 -+ it cs -+ vmovcs d0, r8, r9 -+ it eq -+ biceq r11, #1 -+ vdup.16 d2, r6 -+ it cs -+ lsrcs r6, r8, #16 -+ vdup.16 d3, r12 -+ vext.16 q8, q8, q9, #4 -+ itt cs -+ orrcs r9, r6, r9, lsl #16 -+ ldrhcs r11, [r1, r11] -+ vmov d18, d19 -+ it hi -+ ldrhhi r11, [r2, #-2]! 
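-+@ Hedged note: the asr #7 / bic #1 above is the same halfword-offset
-+@ trick as the chroma paths - 10-bit samples are two bytes each, so
-+@ the top-row offset 2*(acc >> 8) is formed as (acc >> 7) & ~1.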
-+ vrshr.u16 d19, d4, #5 -+ itt cs -+ orrcs r8, r11, r8, lsl #16 -+ vmovcs d1, r8, r9 -+ vmul.u16 d4, d0, d2 -+ subs r5, #1 -+ vmla.u16 d4, d1, d3 -+ bne 1b -+ -+ b store_tran_4x4_10 -+ -+ -+@ ff_hevc_rpi_pred_angular_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_4_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_4x4_10 -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ bl patch_h_up_4x4_10 -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ ldrh lr, [r2, #-2] @ Top-left -+ ldrh r7, [r7] -+ vmov d0, r8, r9 -+ lsl r9, r9, #16 -+ vdup.16 d2, r12 -+ orr r9, r9, r8, lsr #16 -+ orr r8, lr, r8, lsl #16 -+ vmov d1, r8, r9 -+ sub r1, r7, #128 -+ mov r5, #3 -+1: -+ sel lr, lr, lr @ force pipeline 0 on Cortex-A53 -+ vdup.16 d3, r6 -+ vmul.u16 d4, d0, d2 -+ subs r12, r12, r4 -+ vmla.u16 d4, d1, d3 -+ itttt mi -+ addmi lr, r2, r1, asr #7 -+ bicmi lr, #1 -+ addmi r12, r12, #32 -+ vmovmi d0, r8, r9 -+ rsb r6, r12, #32 -+ itt mi -+ lslmi r9, r9, #16 -+ ldrhmi lr, [lr] -+ vdup.16 d2, r12 -+ vrshr.u16 d4, d4, #5 -+ itttt mi -+ orrmi r9, r9, r8, lsr #16 -+ orrmi r8, lr, r8, lsl #16 -+ vmovmi d1, r8, r9 -+ addmi r1, r1, r7 -+ subs r5, r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vdup.16 d3, r6 -+ nop @ force next insn into pipeline 0 to enable -+ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53 -+ vmla.u16 d4, d1, d3 -+ vrshr.u16 d4, d4, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ ldrd r8, r9, [r1] @ Top -+ rsb r12, r6, #32 -+ vmov d0, r8, r9 -+ vdup.16 d3, r6 -+ lsr r8, #16 -+ vdup.16 d2, r12 -+ orr r8, r8, r9, lsl #16 -+ ldr r9, [r1, #6]! -+ vmov d1, r8, r9 -+ mov r5, #3 -+1: -+ vmul.u16 d4, d0, d2 -+ subs r12, r4 -+ vmla.u16 d4, d1, d3 -+ it mi -+ addmi r12, #32 -+ rsb r6, r12, #32 -+ itt mi -+ vmovmi d0, r8, r9 -+ lsrmi r8, #16 -+ vdup.16 d2, r12 -+ itt mi -+ orrmi r8, r8, r9, lsl #16 -+ ldrmi r9, [r1, #2]! 
-+ vrshr.u16 d4, d4, #5 -+ it mi -+ vmovmi d1, r8, r9 -+ vdup.16 d3, r6 -+ subs r5, #1 -+ vst1.16 {d4}, [r0], r3 -+ bne 1b -+ -+ vmul.u16 d4, d0, d2 -+ vmla.u16 d4, d1, d3 -+ vrshr.u16 d4, d4, #5 -+ vst1.16 {d4}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_8_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ sub r0, #16 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ -+ push {r2} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ pop {r2} -+ -+ sub r0, #16 -+ mov r10, #-128 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q9}, [r1] -+ sub r1, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.16 q2, r6 -+ vext.16 q8, q9, q9, #7 -+ sub r8, r7, #128 -+ vld1.16 {d16[0]}, [r1] -+ vdup.16 q3, r12 -+ mov r5, #7 -+1: -+ vmul.u16 q0, q9, q3 -+ subs r12, r4 -+ vmla.u16 q0, q8, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #1 -+ vext.16 q10, q8, q8, #7 -+ rsb r6, r12, #32 -+ vmov q11, q8 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.16 {d20[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q11, q3 -+ subs r12, r4 -+ vmla.u16 q0, q10, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #1 -+ vext.16 q8, q10, q10, #7 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.16 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q11, q3 -+ vmla.u16 q0, q10, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q9, q3 -+ vmla.u16 q0, q8, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.16 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vext.16 q8, q9, q9, #1 -+ vld1.16 {d17[3]}, [r1]! 
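-+@ Hedged arithmetic check on the 16-bit maths here: the weighted sum
-+@ a*(32 - f) + b*f is at most 32*max_sample, i.e. 32*1023 = 32736 at
-+@ 10 bits and 32*2047 = 65504 at 11 bits, both of which fit u16,
-+@ while 32*4095 = 131040 at 12 bits does not - matching the "12+ bit"
-+@ caveat in the section header above.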
-+ mov r5, #7 -+1: -+ vmul.u16 q0, q8, q2 -+ subs r12, r4 -+ vmla.u16 q0, q9, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q10, q8, q8, #1 -+ rsb r6, r12, #32 -+ vld1.16 {d21[3]}, [r1] -+ sub r5, #1 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #2 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q10, q2 -+ subs r12, r4 -+ vmla.u16 q0, q11, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q8, q10, q10, #1 -+ rsb r6, r12, #32 -+ vld1.16 {d17[3]}, [r1] -+ sub r5, #1 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #2 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q10, q2 -+ vmla.u16 q0, q11, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q8, q2 -+ vmla.u16 q0, q9, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_16_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #32 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<2 -+1: -+ push {r2, r10} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #32 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q0-q1}, [r1] -+ sub r9, r2, #2 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ mov r5, #16 -+1: -+ vld1.16 {d17[3]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.16 q1, q0, q1, #7 -+ add r9, r2, r9, lsl #1 -+ vext.16 q0, q8, q0, #7 -+2: -+ vmul.u16 q11, q2, q10 -+ subs r12, r4 -+ vmla.u16 q11, q0, q9 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q3, q10 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q1, q9 -+ sub r5, #1 -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q0-q1}, [r1]! 
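-+
-+        @ The 16 and 32 pel loops are split in two: the step at 1: shifts the
-+        @ whole reference window along by one sample, the step at 2: only
-+        @ re-blends with updated weights, so consecutive rows that share an
-+        @ integer reference offset skip the shift.  Hedged C outline
-+        @ (illustrative names):
-+        @
-+        @     for (int y = 0; y < size; y++) {
-+        @         if (need_shift)                  // outer step, label 1:
-+        @             shift_in_next_sample(win);
-+        @         blend_row(dst, win, frac);       // inner step, label 2:
-+        @         dst += stride;
-+        @         need_shift = step_frac(&frac);   // true when frac wraps mod 32
-+        @     }
-+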
-+ rsb r12, r6, #32 -+ vld1.16 {d16[0]}, [r5] -+ mov r5, #16 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+1: -+ vmov q2, q0 -+ add r1, #2 -+ vmov q3, q1 -+ vext.16 q0, q0, q1, #1 -+ vext.16 q1, q1, q8, #1 -+2: -+ vmul.u16 q11, q0, q9 -+ subs r12, r4 -+ vmla.u16 q11, q2, q10 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q1, q9 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q3, q10 -+ sub r5, #1 -+ vld1.16 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_32_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r11, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #1 -+ vpush {d8} -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ add sp, #8 -+ mov r10, #8 -+ mov r1, r2 -+1: -+ bl patch_h_down_4x4_10 -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ bl patch_h_down_4x4_10_continue -+ -+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*2 -+ mov r6, r4 -+ sub r0, #64 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ add sp, #8 -+ ldrh r7, [r7] -+ mov r10, #-128 -+ vmov.i8 d6, #1<<6 -+1: -+ push {r2, r10} -+ bl patch_h_up_4x4_10 -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ bl patch_h_up_4x4_10_continue -+ pop {r2, r10} -+ -+ vmov r8, s12 -+ sub r0, #64 -+ add r2, #8 -+ add r0, r0, r3, lsl #2 -+ sub r10, r10, r7, lsl #2 -+ vshr.u8 d6, #1 -+ teq r8, #0 -+ bne 1b -+ -+ pop {r4-r11, pc} -+ -+@ Left of vertical - works down left -+18: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ sub r9, r2, #2 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vmov d0, d9 -+ vmov s2, r12 -+ add r10, r0, #32 -+ mov r5, #32 -+1: -+ vld1.16 {d1[3]}, [r9] -+ add r8, r7 -+ vmov q11, q4 -+ vmov q10, q3 -+ asr r9, r8, #8 -+ vmov q9, q2 -+ add r9, r2, r9, lsl #1 -+ vmov q8, q1 -+ vext.16 q4, q3, q4, #7 -+ vext.16 q3, q2, q3, #7 -+ vext.16 q2, q1, q2, #7 -+ vext.16 q1, q0, q1, #7 -+2: -+ vmul.u16 q12, q8, d1[1] -+ adds r12, r4 -+ vmla.u16 q12, q1, d1[0] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q9, d1[1] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q2, d1[0] -+ sub r5, #1 -+ vmul.u16 q14, q10, d1[1] -+ teq r5, #0 -+ vmla.u16 q14, q3, d1[0] -+ vmul.u16 q15, q11, d1[1] -+ vmla.u16 q15, q4, d1[0] -+ vmov s2, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d0 -+ pop {r4-r11, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, 
[r5] -+ add r1, r1, #64 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ vmov d1, d9 -+ vmov s1, r12 -+ add r10, r0, #32 -+ mov r5, #32 -+1: -+ vld1.16 {d0[0]}, [r1]! -+ vmov q8, q1 -+ vmov q9, q2 -+ vmov q10, q3 -+ vmov q11, q4 -+ vext.16 q1, q1, q2, #1 -+ vext.16 q2, q2, q3, #1 -+ vext.16 q3, q3, q4, #1 -+ vext.16 q4, q4, q0, #1 -+2: -+ vmul.u16 q12, q1, d0[2] -+ adds r12, r4 -+ vmla.u16 q12, q8, d0[3] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q2, d0[2] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q9, d0[3] -+ sub r5, #1 -+ vmul.u16 q14, q3, d0[2] -+ teq r5, #0 -+ vmla.u16 q14, q10, d0[3] -+ vmul.u16 q15, q4, d0[2] -+ vmla.u16 q15, q11, d0[3] -+ vmov s1, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d1 -+ pop {r4-r11, pc} -+ -+endfunc -+ -+ -+ -+@ Generate 4x4 chroma patch -+@ -+@ In (const) -+@ r1 Up ptr (_up only) -+@ r3 Out stride -+@ r4 Angle add -+@ r7 Inv angle (_up only) -+@ -+@ In/Out (updated) -+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) -+@ r2 Left ptr - updated -+@ r6 Angle frac (init to r4 + 32) -+@ r8 Inv angle accumulator -+@ q2 Cur Line - load before 1st call for down - set by _up -+@ q8 Cur Line - load before 1st call for up - set by _down -+@ -+@ Temps -+@ r5 Loop counter -+@ r12 -+@ d0, q1, q12-q15 -+ -+patch_h_down_c_4x4_10: -+ vld1.16 {q12}, [r2]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ mov r5, #4 -+1: -+ vmov q13, q12 -+ vext.16 q12, q12, q12, #2 -+ vld1.32 {d25[1]}, [r2]! -+patch_h_down_c_4x4_10_continue: -+2: -+ vmov q8, q9 -+ subs r12, r4 -+ vmul.u16 q0, q13, q3 -+ it cc -+ addcc r12, #32 -+ vmla.u16 q0, q12, q2 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vmov q10, q11 -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vrshr.u16 q11, q0, #5 -+ bhi 2b -+ bne 1b -+ -+ bcs 3f -+ vmov q13, q12 -+ vext.16 q12, q12, q12, #2 -+ vld1.32 {d25[1]}, [r2]! -+3: -+ -+store_tran_c_4x4_10: -+T add r6, r0, r3 -+ vzip.32 q8, q10 -+A add r6, r0, r3 -+T lsl r3, #1 -+ vzip.32 q9, q11 -+A add r5, r0, r3, lsl #1 -+T add r5, r0, r3 -+ vst2.32 {d16,d18}, [r0]! -+A lsl r3, #1 -+ vst2.32 {d17,d19}, [r6], r3 -+ asr r3, #1 -+ vst2.32 {d20,d22}, [r5] -+ mov r5, #4 -+ vst2.32 {d21,d23}, [r6] -+ bx lr -+ -+patch_h_up_c_4x4_10: -+ vld1.16 {q1}, [r2] -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ mov r5, #4 -+1: -+ adds r8, r7 -+ vmov q12, q1 -+ it mi -+ ldrmi r6, [r2, #-4]! -+ vext.16 q1, q1, q1, #6 -+ itt pl -+ asrpl r6, r8, #8 -+ ldrpl r6, [r1, r6, lsl #2] -+ vmov s4, r6 -+patch_h_up_c_4x4_10_continue: -+2: -+ vmov q8, q9 -+ subs r12, r4 -+ vmul.u16 q0, q12, q3 -+ it cc -+ addcc r12, #32 -+ vmla.u16 q0, q1, q2 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vmov q10, q11 -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vrshr.u16 q11, q0, #5 -+ bhi 2b -+ bne 1b -+ -+ bcs store_tran_c_4x4_10 -+ adds r8, r7 -+ vmov q12, q1 -+ it mi -+ ldrmi r6, [r2, #-4]! 
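-+
-+        @ The horizontal-mode patch helpers build four predicted columns in
-+        @ q8-q11 and transpose on the way out: store_tran_c_4x4_10 above uses
-+        @ vzip.32 followed by interleaved vst2.32 stores so the block lands
-+        @ row-major.  Hedged C picture of that store (illustrative names):
-+        @
-+        @     // col[x] holds the x-th predicted column of the 4x4 patch
-+        @     for (int y = 0; y < 4; y++)
-+        @         for (int x = 0; x < 4; x++)
-+        @             dst[y * stride + x] = col[x].pel[y];
-+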
-+ vext.16 q1, q1, q1, #6 -+ itt pl -+ asrpl r6, r8, #8 -+ ldrpl r6, [r1, r6, lsl #2] -+ vmov s4, r6 -+ b store_tran_c_4x4_10 -+ -+ -+@ ff_hevc_rpi_pred_angular_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ bl patch_h_down_c_4x4_10 -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+ bl patch_h_up_c_4x4_10 -+ pop {r4-r8, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q9}, [r1] -+ sub r1, r2, #4 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ vdup.16 q2, r6 -+ vext.16 q8, q9, q9, #6 -+ sub r8, r7, #128 -+ vld1.32 {d16[0]}, [r1] -+ vdup.16 q3, r12 -+ mov r5, #3 -+1: -+ vmul.u16 q0, q9, q3 -+ subs r12, r4 -+ vmla.u16 q0, q8, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #2 -+ vext.16 q10, q8, q8, #6 -+ rsb r6, r12, #32 -+ vmov q11, q8 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.32 {d20[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q11, q3 -+ subs r12, r4 -+ vmla.u16 q0, q10, q2 -+ ittt cc -+ asrcc r1, r8, #8 -+ addcc r12, #32 -+ addcc r1, r2, r1, lsl #2 -+ vext.16 q8, q10, q10, #6 -+ rsb r6, r12, #32 -+ vmov q9, q10 -+ sub r5, #1 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r8, r7 -+ vld1.32 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q11, q3 -+ vmla.u16 q0, q10, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q9, q3 -+ vmla.u16 q0, q8, q2 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ vld1.16 {q9}, [r1]! -+ rsb r12, r6, #32 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vext.16 q8, q9, q9, #2 -+ vld1.32 {d17[1]}, [r1]! 
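-+
-+        @ The _c_ functions work on interleaved CbCr pairs, which is why the
-+        @ element shifts above are #2/#6 (in u16 lanes) and the lane loads are
-+        @ 32-bit: one "pixel" is two u16s moved together.  Hedged C view
-+        @ (illustrative names):
-+        @
-+        @     typedef struct { uint16_t cb, cr; } cpel;
-+        @     out.cb = ((32 - frac) * a.cb + frac * b.cb + 16) >> 5;
-+        @     out.cr = ((32 - frac) * a.cr + frac * b.cr + 16) >> 5;
-+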
-+ mov r5, #3 -+1: -+ vmul.u16 q0, q8, q2 -+ subs r12, r4 -+ vmla.u16 q0, q9, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q10, q8, q8, #2 -+ rsb r6, r12, #32 -+ vld1.32 {d21[1]}, [r1] -+ sub r5, #1 -+ vmov q11, q8 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #4 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 1b -+ beq 4f -+2: -+ vmul.u16 q0, q10, q2 -+ subs r12, r4 -+ vmla.u16 q0, q11, q3 -+ it cc -+ addcc r12, #32 -+ vext.16 q8, q10, q10, #2 -+ rsb r6, r12, #32 -+ vld1.32 {d17[1]}, [r1] -+ sub r5, #1 -+ vmov q9, q10 -+ teq r5, #0 -+ vrshr.u16 q0, q0, #5 -+ it cc -+ addcc r1, #4 -+ vdup.16 q2, r6 -+ vdup.16 q3, r12 -+ vst1.16 {q0}, [r0], r3 -+ bhi 2b -+ bne 1b -+ bcc 5f -+3: -+ vmul.u16 q0, q10, q2 -+ vmla.u16 q0, q11, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+4: -+ bcc 3b -+5: -+ vmul.u16 q0, q8, q2 -+ vmla.u16 q0, q9, q3 -+ vrshr.u16 q0, q0, #5 -+ vst1.16 {q0}, [r0] -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r8, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ mov r1, r2 @ save r2 - r1 unused by patch_down -+ -+ bl patch_h_down_c_4x4_10 -+ bl patch_h_down_c_4x4_10_continue -+ -+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left -+ sub r0, #32 -+ mov r6, r4 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_down_c_4x4_10 -+ bl patch_h_down_c_4x4_10_continue -+ -+ pop {r4-r8, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+ -+ push {r2, r8} -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ pop {r2, r8} -+ -+ sub r0, #32 -+ mov r6, r4 -+ add r2, #16 -+ sub r8, r8, r7, lsl #2 -+ add r0, r0, r3, lsl #2 -+ -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ -+ pop {r4-r8, pc} -+ -+@ Left of vertical - works down left -+18: -+ vld1.16 {q0-q1}, [r1] -+ sub r9, r2, #4 -+ rsb r12, r6, #32 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ mov r5, #8 -+1: -+ vld1.32 {d17[1]}, [r9] -+ add r8, r7 -+ vmov q2, q0 -+ vmov q3, q1 -+ asr r9, r8, #8 -+ vext.16 q1, q0, q1, #6 -+ add r9, r2, r9, lsl #2 -+ vext.16 q0, q8, q0, #6 -+2: -+ vmul.u16 q11, q2, q10 -+ subs r12, r4 -+ vmla.u16 q11, q0, q9 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q3, q10 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q1, q9 -+ sub r5, #1 -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r8, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q0-q1}, [r1]! 
-+ rsb r12, r6, #32 -+ vld1.32 {d16[0]}, [r5] -+ mov r5, #8 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+1: -+ vmov q2, q0 -+ add r1, #4 -+ vmov q3, q1 -+ vext.16 q0, q0, q1, #2 -+ vext.16 q1, q1, q8, #2 -+2: -+ vmul.u16 q11, q0, q9 -+ subs r12, r4 -+ vmla.u16 q11, q2, q10 -+ it cc -+ addcc r12, #32 -+ vmul.u16 q12, q1, q9 -+ rsb r6, r12, #32 -+ vmla.u16 q12, q3, q10 -+ sub r5, #1 -+ vld1.32 {d16[0]}, [r1] -+ teq r5, #0 -+ vdup.16 q9, r6 -+ vdup.16 q10, r12 -+ vrshr.u16 q11, q11, #5 -+ vrshr.u16 q12, q12, #5 -+ vst1.16 {q11-q12}, [r0], r3 -+ bhi 2b -+ bne 1b -+ -+ pop {r4-r8, pc} -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_angular_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride [r3] -+@ unsigned int mode [sp, #0] 2..34 -+ -+function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1 -+ ldr r12, [sp] -+ push {r4-r10, lr} -+ ADRT r4, angle_2 - 2 -+ ADRT r7, inv_angle - 11*2 -+ add r7, r7, r12, lsl #1 -+ lsl r3, #2 -+ vpush {d8} -+ ldrsb r6, [r4, r12] -+ cmp r12, #26 -+ ldrsb r4, [r4, r12] -+ bge 26f -+ cmp r12, #18 -+ bge 18f -+ cmp r12, #10 -+ bge 10f -+ -+@ Down of Horizontal - works down left -+ add sp, #8 -+ mov r10, #4 -+ mov r1, r2 -+1: -+ bl patch_h_down_c_4x4_10 -+ bl patch_h_down_c_4x4_10_continue -+ bl patch_h_down_c_4x4_10_continue -+ bl patch_h_down_c_4x4_10_continue -+ -+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left -+ add r1, r1, #4*4 -+ mov r6, r4 -+ sub r0, #64 -+ subs r10, #1 -+ add r0, r0, r3, lsl #2 -+ bne 1b -+ -+ pop {r4-r10, pc} -+ -+@ Up of Horizontal - works down up -+10: -+ add sp, #8 -+ mov r10, #4 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ sub r8, r7 -+2: -+ push {r2, r8} -+ bl patch_h_up_c_4x4_10 -+ bl patch_h_up_c_4x4_10_continue -+ bl patch_h_up_c_4x4_10_continue -+ bl patch_h_up_c_4x4_10_continue -+ pop {r2, r8} -+ -+ sub r0, #64 -+ mov r6, r4 -+ add r2, #16 -+ sub r8, r8, r7, lsl #2 -+ add r0, r0, r3, lsl #2 -+ subs r10, #1 -+ bne 2b -+ -+ pop {r4-r10, pc} -+ -+@ Left of vertical - works down left -+18: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ sub r9, r2, #4 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ ldrh r7, [r7] -+ mov r8, #-128 -+ vmov d0, d9 -+ vmov s2, r12 -+ add r10, r0, #32 -+ mov r5, #16 -+1: -+ vld1.32 {d1[1]}, [r9] -+ add r8, r7 -+ vmov q11, q4 -+ vmov q10, q3 -+ asr r9, r8, #8 -+ vmov q9, q2 -+ add r9, r2, r9, lsl #2 -+ vmov q8, q1 -+ vext.16 q4, q3, q4, #6 -+ vext.16 q3, q2, q3, #6 -+ vext.16 q2, q1, q2, #6 -+ vext.16 q1, q0, q1, #6 -+2: -+ vmul.u16 q12, q8, d1[1] -+ adds r12, r4 -+ vmla.u16 q12, q1, d1[0] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q9, d1[1] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q2, d1[0] -+ sub r5, #1 -+ vmul.u16 q14, q10, d1[1] -+ teq r5, #0 -+ vmla.u16 q14, q3, d1[0] -+ vmul.u16 q15, q11, d1[1] -+ vmla.u16 q15, q4, d1[0] -+ vmov s2, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d0 -+ pop {r4-r10, pc} -+ -+@ Right of vertical - works along top - left unused -+26: -+ add r5, r1, #32 -+ vld1.16 {q1-q2}, [r1] -+ rsb r12, r6, r6, lsl #16 -+ vld1.16 {q3-q4}, [r5] -+ add r1, r1, #64 -+ rsb r4, r12, #0 -+ rsb r12, r12, #32 << 16 -+ vmov d1, d9 -+ vmov s1, r12 -+ add r10, r0, #32 -+ mov r5, #16 -+1: -+ vld1.32 {d0[0]}, [r1]! 
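-+
-+        @ Here both weights live in one GPR: after the rsb pair above
-+        @ r12 == ((32 - frac) << 16) | frac (equivalently (32 << 16) - frac * 0xffff),
-+        @ it is dropped into s1 with vmov, and the two halves are applied via
-+        @ the d0[2]/d0[3] scalar-lane multiplies, so a single adds on r12 plus
-+        @ the conditional fix-ups steps both weights at once.
-+        @
-+        @     // hedged view of the packed weights (illustrative):
-+        @     // lo16 = frac, hi16 = 32 - frac
-+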
-+ vmov q8, q1 -+ vmov q9, q2 -+ vmov q10, q3 -+ vmov q11, q4 -+ vext.16 q1, q1, q2, #2 -+ vext.16 q2, q2, q3, #2 -+ vext.16 q3, q3, q4, #2 -+ vext.16 q4, q4, q0, #2 -+2: -+ vmul.u16 q12, q1, d0[2] -+ adds r12, r4 -+ vmla.u16 q12, q8, d0[3] -+ it cc -+ addcc r12, #32 << 16 -+ vmul.u16 q13, q2, d0[2] -+ it cc -+ subcc r12, #32 -+ vmla.u16 q13, q9, d0[3] -+ sub r5, #1 -+ vmul.u16 q14, q3, d0[2] -+ teq r5, #0 -+ vmla.u16 q14, q10, d0[3] -+ vmul.u16 q15, q4, d0[2] -+ vmla.u16 q15, q11, d0[3] -+ vmov s1, r12 -+ vrshr.u16 q12, q12, #5 -+ vrshr.u16 q13, q13, #5 -+ vrshr.u16 q14, q14, #5 -+ vrshr.u16 q15, q15, #5 -+ vst1.16 {q12-q13}, [r0], r3 -+ vst1.16 {q14-q15}, [r10], r3 -+ bhi 2b -+ bne 1b -+ -+ vpop {d8} -+ vmov d9, d1 -+ pop {r4-r10, pc} -+ -+endfunc -diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S -new file mode 100644 -index 0000000000..df8c1c25b9 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S -@@ -0,0 +1,705 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+ -+@ ff_hevc_rpi_pred_dc_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_4_neon_8, export=1 -+ -+ @ Average the els of top & left -+ ldr r2, [r2] -+ vld1.32 {d0[0]}, [r1] -+ mov r1, #2 -+ vmov s1, r2 -+ vmov s2, r2 -+ vmov.i16 q2, #3 -+ add r2, r0, r3 -+ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] -+ lsl r3, #1 -+ vmovl.u8 q0, d0 -+ vmov.i64 d7, #0xffff -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
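-+
-+        @ q2 now holds the per-lane dc weights {2, 3, 3, 3, ...}: lane 0 is
-+        @ the corner, which folds in both top[0] and left[0].  A hedged C
-+        @ model of the filtered-DC output this function builds (matching the
-+        @ smoothing comments below; illustrative names, stride in pels):
-+        @
-+        @     int dc = 0;
-+        @     for (int i = 0; i < 4; i++) dc += top[i] + left[i];
-+        @     dc = (dc + 4) >> 3;
-+        @     dst[0] = (top[0] + left[0] + 2 * dc + 2) >> 2;
-+        @     for (int i = 1; i < 4; i++) {
-+        @         dst[i]          = (top[i]  + 3 * dc + 2) >> 2;  // top row
-+        @         dst[i * stride] = (left[i] + 3 * dc + 2) >> 2;  // left col
-+        @     }
-+        @     // every other sample is plain dc
-+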
-+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) -+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vmov.i64 d7, #0xff -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #3 -+ vmla.i16 q0, q2, d6[0] -+ vdup.8 d6, d6[0] -+ vrshrn.i16 d0, q0, #2 -+ -+ @ Store top line -+ vst1.32 {d0[0]}, [r0], r3 -+ -+ @ Store the rest -+ vshr.u64 d1, d0, #5*8 -+ vshr.u64 d2, d0, #6*8 -+ vshr.u64 d3, d0, #7*8 -+ vbif d1, d6, d7 -+ vbif d2, d6, d7 -+ vst1.32 {d1[0]}, [r2], r3 -+ vbif d3, d6, d7 -+ vst1.32 {d2[0]}, [r0] -+ vst1.32 {d3[0]}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {d0}, [r1] -+ vld1.8 {d1}, [r2] -+A add r2, r0, r3, lsl #1 -+A lsl r3, #2 -+T lsl r3, #1 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vaddl.u8 q0, d0, d1 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d2, d0, d0 @ This adds U & V separately -+ vpadd.i32 d3, d0, d0 -+ vrshrn.u16 d0, q1, #3 -+ -+ @ Store -+ vst1.8 {d0}, [r0], r3 -+ vst1.8 {d0}, [r2], r3 -+ vst1.8 {d0}, [r0] -+ vst1.8 {d0}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_8_neon_8, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {d0}, [r1] -+ mov r1, #2 -+ vld1.8 {d16}, [r2] -+ vmov.i16 q2, #3 -+ vmov.i64 d7, #0xffff -+ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] -+ vmovl.u8 q0, d0 -+ vadd.i16 d6, d2, d3 @ d6 has 4 vals -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+        vbit       d0, d2, d7      @ q0 = top[0]+left[0], top[1..7]
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ top_line[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        vmov.i64   d7, #0xff
-+        vmovl.u8   q1, d16
-+        vpadd.i16  d6, d6          @ 2 (top & bottom of vector the same)
-+        vpadd.i16  d6, d6          @ 1 (all the same)
-+        vrshr.u16  d6, #4
-+        vmla.i16   q1, q2, d6[0]
-+        vmla.i16   q0, q2, d6[0]
-+        vdup.8     d6, d6[0]
-+        vrshrn.i16 d2, q1, #2
-+        vrshrn.i16 d0, q0, #2
-+
-+        @ Store top line
-+        vst1.8     {d0}, [r0], r3
-+
-+        @ Store the rest
-+        vshr.u64   d2, #8
-+        vbit       d6, d2, d7
-+        vshr.u64   d2, #8
-+        vst1.8     {d6}, [r0], r3
-+        mov        r1, #6
-+1:
-+        vbit       d6, d2, d7
-+        vshr.u64   d2, #8
-+        vst1.8     {d6}, [r0], r3
-+        subs       r1, #2
-+        vbit       d6, d2, d7
-+        vshr.u64   d2, #8
-+        vst1.8     {d6}, [r0], r3
-+        bne        1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_8_neon_8
-+@       uint8_t *_src,        [r0]
-+@       const uint8_t *_top,  [r1]
-+@       const uint8_t *_left, [r2]
-+@       ptrdiff_t stride)     [r3]
-+
-+function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8     {q0}, [r1]
-+        mov        r1, #8
-+        vld1.8     {q1}, [r2]
-+T       lsl        r3, #1
-+        vaddl.u8   q0, d0, d1
-+A       add        r2, r0, r3, lsl #1
-+A       lsl        r3, #2
-+T       add        r2, r0, r3
-+T       lsl        r3, #1
-+        vaddl.u8   q1, d2, d3
-+        vadd.i16   q1, q0
-+        vadd.i16   d3, d2          @ d3 has 2 val pairs
-+        vpadd.i32  d2, d3, d3      @ This adds U & V separately
-+        vpadd.i32  d3, d3, d3
-+        vrshrn.u16 d0, q1, #4
-+        vrshrn.u16 d1, q1, #4
-+
-+        @ Store
-+1:
-+        vst1.8     {q0}, [r0], r3
-+        subs       r1, #4
-+        vst1.8     {q0}, [r2], r3
-+        vst1.8     {q0}, [r0], r3
-+        vst1.8     {q0}, [r2], r3
-+        bne        1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_16_neon_8
-+@       uint8_t *_src,        [r0]
-+@       const uint8_t *_top,  [r1]
-+@       const uint8_t *_left, [r2]
-+@       ptrdiff_t stride)     [r3]
-+
-+function ff_hevc_rpi_pred_dc_16_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8     {q8}, [r1]
-+        mov        r1, #2
-+        vld1.8     {q9}, [r2]
-+        vaddl.u8   q10, d16, d17
-+        vaddl.u8   q11, d16, d18
-+        vaddl.u8   q0, d18, d19
-+        vmov.i16   q1, #3
-+        vadd.i16   q10, q0
-+        vmovl.u8   q0, d18
-+        vadd.i16   d20, d21
-+        vmov.i16   d2[0], r1       @ 2, 3, 3, 3...
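-+
-+        @ In the _c_ versions above note how the final reduction uses
-+        @ vpadd.i32 rather than vpadd.i16: adding the two 32-bit halves sums
-+        @ the even (U) and odd (V) 16-bit lanes independently, so one
-+        @ instruction produces a separate dc per plane - valid only while the
-+        @ 16-bit partial sums cannot carry, hence the headroom notes later.
-+        @ Hedged C equivalent (illustrative names):
-+        @
-+        @     for (int i = 0; i < n; i++) {
-+        @         dc_u += top[2 * i]     + left[2 * i];
-+        @         dc_v += top[2 * i + 1] + left[2 * i + 1];
-+        @     }
-+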
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ top_line[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        vmovl.u8   q2, d16
-+        vmovl.u8   q9, d19
-+        vpadd.i16  d20, d20        @ 2 (top & bottom of vector the same)
-+        vmov.i64   d7, #0xffff
-+        vmovl.u8   q8, d17
-+        vbit       d4, d22, d7     @ q2 = top[0]+left[0], top[1..7]
-+        vmov.i64   d7, #0xff
-+        vpadd.i16  d20, d20        @ 1 (all the same)
-+        vrshr.u16  d21, d20, #5
-+        vrshr.u16  d20, d20, #5
-+        vmla.i16   q0, q10, d2[1]
-+        vmla.i16   q9, q10, d2[1]
-+        vmla.i16   q2, q10, q1
-+        vmla.i16   q8, q10, d2[1]
-+        vdup.8     q1, d20[0]
-+        vrshrn.i16 d0, q0, #2
-+        vrshrn.i16 d1, q9, #2
-+        vrshrn.i16 d4, q2, #2
-+        vrshrn.i16 d5, q8, #2
-+        vext.8     q0, q0, q0, #1
-+
-+        @ Store top line
-+        vst1.8     {q2}, [r0], r3
-+
-+        @ Store the rest
-+        mov        r1, #15
-+1:
-+        vbit       d2, d0, d7
-+        vext.8     q0, q0, q0, #1
-+        subs       r1, #1
-+        vst1.8     {q1}, [r0], r3
-+        bne        1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_16_neon_8
-+@       uint8_t *_src,        [r0]
-+@       const uint8_t *_top,  [r1]
-+@       const uint8_t *_left, [r2]
-+@       ptrdiff_t stride)     [r3]
-+
-+function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8     {q0-q1}, [r1]
-+        mov        r1, #16
-+        vld1.8     {q2-q3}, [r2]
-+T       lsl        r3, #1
-+        vaddl.u8   q0, d0, d1
-+A       add        r2, r0, r3, lsl #1
-+T       add        r2, r0, r3
-+        vaddl.u8   q1, d2, d3
-+A       lsl        r3, #2
-+T       lsl        r3, #1
-+        vaddl.u8   q2, d4, d5
-+        vaddl.u8   q3, d6, d7
-+        vadd.i16   q0, q1
-+        vadd.i16   q2, q3
-+        vadd.i16   q0, q2
-+        vadd.i16   d0, d1          @ d0 has 2 val pairs
-+        vpadd.i32  d4, d0, d0      @ This adds U & V separately
-+        vpadd.i32  d5, d0, d0
-+        vrshrn.u16 d0, q2, #5
-+        vrshrn.u16 d1, q2, #5
-+        vrshrn.u16 d2, q2, #5
-+        vrshrn.u16 d3, q2, #5
-+
-+        @ Store
-+1:
-+        vst1.8     {q0-q1}, [r0], r3
-+        subs       r1, #2
-+        vst1.8     {q0-q1}, [r2], r3
-+        bne        1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_32_neon_8
-+@       uint8_t *_src,        [r0]
-+@       const uint8_t *_top,  [r1]
-+@       const uint8_t *_left, [r2]
-+@       ptrdiff_t stride)     [r3]
-+
-+function ff_hevc_rpi_pred_dc_32_neon_8, export=1
-+
-+        @ Average the els of top & left
-+        vld1.8     {q0-q1}, [r1]
-+        mov        r1, #32
-+        vld1.8     {q2-q3}, [r2]
-+        add        r2, r0, r3
-+        vaddl.u8   q0, d0, d1
-+        lsl        r3, #1
-+        vaddl.u8   q1, d2, d3
-+        vaddl.u8   q2, d4, d5
-+        vaddl.u8   q3, d6, d7
-+        vadd.i16   q0, q1
-+        vadd.i16   q2, q3
-+        vadd.i16   q0, q2
-+        vadd.i16   d0, d1          @ d0 has 4 vals
-+        vpadd.i16  d0, d0          @ 2 (top & bottom the same)
-+        vpadd.i16  d4, d0, d0      @ 1 (all the same)
-+        vpadd.i16  d5, d0, d0
-+        vrshrn.u16 d0, q2, #6
-+        vrshrn.u16 d1, q2, #6
-+        vrshrn.u16 d2, q2, #6
-+        vrshrn.u16 d3, q2, #6
-+
-+        @ Store
-+1:
-+        vst1.8     {q0-q1}, [r0], r3
-+        subs       r1, #2
-+        vst1.8     {q0-q1}, [r2], r3
-+        bne        1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ -----------------------------------------------------------------------------
-+@
-+@ 10 Bit versions
-+@
-+@ There is no actual bit depth dependency in this code except that at higher
-+@ bit depths our intermediate results would overflow the 16 bits they are
-+@ stored in.  All these functions are good to 10 bits - with the worst case
-+@ being in dc_32 where we use all 16 bits.
-+
-+
-+@ ff_hevc_rpi_pred_dc_4_neon_10
-+@       uint8_t *_src,        [r0]
-+@       const uint8_t *_top,  [r1]
-+@       const uint8_t *_left, [r2]
-+@       ptrdiff_t stride)     [r3]
-+
-+function ff_hevc_rpi_pred_dc_4_neon_10, export=1
-+
-+        @ Average the els of top & left
-+        vld1.16    {d0}, [r1]
-+        mov        r1, #2
-+        vld1.16    {d1}, [r2]
-+T       lsl        r3, #1
-+        vmov.i16   q2, #3
-+A       add        r2, r0, r3, lsl #1
-+T       add        r2, r0, r3
-+        vadd.u16   d2, d0, d1      @ d2[0] = top[0] + left[0]
-+A       lsl        r3, #2
-+T       lsl        r3, #1
-+        vmov.16    d4[0], r1       @ 2, 3, 3, 3...
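-+
-+        @ Worked worst case for the banner above: dc_32 sums 64 samples of at
-+        @ most 1023, i.e. 64 * 1023 = 65472, and the rounding add of the
-+        @ final vrshr #6 takes that to 65504 - still under 65536, so the u16
-+        @ intermediates never wrap at 10-bit depth.
-+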
-+ vmov.i64 d7, #0xffff -+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] -+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ top_line[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d6, d6 @ 1 (all the same) -+ vrshr.u16 d6, #3 -+ vmla.i16 q0, q2, d6[0] -+ vrshr.u16 q0, #2 -+ -+ @ Store top line -+ vst1.16 {d0}, [r0], r3 -+ -+ @ Store the rest -+ vshr.u64 d3, d1, #1*16 -+ vshr.u64 d4, d1, #2*16 -+ vshr.u64 d5, d1, #3*16 -+ vbif d3, d6, d7 -+ vbif d4, d6, d7 -+ vst1.16 {d3}, [r2], r3 -+ vbif d5, d6, d7 -+ vst1.16 {d4}, [r0] -+ vst1.16 {d5}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels - needs * 4) -+ -+function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.8 {q0}, [r1] -+ vld1.8 {q1}, [r2] -+A add r2, r0, r3, lsl #2 -+A lsl r3, #3 -+T lsl r3, #2 -+T add r2, r0, r3 -+T lsl r3, #1 -+ vadd.i16 q0, q1 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d2, d0, d0 @ This adds U & V separately -+ vpadd.i32 d3, d0, d0 -+ vrshr.u16 q0, q1, #3 -+ -+ vst1.16 {q0}, [r0], r3 -+ vst1.16 {q0}, [r2], r3 -+ vst1.16 {q0}, [r0] -+ vst1.16 {q0}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_dc_8_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vld1.16 {q0}, [r1] -+ mov r1, #2 -+ vld1.16 {q8}, [r2] -+T lsl r3, #1 -+ vmov.i16 q2, #3 -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] -+A lsl r3, #2 -+T lsl r3, #1 -+ vmov.i64 d7, #0xffff -+ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
-+        vadd.i16   d6, d2, d3      @ d6 has 4 vals
-+        vbit       d0, d2, d7      @ q0 = top[0]+left[0], top[1..7]
-+
-+        @ top line gets some smoothing
-+        @ (top[i] + 3*dc + 2) >> 2
-+        @ as does left
-+        @ top_line[0] is extra special
-+        @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+        vpadd.i16  d6, d6          @ 2 (top & bottom of vector the same)
-+        vpadd.i16  d6, d6          @ 1 (all the same)
-+        vrshr.u16  d6, #4
-+        vmla.i16   q8, q2, d6[0]
-+        vmla.i16   q0, q2, d6[0]
-+        vdup.16    q2, d6[0]
-+        vdup.16    q9, d6[0]
-+        vrshr.u16  q8, q8, #2
-+        vrshr.u16  q0, q0, #2
-+        vext.16    q1, q8, q8, #1
-+
-+        @ Store top line
-+        vst1.16    {q0}, [r0], r3
-+
-+        @ Store the rest
-+        vbit       d18, d2, d7
-+        vst1.16    {q9}, [r2], r3
-+        mov        r1, #6
-+1:
-+        vext.16    q8, q8, q8, #2
-+        subs       r1, #2
-+        vext.16    q1, q1, q1, #2
-+        vbit       d4, d16, d7
-+        vst1.16    {q2}, [r0], r3
-+        vbit       d18, d2, d7
-+        vst1.16    {q9}, [r2], r3
-+        bne        1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_8_neon_10
-+@       uint8_t *_src,        [r0]
-+@       const uint8_t *_top,  [r1]
-+@       const uint8_t *_left, [r2]
-+@       ptrdiff_t stride)     [r3] (In pels - needs * 4)
-+
-+function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
-+
-+        @ Average the els of top & left
-+        vld1.16    {q0-q1}, [r1]
-+        mov        r1, #8
-+        vld1.16    {q2-q3}, [r2]
-+T       lsl        r3, #2
-+        vadd.i16   q1, q0
-+A       add        r2, r0, r3, lsl #2
-+A       lsl        r3, #3
-+T       add        r2, r0, r3
-+T       lsl        r3, #1
-+        vadd.i16   q2, q3
-+        vadd.i16   q1, q2
-+        vadd.i16   d3, d2          @ d3 has 2 val pairs
-+        vpadd.i32  d2, d3, d3      @ This adds U & V separately
-+        vpadd.i32  d3, d3, d3
-+        vrshr.u16  q0, q1, #4
-+        vrshr.u16  q1, q1, #4
-+
-+        @ Store
-+1:
-+        vst1.8     {q0-q1}, [r0], r3
-+        subs       r1, #2
-+        vst1.8     {q0-q1}, [r2], r3
-+        bne        1b
-+
-+        bx         lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_16_neon_10
-+@       uint8_t *_src,        [r0]
-+@       const uint8_t *_top,  [r1]
-+@       const uint8_t *_left, [r2]
-+@       ptrdiff_t stride)     [r3]
-+
-+function ff_hevc_rpi_pred_dc_16_neon_10, export=1
-+
-+        @ Average the els of top & left
-+        vld1.16    {q8-q9}, [r1]
-+        mov        r1, #2
-+        vld1.16    {q10-q11}, [r2]
-+        lsl        r3, #1          @ stride given in pels
-+        vadd.i16   q0, q8, q9
-+        vadd.i16   q1, q10, q11
-+        vmov.i16   q3, #3
-+        vadd.i16   q1, q0
-+        vadd.i16   d0, d16, d20
-+        vmov.i64   d31, #0xffff
-+        vadd.i16   d3, d2
-+        vmov.16    d6[0], r1       @ 2, 3, 3, 3...
-+ -+ @ top line gets some smoothing -+ @ (top[i] + 3*dc + 2) >> 2 -+ @ as does left -+ @ topline[0] is extra special -+ @ (top[0] + left[0] + 2*dc + 2) >> 2 -+ -+ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] -+ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) -+ vpadd.i16 d3, d3 @ 1 (all the same) -+ vrshr.u16 d2, d3, #5 -+ vrshr.u16 d3, d3, #5 -+ vmov q0, q1 -+ vmla.i16 q10, q1, d6[1] -+ vmla.i16 q11, q1, d6[1] -+ vmla.i16 q8, q1, q3 -+ vmla.i16 q9, q1, d6[1] -+ vrshr.u16 q2, q10, #2 -+ vrshr.u16 q3, q11, #2 -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 -+ vext.16 q2, q2, q2, #1 -+ mov r1, #7<<29 -+ -+ @ Store top line -+ vst1.16 {q8-q9}, [r0], r3 -+ -+ @ Store the rest -+1: -+ vbit d0, d4, d31 -+ vext.16 q2, q2, q2, #1 -+ subs r1, #1<<29 -+ vst1.16 {q0-q1}, [r0], r3 -+ bne 1b -+1: -+ vbit d0, d6, d31 -+ vext.16 q3, q3, q3, #1 -+ subs r1, #1<<29 -+ vst1.16 {q0-q1}, [r0], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels - needs * 4) -+ -+function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 -+ -+ @ Average the els of top & left -+ vldm r1, {q0-q3} -+ vldm r2, {q8-q11} -+ vadd.i16 q0, q1 -+ mov r1, #16 -+ vadd.i16 q2, q3 -+ add r2, r0, #32 -+ vadd.i16 q8, q9 -+ lsl r3, #2 -+ vadd.i16 q10, q11 -+ vadd.u16 q0, q2 -+ vadd.u16 q8, q10 -+ vadd.i16 q0, q8 -+ vadd.i16 d0, d1 @ d0 has 2 val pairs -+ vpadd.i32 d4, d0, d0 @ This adds U & V separately -+ vpadd.i32 d5, d0, d0 -+ vrshr.u16 q0, q2, #5 -+ vrshr.u16 q1, q2, #5 -+ -+ @ Store -+1: -+ vst1.16 {q0-q1}, [r0], r3 -+ subs r1, #1 -+ vst1.16 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_dc_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] (In pels) -+ -+function ff_hevc_rpi_pred_dc_32_neon_10, export=1 -+ -+ @ Average the els of top & left -+ @ With 10 bits we are (just) safe from overflow in i16 -+ vldm r1, {q0-q3} -+ vldm r2, {q8-q11} -+ vadd.i16 q0, q1 -+ mov r1, #32 -+ vadd.i16 q2, q3 -+ add r2, r0, #32 -+ vadd.i16 q8, q9 -+ lsl r3, #1 -+ vadd.i16 q10, q11 -+ vadd.u16 q0, q2 -+ vadd.u16 q8, q10 -+ vadd.i16 q0, q8 -+ vadd.i16 d0, d1 @ d0 has 4 vals -+ vpadd.i16 d0, d0 @ 2 (top & bottom the same) -+ vpadd.i16 d4, d0, d0 @ 1 (all the same) -+ vpadd.i16 d5, d0, d0 -+ vrshr.u16 q0, q2, #6 -+ vrshr.u16 q1, q2, #6 -+ -+ @ Store -+1: -+ vst1.16 {q0-q1}, [r0], r3 -+ subs r1, #1 -+ vst1.16 {q0-q1}, [r2], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S -new file mode 100644 -index 0000000000..f6969d3591 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S -@@ -0,0 +1,881 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. 
-+ * Neither the name of the copyright holder nor the
-+ names of its contributors may be used to endorse or promote products
-+ derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ All functions have the call
-+@
-+@ int ff_hevc_rpi_intra_filter_N_neon_PW(
-+@       pixel * const left,                 [r0]
-+@       pixel * const top,                  [r1]
-+@       const unsigned int req,             [r2]
-+@       const unsigned int avail,           [r3]
-+@       const pixel * const src_l,          [sp, #0]
-+@       const pixel * const src_u,          [sp, #4]
-+@       const pixel * const src_ur,         [sp, #8]
-+@       const unsigned int stride,          [sp, #12]  (pels)
-+@       const unsigned int top_right_size,  [sp, #16]
-+@       const unsigned int down_left_size)  [sp, #20]
-+@
-+@ Assumptions:
-+@ (that don't apply to all frame layouts but do apply to sand, so beware
-+@  if reusing this code)
-+@
-+@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
-+@ N==4, but do for chroma N>=8.  As we share Y/C fns that means we can ignore
-+@ N==8,PW=8 (chroma always PW>8) but have to cope with larger
-+@
-+@ We always have at least 64 pixel H frame width rounding - this lets us
-+@ load UR without having to worry about exactly how many pixels are actually
-+@ within the frame.  As partial loads will only occur very occasionally this
-+@ should be a win in nearly all cases.
-+@
-+@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
-+@ so we do no maths on the contents
-+@
-+@ No filtering in 32bit fns as they are chroma only
-+
-+
-+.equ    AVAIL_UR,       1
-+.equ    AVAIL_U,        2
-+.equ    AVAIL_UL,       4
-+.equ    AVAIL_L,        8
-+.equ    AVAIL_DL,       16
-+
-+.equ    FILTER_LIGHT,   0x40
-+.equ    FILTER_STRONG,  0x80
-+
-+.equ    AVAIL_S_UR_N_U_C,  32 - 1
-+.equ    AVAIL_S_U_N_UL_C,  32 - 2
-+.equ    AVAIL_S_UL_N_L_C,  32 - 3
-+.equ    AVAIL_S_L_N_DL_C,  32 - 4
-+
-+.equ    AVAIL_S_U_DL_CPSR, 31 - 4  @ Shift for u..dl to go into flags via cpsr
-+
-+@ On entry
-+@  r2   req
-+@  r3   avail
-+@ [sp, #sp_offset...]  args
-+@
-+@ On Exit:
-+@
-+@ Extend values:
-+@  d_l  scalar containing value for L & DL
-+@       if DL avail then this is DL[0] so we don't need to load that
-+@  d_ul scalar containing value for UL
-+@  d_u  scalar containing value for U
-+@  d_ur scalar containing value for UR
-+@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
-+@ This means that L-light-filter works even if nreq DL (we never filter -+@ req-DL without req-L, but we do filter req-L without req-DL) -+@ If UR avail then d_ur == a_ur so U-filter good too -+@ -+@ Data load pointers (only load if req & avail): -+@ r4 DL + stride -+@ r10 L -+@ r6 U -+@ r5 UR -+@ -+@ Others: -+@ r2 req -+@ r7 req & avail -+@ r3 L + stride -+@ r8 DL + stride * 2 -+@ r9 stride * 2 -+@ cs Load U -+@ mi Load UR -+@ -+@ Clobbered: -+@ r12 -+ -+.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur -+ -+.equ src_l\@, \sp_offset + 0 -+.equ src_u\@, \sp_offset + 4 -+.equ src_ur\@, \sp_offset + 8 -+.equ stride\@, \sp_offset + 12 -+.equ pw\@, (1 << \pw_s) @ pel width in bytes -+.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes -+ -+@ r9 stride -+@ r7 = ab_ul, r6 = a_u, r5 = a_ur -+@ r4 = b_dl, r10 = b_l, r8 = b_u -+ -+ ldr r5, [sp, #src_ur\@] -+ lsl r12, r3, #AVAIL_S_U_DL_CPSR -+ ldr r10, [sp, #src_l\@] -+ ldr r9, [sp, #stride\@] -+ ldr r6, [sp, #src_u\@] -+ -+ @ This is quite a slow instruction but it replaces -+ @ a decent number of tests that yield a max of 2 flags/op -+ @ It is annoying we can't branch on Q! -+ @ If L navail (ne) then DL must be navail (pl) -+ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur -+ -+ mov r4, r5 -+ sub r7, r10, r9 -+ it vs -+ movvs r4, r6 -+ add r8, r6, #b_size\@ - pw\@ -+ it cs -+ movcs r4, r7 -+ ite ne -+ movne r10, r4 -+ addeq r4, r7, r9, lsl #\log2_s -+ it cc -+ movcc r7, r10 -+ it mi -+ addmi r4, r10, r9, lsl #\log2_s -+ vld1.\d_type {\d_ul}, [r7] -+ itt vc -+ movvc r8, r7 -+ movvc r6, r7 -+ vld1.\d_type {\d_l }, [r4], r9 -+ tst r3, #AVAIL_UR -+ vld1.\d_type {\d_u }, [r6] -+ it eq -+ moveq r5, r8 -+ and r7, r2, r3 -+ add r8, r4, r9 -+ vld1.\d_type {\d_ur}, [r5] -+ lsls r12, r7, #AVAIL_S_UR_N_U_C -+ add r3, r10, r9 -+ lsl r9, #1 -+.endm -+ -+ -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_8( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 0 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] -+ -+ it cs -+ vldrcs s2, [r6] -+ ite pl -+ vmovpl s3, s4 -+ vldrmi s3, [r5] -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ add r12, r0, #-pw -+ bpl 1f -+ -+ vld1.8 {d0[0]}, [r10], r9 -+ vld1.8 {d0[1]}, [r3], r9 -+ vld1.8 {d0[2]}, [r10] -+ vld1.8 {d0[3]}, [r3] -+1: -+ bcc 1f -+ vld1.8 {d0[5]}, [r4], r9 -+ vld1.8 {d0[6]}, [r8] -+ vld1.8 {d0[7]}, [r4] -+1: -+ vstr d1, [r1] @ Up -+ vst1.8 {d31[7]}, [r12] -+ vstr d0, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_16( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_16, export=1 -+ push {r4-r10, 
lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] -+ -+ it cs -+ vldrcs d2, [r6] -+ it mi -+ vldrmi d3, [r5] -+ lsls r7, #AVAIL_S_L_N_DL_C -+ add r12, r0, #-pw -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10] -+ vld1.16 {d0[3]}, [r3] -+1: -+ bcc 1f -+ vld1.16 {d1[1]}, [r4], r9 -+ vld1.16 {d1[2]}, [r8] -+ vld1.16 {d1[3]}, [r4] -+1: -+ vst1.16 {q1}, [r1] @ Up -+ vst1.16 {d31[3]}, [r12] -+ vst1.16 {q0}, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_8( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 0 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+ -+function ff_hevc_rpi_intra_filter_8_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] -+ -+ it cs -+ vldrcs d4, [r6] -+ it mi -+ vldrmi d5, [r5] -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.8 {d0[0]}, [r10], r9 -+ vld1.8 {d0[1]}, [r3], r9 -+ vld1.8 {d0[2]}, [r10], r9 -+ vld1.8 {d0[3]}, [r3], r9 -+ vld1.8 {d0[4]}, [r10], r9 -+ vld1.8 {d0[5]}, [r3], r9 -+ vld1.8 {d0[6]}, [r10] -+ vld1.8 {d0[7]}, [r3] -+1: -+ bcc 1f -+ vld1.8 {d1[1]}, [r4], r9 -+ vld1.8 {d1[2]}, [r8], r9 -+ vld1.8 {d1[3]}, [r4], r9 -+ vld1.8 {d1[4]}, [r8], r9 -+ vld1.8 {d1[5]}, [r4], r9 -+ vld1.8 {d1[6]}, [r8] -+ vld1.8 {d1[7]}, [r4] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ @ Luma light filter -+ vext.8 q8, q15, q2, #15 -+ vext.8 q12, q15, q0, #15 -+ vaddl.u8 q9, d17, d5 -+ vaddl.u8 q8, d16, d4 -+ vaddl.u8 q13, d25, d1 -+ vaddl.u8 q12, d24, d0 -+ vmov.u8 r3, d5[7] @ Save final pel -+ vmov.u8 r2, d1[7] @ Save final pel -+ -+ vext.16 q2, q8, q9, #1 -+ vext.16 q3, q9, q9, #1 -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q13, #1 -+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q2, q8 -+ vadd.u16 q3, q9 -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ -+ vrshrn.u16 d4, q2, #2 -+ vrshrn.u16 d5, q3, #2 -+ vrshrn.u16 d0, q0, #2 -+ vrshrn.u16 d1, q1, #2 -+ vrshr.u16 d30, #2 -+ vmov.u8 d5[7], r3 @ Restore final pel -+ vmov.u8 d1[7], r2 @ Restore final pel -+ vdup.u8 d31, d30[0] @ d31[3] = d30[0] -+ -+10: -+ vst1.8 {q2 }, [r1] @ Up -+ vst1.8 {d31[7]}, [r12] @ Up-left -+ vst1.8 {q0 }, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_16( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_8_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" -+ -+ it cs -+ vldmcs r6, {d4, d5} -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #4 -+ vldm r5, {d6, 
d7} -+ bgt 1f -+ vdup.16 d7, d6[3] -+1: -+ lsls r12, r7, #AVAIL_S_L_N_DL_C -+ vdup.16 q1, d0[0] -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10], r9 -+ vld1.16 {d0[3]}, [r3], r9 -+ vld1.16 {d1[0]}, [r10], r9 -+ vld1.16 {d1[1]}, [r3], r9 -+ vld1.16 {d1[2]}, [r10] -+ vld1.16 {d1[3]}, [r3] -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vld1.16 {d2[1]}, [r4], r9 -+ cmp r12, #p_size -+ vld1.16 {d2[2]}, [r8], r9 -+ vld1.16 {d2[3]}, [r4], r9 -+ blt 2f -+ vld1.16 {d3[0]}, [r8], r9 -+ vld1.16 {d3[1]}, [r4], r9 -+ vld1.16 {d3[2]}, [r8] -+ vld1.16 {d3[3]}, [r4] -+ b 1f -+2: -+ vdup.16 d3, d2[3] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ @ Luma light filter -+ vext.16 q9, q2, q3, #7 -+ vext.16 q8, q15, q2, #7 -+ vext.16 q13, q0, q1, #7 -+ vext.16 q12, q15, q0, #7 -+ vadd.u16 q9, q3 -+ vadd.u16 q8, q2 -+ vadd.u16 q13, q1 -+ vadd.u16 q12, q0 -+ vmov.u16 r3, d7[3] @ Save final pel -+ vmov.u16 r2, d3[3] @ Save final pel -+ -+ vext.16 q2, q8, q9, #1 -+ vext.16 q3, q9, q9, #1 -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q13, #1 -+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q2, q8 -+ vadd.u16 q3, q9 -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ -+ vrshr.u16 q2, #2 -+ vrshr.u16 q3, #2 -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, #2 -+ vrshr.u16 d30, #2 -+ vmov.u16 d7[3], r3 @ Restore final pel -+ vmov.u16 d3[3], r2 @ Restore final pel -+ vdup.u16 d31, d30[0] @ d31[3] = d30[0] -+ -+10: -+ vst1.16 {q2, q3}, [r1] @ Up -+ vst1.16 {d31[3]}, [r12] @ Up-left -+ vst1.16 {q0, q1}, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+@ int ff_hevc_rpi_intra_filter_16_neon_16( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 1 -+.set pw, (1 << pw_s) -+.set log2_s, 4 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_16_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" -+ -+ vdup.16 q9, d16[0] -+ vdup.16 q11, d20[0] -+ -+ it cs -+ vldmcs r6, {d16-d19} -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #12 -+ @ Given chroma frame layout, if UR exists then it is always legit to -+ @ load all of it even if most of it is outside the frame. 
-+ vldm r5, {d20-d23} -+ bgt 1f -+ bge 4f -+ cmp r12, #8 -+ bge 3f -+ vdup.16 d21, d20[3] -+3: vdup.16 d22, d21[3] -+4: vdup.16 d23, d22[3] -+ -+1: -+ lsls r7, #AVAIL_S_L_N_DL_C -+ ldr r12, [sp, #dl_size] -+ vdup.16 q1, d0[0] -+ vdup.16 q2, d0[0] -+ vdup.16 q3, d0[0] -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10], r9 -+ vld1.16 {d0[3]}, [r3], r9 -+ vld1.16 {d1[0]}, [r10], r9 -+ vld1.16 {d1[1]}, [r3], r9 -+ vld1.16 {d1[2]}, [r10], r9 -+ vld1.16 {d1[3]}, [r3], r9 -+ vld1.16 {d2[0]}, [r10], r9 -+ vld1.16 {d2[1]}, [r3], r9 -+ vld1.16 {d2[2]}, [r10], r9 -+ vld1.16 {d2[3]}, [r3], r9 -+ vld1.16 {d3[0]}, [r10], r9 -+ vld1.16 {d3[1]}, [r3], r9 -+ vld1.16 {d3[2]}, [r10] -+ vld1.16 {d3[3]}, [r3] -+1: -+ bcc 1f -+ vld1.16 {d4[1]}, [r4], r9 -+ cmp r12, #4 -+ vld1.16 {d4[2]}, [r8], r9 -+ vld1.16 {d4[3]}, [r4], r9 -+ ble 2f -+ vld1.16 {d5[0]}, [r8], r9 -+ vld1.16 {d5[1]}, [r4], r9 -+ cmp r12, #12 -+ vld1.16 {d5[2]}, [r8], r9 -+ vld1.16 {d5[3]}, [r4], r9 -+ blt 3f -+ vld1.16 {d6[0]}, [r8], r9 -+ vld1.16 {d6[1]}, [r4], r9 -+ vld1.16 {d6[2]}, [r8], r9 -+ vld1.16 {d6[3]}, [r4], r9 -+ ble 4f -+ vld1.16 {d7[0]}, [r8], r9 -+ vld1.16 {d7[1]}, [r4], r9 -+ vld1.16 {d7[2]}, [r8] -+ vld1.16 {d7[3]}, [r4] -+ b 1f -+2: vdup.16 d5, d4[3] -+3: vdup.16 d6, d5[3] -+4: vdup.16 d7, d6[3] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ vpush {q5} -+ @ Luma light filter -+ @ Left -+ vext.16 q5, q2, q3, #7 -+ vext.16 q14, q1, q2, #7 -+ vext.16 q13, q0, q1, #7 -+ vext.16 q12, q15, q0, #7 -+ -+ vadd.u16 q5, q3 -+ vadd.u16 q14, q2 -+ vadd.u16 q13, q1 -+ vadd.u16 q12, q0 -+ vmov.u16 r2, d7[3] @ Save final pel -+ -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q14, #1 -+ vext.16 q2, q14, q5, #1 -+ vext.16 q3, q5, q5, #1 -+ -+ vmov d30, d24 @ d30[0] = l[0] + ul -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ vadd.u16 q2, q14 -+ vadd.u16 q3, q5 -+ -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, #2 -+ vrshr.u16 q2, #2 -+ vrshr.u16 q3, #2 -+ -+ @ Up -+ vext.16 q5, q10, q11, #7 -+ vext.16 q14, q9, q10, #7 -+ vext.16 q13, q8, q9, #7 -+ vext.16 q12, q15, q8, #7 -+ -+ vadd.u16 q5, q11 -+ vadd.u16 q14, q10 -+ vadd.u16 q13, q9 -+ vadd.u16 q12, q8 -+ vmov.u16 r3, d23[3] @ Save final pel -+ -+ vext.16 q8, q12, q13, #1 -+ vext.16 q9, q13, q14, #1 -+ vext.16 q10, q14, q5, #1 -+ vext.16 q11, q5, q5, #1 -+ -+ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q8, q12 -+ vadd.u16 q9, q13 -+ vadd.u16 q10, q14 -+ vadd.u16 q11, q5 -+ -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 -+ vrshr.u16 q10, #2 -+ vrshr.u16 q11, #2 -+ -+ @ Misc -+ vrshr.u16 d30, #2 -+ vmov.u16 d7[3], r2 @ Restore final pel -+ vmov.u16 d23[3], r3 @ Restore final pel -+ vdup.u16 d31, d30[0] @ d31[3] = d30[0] -+ vpop {q5} -+ -+10: -+ vstm r1, {d16-d23} @ Up -+ vst1.16 {d31[3]}, [r12] @ Up-left -+ vstm r0, { d0-d7 } @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+@ int ff_hevc_rpi_intra_filter_4_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 2 -+ -+function ff_hevc_rpi_intra_filter_4_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" -+ 
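-+
-+        @ load_pointers leaves the remaining decisions in the flags (cs: U
-+        @ still needs a full load, mi: UR does) and has already replicated a
-+        @ fallback value into each d-register, so absent edges come out
-+        @ pre-substituted.  Hedged C sketch of the fallback that the
-+        @ mov/movvs/movcs chain in the macro appears to encode (illustrative
-+        @ names):
-+        @
-+        @     // value used where left/down-left samples are missing
-+        @     fill = avail(UL) ? ul : avail(U) ? u[0] : ur[0];
-+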
-+ it cs -+ vldmcs r6, {d4, d5} -+ it mi -+ vldmmi r5, {d6, d7} -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d0[0] -+ add r12, r0, #-pw -+ bpl 1f -+ vld1.32 {d0[0]}, [r10], r9 -+ vld1.32 {d0[1]}, [r3], r9 -+ vld1.32 {d1[0]}, [r10] -+ vld1.32 {d1[1]}, [r3] -+1: -+ bcc 1f -+ vld1.32 {d2[1]}, [r4], r9 -+ vld1.32 {d3[0]}, [r8] -+ vld1.32 {d3[1]}, [r4] -+1: -+ vst1.32 {q2, q3 }, [r1] @ Up -+ vst1.32 {d31[1]}, [r12] -+ vst1.32 {q0, q1 }, [r0] @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_8_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 3 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_8_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" -+ -+ vdup.32 q9, d16[0] -+ vdup.32 q11, d20[0] -+ -+ it cs -+ vldmcs r6, {q8, q9 } -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #p_size -+ vldm r5, {q10, q11} -+ bge 1f -+ vdup.32 q11, d21[1] -+1: -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d0[0] -+ vdup.32 q2, d0[0] -+ vdup.32 q3, d0[0] -+ bpl 1f -+ vld1.32 {d0[0]}, [r10], r9 -+ vld1.32 {d0[1]}, [r3], r9 -+ vld1.32 {d1[0]}, [r10], r9 -+ vld1.32 {d1[1]}, [r3], r9 -+ vld1.32 {d2[0]}, [r10], r9 -+ vld1.32 {d2[1]}, [r3], r9 -+ vld1.32 {d3[0]}, [r10] -+ vld1.32 {d3[1]}, [r3] -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vld1.32 {d4[1]}, [r4], r9 -+ cmp r12, #p_size -+ vld1.32 {d5[0]}, [r8], r9 -+ vld1.32 {d5[1]}, [r4], r9 -+ blt 2f -+ vld1.32 {d6[0]}, [r8], r9 -+ vld1.32 {d6[1]}, [r4], r9 -+ vld1.32 {d7[0]}, [r8] -+ vld1.32 {d7[1]}, [r4] -+ b 1f -+2: -+ vdup.32 q3, d5[1] -+1: -+ add r12, r0, #-pw -+ vstm r1, { q8-q11} @ Up -+ vst1.32 {d31[1]}, [r12] -+ vstm r0, { q0-q3 } @ Left -+ pop {r4-r10, pc} -+endfunc -+ -+ -+@ int ff_hevc_rpi_intra_filter_16_neon_32( -+@ pixel * const left, [r0] -+@ pixel * const top, [r1] -+@ const unsigned int req, [r2] -+@ const unsigned int avail, [r3] -+@ const pixel * const src_l, [sp, #0] -+@ const pixel * const src_u, [sp, #4] -+@ const pixel * const src_ur, [sp, #8] -+@ const unsigned int stride, [sp, #12] (pels) -+@ const unsigned int top_right_size, [sp, #16] -+@ const unsigned int down_left_size) [sp, #20] -+ -+.set sp_base, 8*4 -+.set ur_size, sp_base + 16 -+.set dl_size, sp_base + 20 -+.set pw_s, 2 -+.set pw, (1 << pw_s) -+.set log2_s, 4 -+.set p_size, (1 << log2_s) @ size in pels -+ -+function ff_hevc_rpi_intra_filter_16_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] -+ -+ @ Once we get this big we have run out of neon regs to store -+ @ everything at once so do in pieces -+ -+ @ Up (have) -+ it cs -+ vldmcs r6, { q0-q3 } -+ ldr r12, [sp, #ur_size] -+ it mi -+ vldmmi r5, { q8-q11} -+ it cs -+ vstmcs r1, { q0-q3 } -+ bpl 1f -+ cmp r12, #12 -+ add lr, r1, #(pw << log2_s) -+ bgt 2f -+ cmp r12, #8 -+ bge 3f -+ vdup.16 q9, d17[1] -+4: vdup.16 d10, d19[1] -+3: vdup.16 q11, d21[1] -+2: vstm lr, { q8-q11} -+1: -+ -+ @ Left (have) -+ add lr, r0, #-pw -+ lsls r12, r7, #AVAIL_S_L_N_DL_C -+ 
vst1.32 {d30[1]}, [lr] @ UL -+ bpl 1f -+ vld1.32 { d0[0]}, [r10], r9 -+ vld1.32 { d0[1]}, [r3], r9 -+ vld1.32 { d1[0]}, [r10], r9 -+ vld1.32 { d1[1]}, [r3], r9 -+ vld1.32 { d2[0]}, [r10], r9 -+ vld1.32 { d2[1]}, [r3], r9 -+ vld1.32 { d3[0]}, [r10], r9 -+ vld1.32 { d3[1]}, [r3], r9 -+ vld1.32 { d4[0]}, [r10], r9 -+ vld1.32 { d4[1]}, [r3], r9 -+ vld1.32 { d5[0]}, [r10], r9 -+ vld1.32 { d5[1]}, [r3], r9 -+ vld1.32 { d6[0]}, [r10], r9 -+ vld1.32 { d6[1]}, [r3], r9 -+ vld1.32 { d7[0]}, [r10] -+ vld1.32 { d7[1]}, [r3] -+ vstm r0, { q0-q3 } -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vdup.32 d16, d30[0] @ d16[0] = d30[0] -+ add lr, r0, #(pw << log2_s) -+ vld1.32 {d16[1]}, [r4], r9 -+ cmp r12, #4 -+ vld1.32 {d17[0]}, [r8], r9 -+ vld1.32 {d17[1]}, [r4], r9 -+ ble 2f -+ vld1.32 {d18[0]}, [r8], r9 -+ vld1.32 {d18[1]}, [r4], r9 -+ cmp r12, #12 -+ vld1.32 {d19[0]}, [r8], r9 -+ vld1.32 {d19[1]}, [r4], r9 -+ blt 3f -+ vld1.32 {d20[0]}, [r8], r9 -+ vld1.32 {d20[1]}, [r4], r9 -+ vld1.32 {d21[0]}, [r8], r9 -+ vld1.32 {d21[1]}, [r4], r9 -+ ble 4f -+ vld1.32 {d22[0]}, [r8], r9 -+ vld1.32 {d22[1]}, [r4], r9 -+ vld1.32 {d23[0]}, [r8] -+ vld1.32 {d23[1]}, [r4] -+ b 5f -+2: vdup.32 q9, d17[1] -+3: vdup.32 q10, d19[1] -+4: vdup.32 q11, d21[1] -+5: vstm lr, { q8-q11} -+1: -+ eors r7, r2 -+ beq 99f -+ -+ lsls r12, r7, #AVAIL_S_UR_N_U_C -+ vdup.32 q0, d31[0] -+ vdup.32 q1, d31[0] -+ vdup.32 q2, d31[0] -+ vdup.32 q3, d31[0] -+ add lr, r1, #(pw << log2_s) -+ vdup.32 q8, d31[1] -+ vdup.32 q9, d31[1] -+ vdup.32 q10, d31[1] -+ vdup.32 q11, d31[1] -+ it cs -+ vstmcs r1, { q0-q3 } -+ it mi -+ vstmmi lr, { q8-q11} -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q0, d30[0] -+ vdup.32 q1, d30[0] -+ vdup.32 q2, d30[0] -+ vdup.32 q3, d30[0] -+ add lr, r0, #(pw << log2_s) -+ it mi -+ vstmmi r0, { q0-q3 } -+ it cs -+ vstmcs lr, { q0-q3 } -+ -+99: -+ pop {r4-r10, pc} -+endfunc -+ -+ -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S -new file mode 100644 -index 0000000000..56819ae439 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S -@@ -0,0 +1,920 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox, Ben Avison -+*/ -+ -+/* -+ * Horizontal & Vertical special cases of angular intra pred -+ * -+ * Split out because: -+ * Vertical, at least, is relatively common -+ * Much simpler code than the general angular case -+ * Luma with size < 32 has extra filtering that doesn't happen anywhere else -+ * -+ * *** Currently luma filtering is mandatory where it occurs, but there are -+ * cases where it should be turned off (rdpcm & an extension sps flag). -+ * These don't occur in the standard conformance suite for Main Profile -+ */ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ ff_hevc_rpi_pred_vertical_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.32 {d0[0]}, [r2 :32] @ Left -+ add r2, r0, r3 -+ vld1.8 {d1[]}, [r1] -+ lsl r3, #1 -+ vdup.8 d4, ip -+ vmov.i8 d2, #128 -+ vhsub.u8 d4, d0, d4 -+ veor d1, d2 -+ vld1.32 {d0[0]}, [r1 :32] @ Top -+ vqadd.s8 d1, d4 -+ vmov.i64 d3, #0xff -+ vmov d4, d0 -+ veor d5, d1, d2 -+ veor d1, d1, d2 -+ vbit d0, d1, d3 -+ vshr.u64 d5, #8 -+ vst1.32 {d0[0]}, [r0], r3 -+ vshr.u64 d1, #16 -+ vbit d4, d5, d3 -+ vshr.u64 d5, #16 -+ vst1.32 {d4[0]}, [r2], r3 -+ vbit d0, d1, d3 -+ vst1.32 {d0[0]}, [r0] -+ vbit d4, d5, d3 -+ vst1.32 {d4[0]}, [r2] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {d0}, [r2 :64] @ Left -+ vmov.i8 d1, #128 -+ vld1.8 {d2[]}, [r1] -+ vld1.8 {d3}, [r1 :64] @ Top -+ vdup.8 d4, ip -+ vhsub.u8 d4, d0, d4 -+ veor d2, d1 -+ vmov.i64 d0, #0xff -+ mov r1, #8 -+ vqadd.s8 d2, d4, d2 -+ veor d1, d2, d1 -+1: -+ vbit d3, d1, d0 -+ vshr.u64 d1, #8 -+ vst1.8 {d3}, [r0 :64], r3 -+ subs r1, #2 -+ vbit d3, d1, d0 -+ vshr.u64 d1, #8 -+ vst1.8 {d3}, [r0 :64], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {q0}, [r2 :128] @ Left -+ vdup.8 q1, ip -+ vld1.8 {d4[],d5[]}, [r1] -+ vhsub.u8 q0, q1 -+ vmov.i8 q1, #128 -+ veor q2, q1 -+ vmov.i64 d16, #0xff -+ vqadd.s8 q0, q2 -+ vld1.8 {q3}, [r1 :128] @ Top -+ mov r1, #16 -+ veor q0, q1 -+ vmov q1, q3 -+ vext.8 q2, q0, q0, #1 -+1: -+ vbit d2, d0, d16 -+ vbit d6, d4, d16 -+ vext.8 q0, q0, q0, #2 -+ subs r1, #2 -+ vst1.8 {q1}, [r0 :128], r3 -+ vext.8 q2, q2, q2, #2 -+ vst1.8 {q3}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vert_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_32_neon_8, 
export=1 -+ vld1.8 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3 -+ lsl r3, #1 -+ mov r1, #16 -+1: -+ vst1.8 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.8 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1 -+ vld1.16 {d0 }, [r1 :64] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ -+ vst1.16 {d0 }, [r0 :64], r3 -+ vst1.16 {d0 }, [r2 :64], r3 -+ vst1.16 {d0 }, [r0 :64] -+ vst1.16 {d0 }, [r2 :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1 -+ vld1.16 {q0 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ mov r1, #4 -+1: -+ vst1.16 {q0 }, [r0 :128], r3 -+ subs r1, #2 -+ vst1.16 {q0 }, [r2 :128], r3 -+ vst1.16 {q0 }, [r0 :128], r3 -+ vst1.16 {q0 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1 -+ vld1.16 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #1 -+ lsl r3, #2 -+ mov r1, #8 -+1: -+ vst1.16 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontalal_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+@ ? Might be faster as simple arm -+ -+function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.32 {d0[0]}, [r1 :32] @ Top -+ add r1, r2, #3 -+ vld1.8 {d1[]}, [r2]! -+ vdup.8 d2, ip -+ vmov.i8 d3, #128 -+ vhsub.u8 d0, d2 -+ veor d1, d3 -+ vld1.8 {d2[]}, [r2]! -+ add ip, r0, r3 -+ vqadd.s8 d0, d0, d1 -+ lsl r3, #1 -+ vld1.8 {d1[]}, [r2] -+ vld1.8 {d4[]}, [r1] -+ veor d0, d3 -+ vst1.32 {d0[0]}, [r0 :32], r3 -+ vst1.32 {d2[0]}, [ip :32], r3 -+ vst1.32 {d1[0]}, [r0 :32] -+ vst1.32 {d4[0]}, [ip :32] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {d0}, [r1 :64] @ Top -+ vmov.i8 d1, #128 -+ vld1.8 {d2[]}, [r2]! -+ mov r1, #8-2 -+ vdup.8 d3, ip -+ vhsub.u8 d0, d3 -+ veor d2, d1 -+ vqadd.s8 d0, d2 -+ vld1.8 {d2[]}, [r2]! -+ veor d0, d1 -+ vst1.8 {d0}, [r0], r3 -+1: -+ vld1.8 {d0[]}, [r2]! -+ subs r1, #2 -+ vst1.8 {d2}, [r0 :64], r3 -+ vld1.8 {d2[]}, [r2]! -+ vst1.8 {d0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.8 {d2}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1 -+ ldrb ip, [r2, #-1] @ Top-left -+ vld1.8 {q0}, [r1 :64] @ Top -+ mov r1, #16-2 -+ vld1.8 {d4[],d5[]}, [r2]! -+ vdup.8 q3, ip -+ vhsub.u8 q0, q3 -+ vmov.i8 q1, #128 -+ veor q2, q1 -+ vqadd.s8 q0, q2 -+ vld1.8 {d4[],d5[]}, [r2]! -+ veor q0, q1 -+ vst1.8 {q0}, [r0], r3 -+1: -+ vld1.8 {d0[],d1[]}, [r2]! 
-+ subs r1, #2 -+ vst1.8 {q2}, [r0 :64], r3 -+ vld1.8 {d4[],d5[]}, [r2]! -+ vst1.8 {q0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.8 {q2}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 -+ vld1.8 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ mov r1, #32-2 -+ vld1.8 {d2[],d3[]}, [r2]! -+ vst1.8 {q0}, [r0 :128], r3 -+ vst1.8 {q0}, [ip :128], r3 -+1: -+ vld1.8 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.8 {q1}, [r0 :128], r3 -+ vst1.8 {q1}, [ip :128], r3 -+ vld1.8 {d2[],d3[]}, [r2]! -+ vst1.8 {q0}, [r0 :128], r3 -+ vst1.8 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.8 {q1}, [r0 :128] -+ vst1.8 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 -+ add r1, r2, #2 -+ vld1.16 {d0[]}, [r2] -+ add r2, #4 -+ vld1.16 {d1[]}, [r1] -+ add r1, #4 -+ vld1.16 {d2[]}, [r2] -+A add r2, r0, r3, lsl #1 -+T lsl r3, #1 -+T add r2, r0, r3 -+ vld1.16 {d3[]}, [r1] -+A lsl r3, #2 -+T lsl r3, #1 -+ vst1.16 {d0}, [r0 :64], r3 -+ vst1.16 {d1}, [r2 :64], r3 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d3}, [r2 :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ lsl r3, #1 -+ vld1.16 {d2[],d3[]}, [r2]! -+ mov r1, #8-2 -+ vst1.16 {q0}, [r0 :64], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :64], r3 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :64], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :64] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ lsl r3, #1 -+ add ip, r0, #16 -+ mov r1, #16-2 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], r3 -+ vst1.16 {q1}, [ip :128], r3 -+ vld1.16 {d2[],d3[]}, [r2]! 
-+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :128] -+ vst1.16 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ 10 Bit -+@ Has clipping constants so 10-bit only but could easily be macroed up to -+@ 14-bit before we run out of bits -+ -+ -+@ ff_hevc_rpi_pred_vertical_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {d0}, [r2 :64] @ Left -+ vmov.i16 d2, #0 -+ vld1.16 {d1[]}, [r1] -+T lsl r3, #1 -+ vdup.16 d4, ip -+ vmov.i16 d3, #0x3ff -+ vld1.16 {d5}, [r1 :64] @ Top -+ vhsub.u16 d4, d0, d4 -+ vmov.i64 d0, #0xffff -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vadd.i16 d1, d1, d4 -+ vmov d6, d5 -+ vmax.s16 d1, d1, d2 -+ vmin.s16 d2, d1, d3 -+ vmin.s16 d1, d1, d3 -+ vbit d5, d1, d0 -+A lsl r3, #2 -+T lsl r3, #1 -+ vshr.u64 d2, #16 -+ vshr.u64 d1, #32 -+ vbit d6, d2, d0 -+ vst1.16 {d5}, [r0], r3 -+ vshr.u64 d2, #32 -+ vst1.16 {d6}, [r2], r3 -+ vbit d5, d1, d0 -+ vst1.16 {d5}, [r0] -+ vbit d6, d2, d0 -+ vst1.16 {d6}, [r2] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_8_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0}, [r2 :128] @ Left -+ lsl r3, #1 -+ vdup.16 q1, ip -+ vld1.16 {d4[],d5[]}, [r1] -+ vhsub.u16 q0, q0, q1 -+ vmov.i16 q1, #0 -+ vadd.i16 q0, q2 -+ vmov.i16 q2, #0x3ff -+ vld1.16 {q3}, [r1 :128] @ Top -+ mov r1, #8 -+ vmax.s16 q0, q1 -+ vmov q1, q3 -+ vmin.s16 q0, q2 -+ vmov.i64 d16, #0xffff -+ vext.16 q2, q0, q0, #1 -+1: -+ vbit d2, d0, d16 -+ vbit d6, d4, d16 -+ vext.16 q0, q0, q0, #2 -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], r3 -+ vext.16 q2, q2, q2, #2 -+ vst1.16 {q3}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0-q1}, [r2 :128] @ Left -+T lsl r3, #1 -+ vdup.16 q2, ip -+A add r2, r0, r3, lsl #1 -+T add r2, r0, r3 -+ vld1.16 {d6[],d7[]}, [r1] -+A lsl r3, #2 -+T lsl r3, #1 -+ vhsub.u16 q0, q2 -+ vhsub.u16 q1, q2 -+ vadd.i16 q0, q3 -+ vadd.i16 q1, q3 -+ vmov.i16 q2, #0 -+ vld1.16 {q8-q9}, [r1 :128] @ Top -+ mov r1, #0 -+ vmov.i16 q3, #0x3ff -+ vmax.s16 q0, q2 -+ vmax.s16 q1, q2 -+ vmin.s16 q0, q3 -+ vmin.s16 q1, q3 -+ vmov q10, q8 -+ vmov q11, q9 -+ vext.16 q2, q0, q1, #1 -+ vext.16 q3, q1, q1, #1 -+ vmov.i64 d24, #0xffff -+1: -+ vbit d16, d0, d24 -+ vbit d20, d4, d24 -+ vext.16 q0, q0, q0, #2 -+ subs r1, #1<<30 -+ vst1.16 {q8-q9}, [r0 :128], r3 -+ vext.16 q2, q2, q2, #2 -+ vst1.16 {q10-q11}, [r2 :128], r3 -+ bne 1b -+1: -+ vbit d16, d2, d24 -+ vbit d20, d6, d24 -+ vext.16 q1, q1, q1, #2 -+ subs r1, #1<<30 -+ vst1.16 {q8-q9}, [r0 :128], r3 -+ vext.16 q3, q3, q3, #2 -+ vst1.16 {q10-q11}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 -+ vldm r1, { q0-q3 } @ Up -+ lsl r3, #1 -+ mov r1, #32 -+ add r2, r0, #32 -+1: -+ 
vst1.16 {q0-q1}, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q2-q3}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1 -+ vld1.16 {q0 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #2 -+ lsl r3, #3 -+ -+ vst1.16 {q0 }, [r0 :128], r3 -+ vst1.16 {q0 }, [r2 :128], r3 -+ vst1.16 {q0 }, [r0 :128] -+ vst1.16 {q0 }, [r2 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1 -+ vld1.16 {q0, q1 }, [r1 :128] @ Up -+ add r2, r0, r3, lsl #2 -+ lsl r3, #3 -+ mov r1, #4 -+1: -+ vst1.16 {q0, q1 }, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q0, q1 }, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_vertical_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1 -+ vldm r1, { q0-q3 } @ Up -+ lsl r3, #2 -+ mov r1, #16 -+ add r2, r0, #32 -+1: -+ vst1.16 {q0-q1}, [r0 :128], r3 -+ subs r1, #1 -+ vst1.16 {q2-q3}, [r2 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+@ ff_hevc_rpi_pred_horizontal_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {d0}, [r1 :64] @ Top -+ vmov.i16 d1, #0 -+ vld1.16 {d2[]}, [r2]! -+T lsl r3, #1 -+ vdup.16 d3, ip -+ vmov.i16 d4, #0x3ff -+ vhsub.u16 d0, d3 -+A add ip, r0, r3, lsl #1 -+T add ip, r0, r3 -+ vld1.16 {d3[]}, [r2]! -+A lsl r3, #2 -+T lsl r3, #1 -+ vadd.i16 d0, d2 -+ vld1.16 {d2[]}, [r2]! -+ vmax.s16 d0, d1 -+ vld1.16 {d1[]}, [r2] -+ vmin.s16 d0, d4 -+ vst1.16 {d0}, [r0 :64], r3 -+ vst1.16 {d3}, [ip :64], r3 -+ vst1.16 {d2}, [r0 :64] -+ vst1.16 {d1}, [ip :64] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0}, [r1 :128] @ Top -+ lsl r3, #1 -+ vdup.16 q1, ip -+ mov r1, #8-2 -+ vhsub.u16 q0, q1 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vmov.i16 q2, #0 -+ vadd.i16 q0, q1 -+ vmov.i16 q1, #0x3ff -+ vmax.s16 q0, q2 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vmin.s16 q0, q1 -+ vst1.16 {q0}, [r0 :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q2}, [r0 :128], r3 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ bne 1b -+ -+ vst1.16 {q2}, [r0 :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontalal_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1 -+ ldrh ip, [r2, #-2] @ Top-left -+ vld1.16 {q0-q1}, [r1 :128] @ Top -+ lsl r3, #1 -+ vdup.16 q2, ip -+ add ip, r0, r3 -+ vhsub.u16 q0, q2 -+ add ip, #16 -+ vhsub.u16 q1, q2 -+ mov r1, #16-2 -+ vld1.16 {d4[],d5[]}, [r2]! -+ vmov.i16 q3, #0 -+ vadd.u16 q0, q2 -+ vadd.i16 q1, q2 -+ vmov.i16 q2, #0x3ff -+ vmax.s16 q0, q3 -+ vmax.s16 q1, q3 -+ vld1.16 {d6[],d7[]}, [r2]! 
-+ vmin.s16 q0, q2 -+ vmin.s16 q1, q2 -+ vst1.16 {q0-q1}, [r0 :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q3}, [r0 :128], r3 -+ vst1.16 {q3}, [ip :128], r3 -+ vld1.16 {d6[],d7[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q3}, [r0 :128] -+ vst1.16 {q3}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 -+ vld1.16 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ push {lr} -+ mov lr, #32 -+ vld1.16 {d2[],d3[]}, [r2]! -+ lsl r3, #1 -+ vst1.16 {q0}, [r0 :128], lr -+ sub r3, #32 -+ vst1.16 {q0}, [ip :128], lr -+ mov r1, #32-2 -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+1: -+ vld1.16 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.16 {q1}, [r0 :128], lr -+ vst1.16 {q1}, [ip :128], lr -+ vst1.16 {q1}, [r0 :128], r3 -+ vst1.16 {q1}, [ip :128], r3 -+ vld1.16 {d2[],d3[]}, [r2]! -+ vst1.16 {q0}, [r0 :128], lr -+ vst1.16 {q0}, [ip :128], lr -+ vst1.16 {q0}, [r0 :128], r3 -+ vst1.16 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.16 {q1}, [r0 :128], lr -+ vst1.16 {q1}, [ip :128], lr -+ vst1.16 {q1}, [r0 :128] -+ vst1.16 {q1}, [ip :128] -+ pop {pc} -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 -+ add r1, r2, #4 -+ vld1.32 {d0[],d1[]}, [r2] -+ add r2, #8 -+ vld1.32 {d2[],d3[]}, [r1] -+ add r1, #8 -+ vld1.32 {d4[],d5[]}, [r2] -+A add r2, r0, r3, lsl #2 -+T lsl r3, #2 -+T add r2, r0, r3 -+ vld1.32 {d6[],d7[]}, [r1] -+A lsl r3, #3 -+T lsl r3, #1 -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q1}, [r2 :128], r3 -+ vst1.32 {q2}, [r0 :128] -+ vst1.32 {q3}, [r2 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 -+ vld1.32 {d0[],d1[]}, [r2]! -+ lsl r3, #2 -+ add ip, r0, #16 -+ mov r1, #8-2 -+ vld1.32 {d2[],d3[]}, [r2]! -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+1: -+ vld1.32 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.32 {q1}, [r0 :128], r3 -+ vst1.32 {q1}, [ip :128], r3 -+ vld1.32 {d2[],d3[]}, [r2]! -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.32 {q1}, [r0 :128] -+ vst1.32 {q1}, [ip :128] -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 -+ vld1.32 {d0[],d1[]}, [r2]! -+ add ip, r0, #16 -+ push {lr} -+ mov lr, #32 -+ vld1.32 {d2[],d3[]}, [r2]! -+ lsl r3, #2 -+ vst1.32 {q0}, [r0 :128], lr -+ sub r3, #32 -+ vst1.32 {q0}, [ip :128], lr -+ mov r1, #16-2 -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+1: -+ vld1.32 {d0[],d1[]}, [r2]! -+ subs r1, #2 -+ vst1.32 {q1}, [r0 :128], lr -+ vst1.32 {q1}, [ip :128], lr -+ vst1.32 {q1}, [r0 :128], r3 -+ vst1.32 {q1}, [ip :128], r3 -+ vld1.32 {d2[],d3[]}, [r2]! 
-+ vst1.32 {q0}, [r0 :128], lr -+ vst1.32 {q0}, [ip :128], lr -+ vst1.32 {q0}, [r0 :128], r3 -+ vst1.32 {q0}, [ip :128], r3 -+ bne 1b -+ -+ vst1.32 {q1}, [r0 :128], lr -+ vst1.32 {q1}, [ip :128], lr -+ vst1.32 {q1}, [r0 :128] -+ vst1.32 {q1}, [ip :128] -+ pop {pc} -+endfunc -+ -+ -+ -diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S -new file mode 100644 -index 0000000000..af8c4c03f0 ---- /dev/null -+++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S -@@ -0,0 +1,1043 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#include "libavutil/arm/asm.S" -+#include "neon.S" -+ -+@ Planar intra pred (8.4.4.2.4) -+@ -+@ predSamples[ x ][ y ] = -+@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] + -+@ ( x + 1 ) * p[ nTbS ][ -1 ] + -+@ ( nTbS - 1 - y ) * p[ x ][ -1 ] + -+@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 ) -+ -+@ All 10-bit functions would work with 9 -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_4_neon_8, export=1 -+ -+ vld1.8 {d0}, [r1] @ Top -+ adr ip, nb_3_0_1_4 -+ vld1.8 {d1}, [r2] @ Left -+ vmov.i64 d2, #0xffffffff -+ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4} -+ add r1, r0, r3 -+ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3} -+ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4} -+ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4} -+ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0} -+ vshll.u8 q8, d4, #2 -+ lsl r3, #1 -+ vsubl.u8 q2, d5, d4 -+ vmlal.u8 q8, d0, d3 -+ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0} -+ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1} -+ vshl.s16 q9, q2, #1 -+ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1} -+ vadd.i16 d16, d4 -+ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2} -+ vadd.i16 d17, d18 -+ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3} -+ vadd.i16 q2, q8, q9 -+ vmlal.u8 q8, d0, d6 -+ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3} -+ vmlal.u8 q2, d0, d7 -+ vrshrn.i16 d0, q8, #3 -+ vst1.32 d0[0], [r0 :32], r3 -+ vst1.32 d0[1], [r1 :32], r3 -+ vrshrn.i16 d0, q2, #3 -+ vst1.32 d0[0], [r0 :32] -+ vst1.32 d0[1], [r1 :32] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_4_neon_10, export=1 -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0}, [r1 :64] @ Top -+ adr ip, nbh_3_0_1_4 -+ vldr d2, [r2, #8] @ Left (lower) -+ vldr d3, [ip, #8] @ {1,2,3,4} -+T lsl r3, #1 -+ vshl.s16 d4, d0, #2 -+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4} -+ vldr d5, [r2] @ Left (upper) -+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4} -+ vldr d6, [ip] @ {3,2,1,0} -+ vmla.i16 d4, d3, d1 @ Acc set up -+ vsub.i16 d0, d2, d0 @ Add set up -+ vmov d7, d6 -+ vdup.16 d2, d5[0] -+ vdup.16 d3, d5[1] -+ vdup.16 d16, d5[2] -+ vadd.i16 d18, d0, d4 -+ vshl.s16 d0, #1 @ x2 -+ vadd.i16 d19, d0, d4 -+ vdup.16 d17, d5[3] -+ vadd.i16 d4, d0, d18 -+A add r1, r0, r3, lsl #1 -+T add r1, r0, r3 -+ vadd.i16 d5, d0, d19 -+A lsl r3, #2 -+T lsl r3, #1 -+ vmla.i16 q9, q1, q3 -+ vmla.i16 q2, q8, q3 -+ vrshr.u16 q0, q9, #3 -+ vst1.16 {d0}, [r0], r3 -+ vrshr.u16 d2, d4, #3 -+ vst1.16 {d1}, [r1], r3 -+ vrshr.u16 d3, d5, #3 -+ vst1.16 {d2}, [r0] -+ vst1.16 {d3}, [r1] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_8_neon_8, export=1 -+ -+ vld1.8 {q0}, [r1] @ Top -+ adr ip, nb_7_0_1_8 -+ vldr d2, [r2, #8] @ Left (lower) -+ mov r1, #8 -+ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8} -+ vshll.u8 q2, d0, #3 -+ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8} -+ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8} -+ vldr d6, [r2] @ Left (upper) -+ vmlal.u8 q2, d3, d1 -+ vsubl.u8 q0, d2, d0 -+ vldr d7, [ip] @ {7,6,5,4,3,2,1,0} -+ -+@ u8 7..0 [1] d7 -+@ u8 left[y] [1] d6 -+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = 
(x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vdup.8 d2, d6[0] -+ vadd.i16 q2, q0 -+ vdup.8 d3, d6[1] -+ vadd.i16 q8, q2, q0 -+1: -+ vmlal.u8 q2, d7, d2 -+ subs r1, #2 -+ vadd.i16 q9, q8, q0 -+ vmlal.u8 q8, d7, d3 -+ vdup.8 d2, d6[2] -+ vdup.8 d3, d6[3] -+ vrshrn.i16 d20, q2, #4 -+ vshr.u64 d6, #16 -+ vmov q2, q9 -+ vst1.8 {d20}, [r0], r3 -+ vrshrn.i16 d20, q8, #4 -+ vadd.i16 q8, q2, q0 -+ vst1.8 {d20}, [r0], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_8_neon_10, export=1 -+ -+ adr ip, nb_7_0_1_8 -+ vld1.16 {q0}, [r1 :128]! @ Top (left) -+ lsl r3, #1 -+ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8} -+ add ip, r2, #16 -+ vld1.16 {d4[],d5[]}, [r1] @ Top (right) -+ mov r1, #8-2 -+ vshl.s16 q3, q0, #3 -+ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8} -+ vld1.16 {d18[],d19[]}, [ip] @ Left (lower) -+ vmla.i16 q3, q8, q2 @ Acc set up -+ vsub.i16 q0, q9, q0 @ Add set up -+ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0} -+ vadd.i16 q2, q3, q0 -+ -+@ u16 7..0 [1] q1 -+@ u32 left[y] [1] [r2] -+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.16 {d6[],d7[]}, [r2]! -+ vadd.i16 q8, q2, q0 -+ vld1.16 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+1: -+ vrshr.u16 q9, q2, #4 -+ subs r1, #2 -+ vmov q2, q3 -+ vrshr.u16 q10, q8, #4 -+ vld1.16 {d6[],d7[]}, [r2]! -+ vst1.16 {q9}, [r0 :128], r3 -+ vadd.i16 q8, q2, q0 -+ vld1.16 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ vst1.16 {q10}, [r0 :128], r3 -+ bne 1b -+ -+ vrshr.u16 q9, q2, #4 -+ add r3, r0 -+ vrshr.u16 q10, q8, #4 -+ vst1.16 {q9}, [r0 :128] -+ vst1.16 {q10}, [r3 :128] -+ -+ bx lr -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ Data - has to be in two lumps to ensure we can always reach using adr -+ -+ .balign 64 -+ -+nb_31_0_1_32: -+ .byte 31, 30, 29, 28, 27, 26, 25, 24 -+ .byte 23, 22, 21, 20, 19, 18, 17, 16 -+nb_15_0_1_16: -+ .byte 15, 14, 13, 12, 11, 10, 9, 8 -+ .byte 7, 6, 5, 4, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 5, 6, 7, 8 -+ .byte 9, 10, 11, 12, 13, 14, 15, 16 -+ .byte 17, 18, 19, 20, 21, 22, 23, 24 -+ .byte 25, 26, 27, 28, 29, 30, 31, 32 -+ -+ @ should be back on a 64-byte boundary here -+ -+ @ These could be extracted from the above array, but separate out -+ @ out for better (16 byte) alignment -+nb_3_0_1_4: -+ .byte 3, 2, 1, 0, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 1, 2, 3, 4 -+nb_7_0_1_8: -+ .byte 7, 6, 5, 4, 3, 2, 1, 0 -+ .byte 1, 2, 3, 4, 5, 6, 7, 8 -+nbh_3_0_1_4: -+ .short 3, 2, 1, 0, 1, 2, 3, 4 -+ -+@------------------------------------------------------------------------------ -+ -+ -+@ ff_hevc_rpi_pred_planar_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_16_neon_8, export=1 -+ -+ adr ip, nb_15_0_1_16 + 16 -+ vld1.8 {q0}, [r1 :128]! 
@ Top (left) -+ add r2, #16 -+ vld1.8 {q1}, [ip: 128] @ {1,2,3...16} -+ vld1.8 {d4[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vshll.u8 q3, d0, #4 -+ mov r1, #16 -+ vshll.u8 q8, d1, #4 -+ vld1.8 {d5[]}, [r2] @ Left (lower) -+ sub r2, #16 -+ vmlal.u8 q3, d2, d4 -+ vmlal.u8 q8, d3, d4 @ Acc set up -+ vsubl.u8 q1, d5, d0 -+ vsubl.u8 q0, d5, d1 @ Add set up -+ vld1.8 {q2}, [ip :128] @ {15,14,13...0} -+ -+@ u8 15..0 [1] q2 -+@ u8 left[y] [1] [r2] -+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q3, q1 -+ vadd.i16 q8, q0 -+1: -+ vadd.i16 q10, q3, q1 -+ subs r1, #2 -+ vld1.8 {d18[]}, [r2]! -+ vadd.i16 q11, q8, q0 -+ vld1.8 {d19[]}, [r2]! -+ vmlal.u8 q3, d4, d18 -+ vmlal.u8 q8, d5, d18 -+ vadd.i16 q12, q10, q1 -+ vmlal.u8 q10, d4, d19 -+ vadd.i16 q13, q11, q0 -+ vmlal.u8 q11, d5, d19 -+ vrshrn.u16 d18, q3, #5 -+ vrshrn.u16 d19, q8, #5 -+ vmov q3, q12 -+ vst1.8 {q9}, [r0 :128], r3 -+ vrshrn.u16 d18, q10, #5 -+ vrshrn.u16 d19, q11, #5 -+ vmov q8, q13 -+ vst1.8 {q9}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_16_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ adr ip, nb_15_0_1_16 + 16 -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ add r2, #32 -+ vld1.8 {q2}, [ip :128] @ {1,2,3...16} -+ lsl r3, #1 -+ vld1.16 {d6[],d7[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vmovl.u8 q8, d4 -+ mov r1, #16 -+ vshl.i16 q9, q0, #4 -+ vmovl.u8 q2, d5 -+ vshl.i16 q10, q1, #4 -+ vld1.16 {d22[],d23[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vld1.8 {q12}, [ip] @ {15,14,13...0} -+ vmla.i16 q9, q8, q3 -+ vmla.i16 q10, q2, q3 @ Acc set up -+ vsub.i16 q0, q11, q0 -+ vsub.i16 q1, q11, q1 @ Add set up -+ vadd.i16 q2, q9, q0 -+ vadd.i16 q3, q10, q1 -+ vmovl.u8 q8, d24 -+ vmovl.u8 q9, d25 -+ -+@ u16 15..0 [2] q8,q9 -+@ u32 left[y] [2] [r2] -+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] -+ -+1: -+ vadd.i16 q10, q2, q0 -+ subs r1, #2 -+ vld1.16 {d24[],d25[]}, [r2]! -+ vadd.i16 q11, q3, q1 -+ vld1.16 {d28[],d29[]}, [r2]! -+ vmla.i16 q2, q8, q12 -+ vmla.i16 q3, q9, q12 -+ vadd.i16 q12, q10, q0 -+ vmla.i16 q10, q8, q14 -+ vadd.i16 q13, q11, q1 -+ vmla.i16 q11, q9, q14 -+ vrshr.u16 q14, q2, #5 -+ vrshr.u16 q15, q3, #5 -+ vmov q2, q12 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ vrshr.u16 q14, q10, #5 -+ vrshr.u16 q15, q11, #5 -+ vmov q3, q13 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_32_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_32_neon_8, export=1 -+ -+ vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) -+ adr ip, nb_31_0_1_32 + 32 -+ vpush {d8-d12} -+ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32} -+ add r2, #32 -+ vld1.8 {d8[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vshll.u8 q8, d0, #5 -+ mov r1, #32 -+ vld1.8 {d9[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vshll.u8 q9, d1, #5 -+ vshll.u8 q10, d2, #5 -+ vshll.u8 q11, d3, #5 -+ vmlal.u8 q8, d4, d8 -+ vsubl.u8 q12, d9, d0 -+ vmlal.u8 q9, d5, d8 -+ vsubl.u8 q13, d9, d1 -+ vmlal.u8 q10, d6, d8 -+ vsubl.u8 q14, d9, d2 -+ vmlal.u8 q11, d7, d8 @ Acc set up -+ vsubl.u8 q15, d9, d3 @ Add set up -+ vadd.i16 q8, q12 -+ vadd.i16 q9, q13 -+ vadd.i16 q10, q14 -+ vadd.i16 q11, q15 -+ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0} -+ -+@ u8 31..0 [2] q4,q5 -+@ u8 left[y] [2] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.8 {d12[]}, [r2]! -+ vadd.i16 q0, q8, q12 -+ b 2f -+1: -+ vld1.8 {d12[]}, [r2]! -+ vrshrn.u16 d3, q1, #6 -+ vrshrn.u16 d2, q0, #6 -+ vadd.i16 q0, q8, q12 -+ vrshrn.u16 d4, q2, #6 -+ vrshrn.u16 d5, q3, #6 -+ vst1.8 {q1-q2}, [r0 :128], r3 -+2: vadd.i16 q1, q9, q13 -+ subs r1, #2 -+ vadd.i16 q2, q10, q14 -+ vadd.i16 q3, q11, q15 -+ vmlal.u8 q8, d8, d12 -+ vmlal.u8 q9, d9, d12 -+ vmlal.u8 q10, d10, d12 -+ vmlal.u8 q11, d11, d12 -+ vld1.8 {d12[]}, [r2]! -+ vrshrn.u16 d19, q9, #6 -+ vrshrn.u16 d18, q8, #6 -+ vadd.i16 q8, q0, q12 -+ vrshrn.u16 d20, q10, #6 -+ vrshrn.u16 d21, q11, #6 -+ vst1.8 {q9-q10}, [r0 :128], r3 -+ vadd.i16 q9, q1, q13 -+ vadd.i16 q10, q2, q14 -+ vadd.i16 q11, q3, q15 -+ vmlal.u8 q0, d8, d12 -+ vmlal.u8 q1, d9, d12 -+ vmlal.u8 q2, d10, d12 -+ vmlal.u8 q3, d11, d12 -+ -+ bne 1b -+ -+ vpop {d8-d12} -+ -+ vrshrn.u16 d3, q1, #6 -+ vrshrn.u16 d2, q0, #6 -+ vrshrn.u16 d4, q2, #6 -+ vrshrn.u16 d5, q3, #6 -+ vst1.8 {q1-q2}, [r0 :128] -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_32_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_32_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nb_31_0_1_32 + 32 -+ vpush {q4-q7} -+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) -+ add r2, #64 -+ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32} -+T lsl r3, #1 -+ vld1.16 {d8[],d9[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vmovl.u8 q12, d28 -+ mov r1, #32 -+ vmovl.u8 q13, d29 -+ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0} -+ vmovl.u8 q14, d30 -+ vmovl.u8 q15, d31 -+ vld1.16 {d10[],d11[]}, [r2] @ Left (lower) -+ sub r2, #64 -+ vshl.i16 q8, q0, #5 -+ vshl.i16 q9, q1, #5 -+ vshl.i16 q10, q2, #5 -+ vshl.i16 q11, q3, #5 -+ vmla.i16 q8, q12, q4 -+ vsub.i16 q0, q5, q0 -+ vmla.i16 q9, q13, q4 -+ vsub.i16 q1, q5, q1 -+ vmla.i16 q10, q14, q4 -+ vmov.u16 ip, d0[0] -+ vsub.i16 q2, q5, q2 -+ vmla.i16 q11, q15, q4 @ Acc set up -+ vsub.i16 q3, q5, q3 @ Add set up -+ vadd.i16 q8, q0 -+ vadd.i16 q9, q1 -+ vadd.i16 q10, q2 -+ vadd.i16 q11, q3 -+ vmovl.u8 q4, d12 -+ vmovl.u8 q5, d13 -+ vmovl.u8 q6, d14 -+ vmovl.u8 q7, d15 -+ -+@ u16 31..0 [4] q4-q7 -+@ u16 left[y] [4] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q12, q8, q0 -+A sub r0, r0, r3, lsl #1 -+T sub r0, r3 -+1: -+ vld1.16 {d0[0]}, [r2]! 
-+A add r0, r0, r3, lsl #1 -+T add r0, r3 -+ vadd.i16 q13, q9, q1 -+ subs r1, #2 -+ vadd.i16 q14, q10, q2 -+ vadd.i16 q15, q11, q3 -+ vmla.i16 q8, q4, d0[0] -+ vmla.i16 q9, q5, d0[0] -+ vmla.i16 q10, q6, d0[0] -+ vmla.i16 q11, q7, d0[0] -+ vmov.16 d0[0], ip -+ vrshr.u16 q8, #6 -+ vrshr.u16 q9, #6 -+ vrshr.u16 q10, #6 -+ vrshr.u16 q11, #6 -+ vstm r0, {q8-q11} -+ vadd.i16 q8, q12, q0 -+A add r0, r0, r3, lsl #1 -+T add r0, r3 -+ vld1.16 {d0[0]}, [r2]! -+ vadd.i16 q9, q13, q1 -+ vadd.i16 q10, q14, q2 -+ vadd.i16 q11, q15, q3 -+ vmla.i16 q12, q4, d0[0] -+ vmla.i16 q13, q5, d0[0] -+ vmla.i16 q14, q6, d0[0] -+ vmla.i16 q15, q7, d0[0] -+ vmov.16 d0[0], ip -+ vrshr.u16 q12, #6 -+ vrshr.u16 q13, #6 -+ vrshr.u16 q14, #6 -+ vrshr.u16 q15, #6 -+ vstm r0, {q12-q15} -+ vadd.i16 q12, q8, q0 -+ bne 1b -+ -+ vpop {q4-q7} -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_4_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 -+ -+ vld1.8 {q0}, [r1] @ Top -+ adr ip, nbx2_3_0_1_4 -+ vldr d2, [r2, #8] @ Left (lower) -+ mov r1, #4 -+ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4} -+ lsl r3, #1 -+ vshll.u8 q2, d0, #2 -+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4} -+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4} -+ vldr d6, [r2] @ Left (upper) -+ vmlal.u8 q2, d3, d1 -+ vsubl.u8 q0, d2, d0 -+ vldr d7, [ip] @ {3,3,2,2,1,1,0,0} -+ -+@ u8 3..0 [1] d7 -+@ u8 left[y] [1] d6 -+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vdup.16 d2, d6[0] -+ vadd.i16 q2, q0 -+ vdup.16 d3, d6[1] -+ vadd.i16 q8, q2, q0 -+1: -+ vmlal.u8 q2, d7, d2 -+ subs r1, #2 -+ vadd.i16 q9, q8, q0 -+ vmlal.u8 q8, d7, d3 -+ vdup.16 d2, d6[2] -+ vdup.16 d3, d6[3] -+ vrshrn.i16 d20, q2, #3 -+ vmov q2, q9 -+ vst1.8 {d20}, [r0], r3 -+ vrshrn.i16 d20, q8, #3 -+ vadd.i16 q8, q2, q0 -+ vst1.8 {d20}, [r0], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_4_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 -+ -+ adr ip, nbx2_3_0_1_4 -+ vld1.16 {q0}, [r1 :128]! @ Top (left) -+ lsl r3, #2 -+ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4} -+ add ip, r2, #16 -+ vld1.32 {d4[],d5[]}, [r1] @ Top (right) -+ vshl.s16 q3, q0, #2 -+ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4} -+ vld1.32 {d18[],d19[]}, [ip] @ Left (lower) -+ vmla.i16 q3, q8, q2 @ Acc set up -+ vsub.i16 q0, q9, q0 @ Add set up -+ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0} -+ vadd.i16 q2, q3, q0 -+ -+@ u16 3..0 [1] q1 -+@ u32 left[y] [1] [r2] -+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.32 {d6[],d7[]}, [r2]! -+ vadd.i16 q8, q2, q0 -+ vld1.32 {d18[],d19[]}, [r2]! -+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ -+ vrshr.u16 q9, q2, #3 -+ vmov q2, q3 -+ vrshr.u16 q10, q8, #3 -+ vld1.32 {d6[],d7[]}, [r2]! -+ vst1.16 {q9}, [r0 :128], r3 -+ vadd.i16 q8, q2, q0 -+ vld1.32 {d18[],d19[]}, [r2]! 
-+ vmla.i16 q2, q1, q3 -+ vadd.i16 q3, q8, q0 -+ vmla.i16 q8, q1, q9 -+ vst1.16 {q10}, [r0 :128], r3 -+ -+ vrshr.u16 q9, q2, #3 -+ add r3, r0 -+ vrshr.u16 q10, q8, #3 -+ vst1.16 {q9}, [r0 :128] -+ vst1.16 {q10}, [r3 :128] -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_8_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 -+ -+ adr ip, nbx2_7_0_1_8 + 16 -+ vld1.8 {q0}, [r1 :128]! @ Top (left) -+ add r2, #16 -+ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8} -+ lsl r3, #1 -+ vld1.16 {d4[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vshll.u8 q3, d0, #3 -+ mov r1, #8 -+ vshll.u8 q8, d1, #3 -+ vld1.16 {d5[]}, [r2] @ Left (lower) -+ sub r2, #16 -+ vmlal.u8 q3, d2, d4 -+ vmlal.u8 q8, d3, d4 @ Acc set up -+ vsubl.u8 q1, d5, d0 -+ vsubl.u8 q0, d5, d1 @ Add set up -+ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0} -+ -+@ u8 7..0 [1] q2 -+@ u8 left[y] [1] [r2] -+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q3, q1 -+ vadd.i16 q8, q0 -+1: -+ vadd.i16 q10, q3, q1 -+ subs r1, #2 -+ vld1.16 {d18[]}, [r2]! -+ vadd.i16 q11, q8, q0 -+ vld1.16 {d19[]}, [r2]! -+ vmlal.u8 q3, d4, d18 -+ vmlal.u8 q8, d5, d18 -+ vadd.i16 q12, q10, q1 -+ vmlal.u8 q10, d4, d19 -+ vadd.i16 q13, q11, q0 -+ vmlal.u8 q11, d5, d19 -+ vrshrn.u16 d18, q3, #4 -+ vrshrn.u16 d19, q8, #4 -+ vmov q3, q12 -+ vst1.8 {q9}, [r0 :128], r3 -+ vrshrn.u16 d18, q10, #4 -+ vrshrn.u16 d19, q11, #4 -+ vmov q8, q13 -+ vst1.8 {q9}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+ -+endfunc -+ -+ -+@------------------------------------------------------------------------------ -+@ -+@ Data - has to be in two lumps to ensure we can always reach using adr -+ -+ .balign 64 -+ -+nbx2_15_0_1_16: -+ .byte 15, 15, 14, 14, 13, 13, 12, 12 -+ .byte 11, 11, 10, 10, 9, 9, 8, 8 -+nbx2_7_0_1_8: -+ .byte 7, 7, 6, 6, 5, 5, 4, 4 -+ .byte 3, 3, 2, 2, 1, 1, 0, 0 -+ .byte 1, 1, 2, 2, 3, 3, 4, 4 -+ .byte 5, 5, 6, 6, 7, 7, 8, 8 -+ .byte 9, 9, 10, 10, 11, 11, 12, 12 -+ .byte 13, 13, 14, 14, 15, 15, 16, 16 -+ -+ @ should be back on a 64-byte boundary here -+ -+nbx2_3_0_1_4: -+ .byte 3, 3, 2, 2, 1, 1, 0, 0 -+ .byte 1, 1, 2, 2, 3, 3, 4, 4 -+ -+@------------------------------------------------------------------------------ -+ -+ -+@ ff_hevc_rpi_pred_planar_c_8_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ adr ip, nbx2_7_0_1_8 + 16 -+ vld1.16 {q0-q1}, [r1 :128]! 
@ Top (left) -+ add r2, #32 -+ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8} -+ lsl r3, #2 -+ vld1.32 {d6[],d7[]}, [r1] @ Top (right) -+ sub ip, #16 -+ vmovl.u8 q8, d4 -+ mov r1, #8 -+ vshl.i16 q9, q0, #3 -+ vmovl.u8 q2, d5 -+ vshl.i16 q10, q1, #3 -+ vld1.32 {d22[],d23[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0} -+ vmla.i16 q9, q8, q3 -+ vmla.i16 q10, q2, q3 @ Acc set up -+ vsub.i16 q0, q11, q0 -+ vsub.i16 q1, q11, q1 @ Add set up -+ vadd.i16 q2, q9, q0 -+ vadd.i16 q3, q10, q1 -+ vmovl.u8 q8, d24 -+ vmovl.u8 q9, d25 -+ -+@ u16 7..0 [2] q8,q9 -+@ u32 left[y] [2] [r2] -+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] -+ -+1: -+ vadd.i16 q10, q2, q0 -+ subs r1, #2 -+ vld1.32 {d24[],d25[]}, [r2]! -+ vadd.i16 q11, q3, q1 -+ vld1.32 {d28[],d29[]}, [r2]! -+ vmla.i16 q2, q8, q12 -+ vmla.i16 q3, q9, q12 -+ vadd.i16 q12, q10, q0 -+ vmla.i16 q10, q8, q14 -+ vadd.i16 q13, q11, q1 -+ vmla.i16 q11, q9, q14 -+ vrshr.u16 q14, q2, #4 -+ vrshr.u16 q15, q3, #4 -+ vmov q2, q12 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ vrshr.u16 q14, q10, #4 -+ vrshr.u16 q15, q11, #4 -+ vmov q3, q13 -+ vst1.16 {q14-q15}, [r0 :128], r3 -+ bne 1b -+ -+ bx lr -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_16_neon_8 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 -+ -+ vld1.8 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nbx2_15_0_1_16 + 32 -+ vpush {d8-d12} -+ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16} -+ add r2, #32 -+ vld1.16 {d8[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vshll.u8 q8, d0, #4 -+ mov r1, #16 -+ vld1.16 {d9[]}, [r2] @ Left (lower) -+ sub r2, #32 -+ vshll.u8 q9, d1, #4 -+ lsl r3, #1 -+ vshll.u8 q10, d2, #4 -+ vshll.u8 q11, d3, #4 -+ vmlal.u8 q8, d4, d8 -+ vsubl.u8 q12, d9, d0 -+ vmlal.u8 q9, d5, d8 -+ vsubl.u8 q13, d9, d1 -+ vmlal.u8 q10, d6, d8 -+ vsubl.u8 q14, d9, d2 -+ vmlal.u8 q11, d7, d8 @ Acc set up -+ vsubl.u8 q15, d9, d3 @ Add set up -+ vadd.i16 q8, q12 -+ vadd.i16 q9, q13 -+ vadd.i16 q10, q14 -+ vadd.i16 q11, q15 -+ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0} -+ -+@ u8 15..0 [2] q4,q5 -+@ u8 left[y] [2] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] -+ -+ vld1.16 {d12[]}, [r2]! -+ vadd.i16 q0, q8, q12 -+ b 2f -+1: -+ vld1.16 {d12[]}, [r2]! -+ vrshrn.u16 d3, q1, #5 -+ vrshrn.u16 d2, q0, #5 -+ vadd.i16 q0, q8, q12 -+ vrshrn.u16 d4, q2, #5 -+ vrshrn.u16 d5, q3, #5 -+ vst1.8 {q1-q2}, [r0 :128], r3 -+2: vadd.i16 q1, q9, q13 -+ subs r1, #2 -+ vadd.i16 q2, q10, q14 -+ vadd.i16 q3, q11, q15 -+ vmlal.u8 q8, d8, d12 -+ vmlal.u8 q9, d9, d12 -+ vmlal.u8 q10, d10, d12 -+ vmlal.u8 q11, d11, d12 -+ vld1.16 {d12[]}, [r2]! 
-+ vrshrn.u16 d19, q9, #5 -+ vrshrn.u16 d18, q8, #5 -+ vadd.i16 q8, q0, q12 -+ vrshrn.u16 d20, q10, #5 -+ vrshrn.u16 d21, q11, #5 -+ vst1.8 {q9-q10}, [r0 :128], r3 -+ vadd.i16 q9, q1, q13 -+ vadd.i16 q10, q2, q14 -+ vadd.i16 q11, q3, q15 -+ vmlal.u8 q0, d8, d12 -+ vmlal.u8 q1, d9, d12 -+ vmlal.u8 q2, d10, d12 -+ vmlal.u8 q3, d11, d12 -+ -+ bne 1b -+ -+ vpop {d8-d12} -+ -+ vrshrn.u16 d3, q1, #5 -+ vrshrn.u16 d2, q0, #5 -+ vrshrn.u16 d4, q2, #5 -+ vrshrn.u16 d5, q3, #5 -+ vst1.8 {q1-q2}, [r0 :128] -+ -+ bx lr -+ -+endfunc -+ -+ -+@ ff_hevc_rpi_pred_planar_c_16_neon_10 -+@ uint8_t *_src, [r0] -+@ const uint8_t *_top, [r1] -+@ const uint8_t *_left, [r2] -+@ ptrdiff_t stride) [r3] -+ -+function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 -+ -+ @ Load from bytes & expand later - at the very least this uses less -+ @ memory than having a short table -+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) -+ adr ip, nbx2_15_0_1_16 + 32 -+ vpush {q4-q7} -+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) -+ add r2, #64 -+ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16} -+T lsl r3, #2 -+ vld1.32 {d8[],d9[]}, [r1] @ Top (right) -+ sub ip, #32 -+ vmovl.u8 q12, d28 -+ mov r1, #16 -+ vmovl.u8 q13, d29 -+ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0} -+ vmovl.u8 q14, d30 -+ vmovl.u8 q15, d31 -+ vld1.32 {d10[],d11[]}, [r2] @ Left (lower) -+ sub r2, #64 -+ vshl.i16 q8, q0, #4 -+ vshl.i16 q9, q1, #4 -+ vshl.i16 q10, q2, #4 -+ vshl.i16 q11, q3, #4 -+ vmla.i16 q8, q12, q4 -+ vsub.i16 q0, q5, q0 -+ vmla.i16 q9, q13, q4 -+ vpush {q0} -+ vsub.i16 q1, q5, q1 -+ vmla.i16 q10, q14, q4 -+ vsub.i16 q2, q5, q2 -+ vmla.i16 q11, q15, q4 @ Acc set up -+ vsub.i16 q3, q5, q3 @ Add set up -+ vadd.i16 q8, q0 -+ vadd.i16 q9, q1 -+ vadd.i16 q10, q2 -+ vadd.i16 q11, q3 -+ vmovl.u8 q4, d12 -+ vmovl.u8 q5, d13 -+ vmovl.u8 q6, d14 -+ vmovl.u8 q7, d15 -+ -+@ u16 31..0 [4] q4-q7 -+@ u16 left[y] [4] [r2] -+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially -+@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] -+ -+ vadd.i16 q12, q8, q0 -+A sub r0, r0, r3, lsl #2 -+T sub r0, r3 -+1: -+ vld1.32 {d0[],d1[]}, [r2]! -+A add r0, r0, r3, lsl #2 -+T add r0, r3 -+ vadd.i16 q13, q9, q1 -+ subs r1, #2 -+ vadd.i16 q14, q10, q2 -+ vadd.i16 q15, q11, q3 -+ vmla.i16 q8, q4, q0 -+ vmla.i16 q9, q5, q0 -+ vmla.i16 q10, q6, q0 -+ vmla.i16 q11, q7, q0 -+ vld1.16 {q0}, [sp] -+ vrshr.u16 q8, #5 -+ vrshr.u16 q9, #5 -+ vrshr.u16 q10, #5 -+ vrshr.u16 q11, #5 -+ vstm r0, {q8-q11} -+ vadd.i16 q8, q12, q0 -+A add r0, r0, r3, lsl #2 -+T add r0, r3 -+ vld1.32 {d0[],d1[]}, [r2]! -+ vadd.i16 q9, q13, q1 -+ vadd.i16 q10, q14, q2 -+ vadd.i16 q11, q15, q3 -+ vmla.i16 q12, q4, q0 -+ vmla.i16 q13, q5, q0 -+ vmla.i16 q14, q6, q0 -+ vmla.i16 q15, q7, q0 -+ vld1.16 {q0}, [sp] -+ vrshr.u16 q12, #5 -+ vrshr.u16 q13, #5 -+ vrshr.u16 q14, #5 -+ vrshr.u16 q15, #5 -+ vstm r0, {q12-q15} -+ vadd.i16 q12, q8, q0 -+ bne 1b -+ -+ vpop {q3-q7} -+ bx lr -+ -+endfunc -diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index d234271c5b..5699e14fea 100644 ---- a/libavcodec/avcodec.h -+++ b/libavcodec/avcodec.h -@@ -2612,6 +2612,7 @@ typedef struct AVCodecContext { - #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. - #define FF_BUG_TRUNCATED 16384 - #define FF_BUG_IEDGE 32768 -+#define FF_BUG_GMC_UNSUPPORTED (1<<30) - - /** - * strictly follow the standard (MPEG-4, ...). -@@ -3272,8 +3273,7 @@ typedef struct AVCodecContext { - #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 - #endif - -- /** -- * Audio only. 
The amount of padding (in samples) appended by the encoder to
-+ /* Audio only. The amount of padding (in samples) appended by the encoder to
- * the end of the audio. I.e. this number of decoded samples must be
- * discarded by the caller from the end of the stream to get the original
- * audio without any trailing padding.
-@@ -3806,6 +3806,17 @@ typedef struct AVHWAccel {
- * that avctx->hwaccel_priv_data is invalid.
- */
- int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
-+
-+ /**
-+ * Called if parsing fails
-+ *
-+ * An error has occurred, end_frame will not be called
-+ * start_frame & decode_slice may or may not have been called
-+ * Optional
-+ *
-+ * @param avctx the codec context
-+ */
-+ void (*abort_frame)(AVCodecContext *avctx);
- } AVHWAccel;
- 
- /**
-@@ -4666,6 +4677,17 @@ void av_packet_rescale_ts(AVPacket *pkt, AVRational tb_src, AVRational tb_dst);
- */
- AVCodec *avcodec_find_decoder(enum AVCodecID id);
- 
-+/**
-+ * Find a registered decoder with a matching codec ID and pix_fmt.
-+ * A decoder with pix_fmt set to NULL will match any fmt.
-+ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL.
-+ *
-+ * @param id AVCodecID of the requested decoder
-+ * @param fmt AVPixelFormat that must be supported by the decoder
-+ * @return A decoder if one was found, NULL otherwise.
-+ */
-+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
-+
- /**
- * Find a registered decoder with the specified name.
- *
-diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
-index 1bf1c620d6..ccfa991f60 100644
---- a/libavcodec/cabac.h
-+++ b/libavcodec/cabac.h
-@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
- typedef struct CABACContext{
- int low;
- int range;
-- int outstanding_count;
-+ union
-+ {
-+ int outstanding_count;
-+ struct {
-+ uint16_t bits;
-+ uint16_t range;
-+ } by22;
-+ };
- const uint8_t *bytestream_start;
- const uint8_t *bytestream;
- const uint8_t *bytestream_end;
-diff --git a/libavcodec/h264-ctrls.h b/libavcodec/h264-ctrls.h
-new file mode 100644
-index 0000000000..76020ebd1e
---- /dev/null
-+++ b/libavcodec/h264-ctrls.h
-@@ -0,0 +1,214 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the H.264 state controls for use with stateless H.264
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _H264_CTRLS_H_
-+#define _H264_CTRLS_H_
-+
-+#include <linux/types.h>
-+
-+/* Our pixel format isn't stable at the moment */
-+#define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */
-+
-+/*
-+ * This is put insanely high to avoid conflicting with controls that
-+ * would be added during the phase where those controls are not
-+ * stable. It should be fixed eventually.
-+ */ -+#define V4L2_CID_MPEG_VIDEO_H264_SPS (V4L2_CID_MPEG_BASE+1000) -+#define V4L2_CID_MPEG_VIDEO_H264_PPS (V4L2_CID_MPEG_BASE+1001) -+#define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX (V4L2_CID_MPEG_BASE+1002) -+#define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS (V4L2_CID_MPEG_BASE+1003) -+#define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS (V4L2_CID_MPEG_BASE+1004) -+#define V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE (V4L2_CID_MPEG_BASE+1005) -+#define V4L2_CID_MPEG_VIDEO_H264_START_CODE (V4L2_CID_MPEG_BASE+1006) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_H264_SPS 0x0110 -+#define V4L2_CTRL_TYPE_H264_PPS 0x0111 -+#define V4L2_CTRL_TYPE_H264_SCALING_MATRIX 0x0112 -+#define V4L2_CTRL_TYPE_H264_SLICE_PARAMS 0x0113 -+#define V4L2_CTRL_TYPE_H264_DECODE_PARAMS 0x0114 -+ -+enum v4l2_mpeg_video_h264_decode_mode { -+ V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED, -+ V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED, -+}; -+ -+enum v4l2_mpeg_video_h264_start_code { -+ V4L2_MPEG_VIDEO_H264_START_CODE_NONE, -+ V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B, -+}; -+ -+#define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG 0x01 -+#define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG 0x02 -+#define V4L2_H264_SPS_CONSTRAINT_SET2_FLAG 0x04 -+#define V4L2_H264_SPS_CONSTRAINT_SET3_FLAG 0x08 -+#define V4L2_H264_SPS_CONSTRAINT_SET4_FLAG 0x10 -+#define V4L2_H264_SPS_CONSTRAINT_SET5_FLAG 0x20 -+ -+#define V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE 0x01 -+#define V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS 0x02 -+#define V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO 0x04 -+#define V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED 0x08 -+#define V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY 0x10 -+#define V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD 0x20 -+#define V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE 0x40 -+ -+struct v4l2_ctrl_h264_sps { -+ __u8 profile_idc; -+ __u8 constraint_set_flags; -+ __u8 level_idc; -+ __u8 seq_parameter_set_id; -+ __u8 chroma_format_idc; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_frame_num_minus4; -+ __u8 pic_order_cnt_type; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 max_num_ref_frames; -+ __u8 num_ref_frames_in_pic_order_cnt_cycle; -+ __s32 offset_for_ref_frame[255]; -+ __s32 offset_for_non_ref_pic; -+ __s32 offset_for_top_to_bottom_field; -+ __u16 pic_width_in_mbs_minus1; -+ __u16 pic_height_in_map_units_minus1; -+ __u32 flags; -+}; -+ -+#define V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE 0x0001 -+#define V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT 0x0002 -+#define V4L2_H264_PPS_FLAG_WEIGHTED_PRED 0x0004 -+#define V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT 0x0008 -+#define V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED 0x0010 -+#define V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT 0x0020 -+#define V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE 0x0040 -+#define V4L2_H264_PPS_FLAG_PIC_SCALING_MATRIX_PRESENT 0x0080 -+ -+struct v4l2_ctrl_h264_pps { -+ __u8 pic_parameter_set_id; -+ __u8 seq_parameter_set_id; -+ __u8 num_slice_groups_minus1; -+ __u8 num_ref_idx_l0_default_active_minus1; -+ __u8 num_ref_idx_l1_default_active_minus1; -+ __u8 weighted_bipred_idc; -+ __s8 pic_init_qp_minus26; -+ __s8 pic_init_qs_minus26; -+ __s8 chroma_qp_index_offset; -+ __s8 second_chroma_qp_index_offset; -+ __u16 flags; -+}; -+ -+struct v4l2_ctrl_h264_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+}; -+ -+struct v4l2_h264_weight_factors { -+ __s16 luma_weight[32]; -+ __s16 luma_offset[32]; -+ __s16 chroma_weight[32][2]; -+ __s16 chroma_offset[32][2]; 
-+}; -+ -+struct v4l2_h264_pred_weight_table { -+ __u16 luma_log2_weight_denom; -+ __u16 chroma_log2_weight_denom; -+ struct v4l2_h264_weight_factors weight_factors[2]; -+}; -+ -+#define V4L2_H264_SLICE_TYPE_P 0 -+#define V4L2_H264_SLICE_TYPE_B 1 -+#define V4L2_H264_SLICE_TYPE_I 2 -+#define V4L2_H264_SLICE_TYPE_SP 3 -+#define V4L2_H264_SLICE_TYPE_SI 4 -+ -+#define V4L2_H264_SLICE_FLAG_FIELD_PIC 0x01 -+#define V4L2_H264_SLICE_FLAG_BOTTOM_FIELD 0x02 -+#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x04 -+#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x08 -+ -+struct v4l2_ctrl_h264_slice_params { -+ /* Size in bytes, including header */ -+ __u32 size; -+ -+ /* Offset in bytes to the start of slice in the OUTPUT buffer. */ -+ __u32 start_byte_offset; -+ -+ /* Offset in bits to slice_data() from the beginning of this slice. */ -+ __u32 header_bit_size; -+ -+ __u16 first_mb_in_slice; -+ __u8 slice_type; -+ __u8 pic_parameter_set_id; -+ __u8 colour_plane_id; -+ __u8 redundant_pic_cnt; -+ __u16 frame_num; -+ __u16 idr_pic_id; -+ __u16 pic_order_cnt_lsb; -+ __s32 delta_pic_order_cnt_bottom; -+ __s32 delta_pic_order_cnt0; -+ __s32 delta_pic_order_cnt1; -+ -+ struct v4l2_h264_pred_weight_table pred_weight_table; -+ /* Size in bits of dec_ref_pic_marking() syntax element. */ -+ __u32 dec_ref_pic_marking_bit_size; -+ /* Size in bits of pic order count syntax. */ -+ __u32 pic_order_cnt_bit_size; -+ -+ __u8 cabac_init_idc; -+ __s8 slice_qp_delta; -+ __s8 slice_qs_delta; -+ __u8 disable_deblocking_filter_idc; -+ __s8 slice_alpha_c0_offset_div2; -+ __s8 slice_beta_offset_div2; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u32 slice_group_change_cycle; -+ -+ /* -+ * Entries on each list are indices into -+ * v4l2_ctrl_h264_decode_params.dpb[]. 
-+ */ -+ __u8 ref_pic_list0[32]; -+ __u8 ref_pic_list1[32]; -+ -+ __u32 flags; -+}; -+ -+#define V4L2_H264_DPB_ENTRY_FLAG_VALID 0x01 -+#define V4L2_H264_DPB_ENTRY_FLAG_ACTIVE 0x02 -+#define V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM 0x04 -+#define V4L2_H264_DPB_ENTRY_FLAG_FIELD_PICTURE 0x08 -+#define V4L2_H264_DPB_ENTRY_FLAG_REF_TOP 0x10 -+#define V4L2_H264_DPB_ENTRY_FLAG_REF_BOTTOM 0x20 -+#define V4L2_H264_DPB_ENTRY_FLAG_REF_FRAME 0x30 -+ -+struct v4l2_h264_dpb_entry { -+ __u64 reference_ts; -+ __u16 frame_num; -+ __u16 pic_num; -+ /* Note that field is indicated by v4l2_buffer.field */ -+ __s32 top_field_order_cnt; -+ __s32 bottom_field_order_cnt; -+ __u32 flags; /* V4L2_H264_DPB_ENTRY_FLAG_* */ -+}; -+ -+#define V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC 0x01 -+ -+struct v4l2_ctrl_h264_decode_params { -+ struct v4l2_h264_dpb_entry dpb[16]; -+ __u16 num_slices; -+ __u16 nal_ref_idc; -+ __s32 top_field_order_cnt; -+ __s32 bottom_field_order_cnt; -+ __u32 flags; /* V4L2_H264_DECODE_PARAM_FLAG_* */ -+}; -+ -+#endif -diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c -index 5ceee107a0..7a6a894ed1 100644 ---- a/libavcodec/h264_slice.c -+++ b/libavcodec/h264_slice.c -@@ -765,6 +765,7 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) - #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \ - (CONFIG_H264_D3D11VA_HWACCEL * 2) + \ - CONFIG_H264_NVDEC_HWACCEL + \ -+ CONFIG_H264_V4L2REQUEST_HWACCEL + \ - CONFIG_H264_VAAPI_HWACCEL + \ - CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \ - CONFIG_H264_VDPAU_HWACCEL) -@@ -849,6 +850,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) - #endif - #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL - *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; -+#endif -+#if CONFIG_H264_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - if (h->avctx->codec->pix_fmts) - choices = h->avctx->codec->pix_fmts; -@@ -1731,7 +1735,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - unsigned int slice_type, tmp, i; - int field_pic_flag, bottom_field_flag; - int first_slice = sl == h->slice_ctx && !h->current_slice; -- int picture_structure; -+ int picture_structure, pos; - - if (first_slice) - av_assert0(!h->setup_finished); -@@ -1819,8 +1823,9 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - } - - if (nal->type == H264_NAL_IDR_SLICE) -- get_ue_golomb_long(&sl->gb); /* idr_pic_id */ -+ sl->idr_pic_id = get_ue_golomb_long(&sl->gb); - -+ pos = sl->gb.index; - if (sps->poc_type == 0) { - sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb); - -@@ -1834,6 +1839,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - if (pps->pic_order_present == 1 && picture_structure == PICT_FRAME) - sl->delta_poc[1] = get_se_golomb(&sl->gb); - } -+ sl->pic_order_cnt_bit_size = sl->gb.index - pos; - - sl->redundant_pic_count = 0; - if (pps->redundant_pic_cnt_present) -@@ -1873,9 +1879,11 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - - sl->explicit_ref_marking = 0; - if (nal->ref_idc) { -+ int bit_pos = sl->gb.index; - ret = ff_h264_decode_ref_pic_marking(sl, &sl->gb, nal, h->avctx); - if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE)) - return AVERROR_INVALIDDATA; -+ sl->ref_pic_marking_size_in_bits = sl->gb.index - bit_pos; - } - - if (sl->slice_type_nos != AV_PICTURE_TYPE_I && pps->cabac) { -diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c -index 8d1bd16a8e..7acd292fe1 100644 ---- a/libavcodec/h264dec.c -+++ 
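The h264_slice.c hunks above stop discarding idr_pic_id and start measuring the bit sizes of the POC and dec_ref_pic_marking() syntax precisely because struct v4l2_ctrl_h264_slice_params carries those values. A minimal sketch of the mapping, assuming a request-API hwaccel; fill_slice_params() and its caller are hypothetical, the field names on both sides come from the hunks above:

/* Sketch only: copy the newly parsed values into the request-API control.
 * fill_slice_params() is a hypothetical helper, not part of the patch. */
static void fill_slice_params(const H264SliceContext *sl,
                              struct v4l2_ctrl_h264_slice_params *sp)
{
    sp->frame_num                    = sl->frame_num;
    sp->idr_pic_id                   = sl->idr_pic_id;
    sp->pic_order_cnt_bit_size       = sl->pic_order_cnt_bit_size;
    sp->dec_ref_pic_marking_bit_size = sl->ref_pic_marking_size_in_bits;
}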
b/libavcodec/h264dec.c
-@@ -1080,6 +1080,9 @@ AVCodec ff_h264_decoder = {
- #endif
- #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL
- HWACCEL_VIDEOTOOLBOX(h264),
-+#endif
-+#if CONFIG_H264_V4L2REQUEST_HWACCEL
-+ HWACCEL_V4L2REQUEST(h264),
- #endif
- NULL
- },
-diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h
-index 1d9723260d..ec71e22cec 100644
---- a/libavcodec/h264dec.h
-+++ b/libavcodec/h264dec.h
-@@ -184,6 +184,8 @@ typedef struct H264SliceContext {
- int slice_type_nos; ///< S free slice type (SI/SP are remapped to I/P)
- int slice_type_fixed;
-
-+ int idr_pic_id;
-+
- int qscale;
- int chroma_qp[2]; // QPc
- int qp_thresh; ///< QP threshold to skip loopfilter
-@@ -322,11 +324,13 @@ typedef struct H264SliceContext {
- MMCO mmco[MAX_MMCO_COUNT];
- int nb_mmco;
- int explicit_ref_marking;
-+ int ref_pic_marking_size_in_bits;
-
- int frame_num;
- int poc_lsb;
- int delta_poc_bottom;
- int delta_poc[2];
-+ int pic_order_cnt_bit_size;
- int curr_pic_num;
- int max_pic_num;
- } H264SliceContext;
-diff --git a/libavcodec/hevc-ctrls.h b/libavcodec/hevc-ctrls.h
-new file mode 100644
-index 0000000000..13698d3f33
---- /dev/null
-+++ b/libavcodec/hevc-ctrls.h
-@@ -0,0 +1,230 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the HEVC state controls for use with stateless HEVC
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _HEVC_CTRLS_H_
-+#define _HEVC_CTRLS_H_
-+
-+#include <linux/types.h>
-+
-+/* The pixel format isn't stable at the moment and will likely be renamed. */
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016)
-+
-+/* enum v4l2_ctrl_type type values */
-+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
-+
-+enum v4l2_mpeg_video_hevc_decode_mode {
-+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-+ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-+};
-+
-+enum v4l2_mpeg_video_hevc_start_code {
-+ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-+ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-+};
-+
-+#define V4L2_HEVC_SLICE_TYPE_B 0
-+#define V4L2_HEVC_SLICE_TYPE_P 1
-+#define V4L2_HEVC_SLICE_TYPE_I 2
-+
-+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
-+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
-+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
-+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
-+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
-+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
-+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
-+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
-+
-+/* The controls are not stable at the moment and will likely be reworked. */
-+struct v4l2_ctrl_hevc_sps {
-+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ -+ __u16 pic_width_in_luma_samples; -+ __u16 pic_height_in_luma_samples; -+ __u8 bit_depth_luma_minus8; -+ __u8 bit_depth_chroma_minus8; -+ __u8 log2_max_pic_order_cnt_lsb_minus4; -+ __u8 sps_max_dec_pic_buffering_minus1; -+ __u8 sps_max_num_reorder_pics; -+ __u8 sps_max_latency_increase_plus1; -+ __u8 log2_min_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_luma_coding_block_size; -+ __u8 log2_min_luma_transform_block_size_minus2; -+ __u8 log2_diff_max_min_luma_transform_block_size; -+ __u8 max_transform_hierarchy_depth_inter; -+ __u8 max_transform_hierarchy_depth_intra; -+ __u8 pcm_sample_bit_depth_luma_minus1; -+ __u8 pcm_sample_bit_depth_chroma_minus1; -+ __u8 log2_min_pcm_luma_coding_block_size_minus3; -+ __u8 log2_diff_max_min_pcm_luma_coding_block_size; -+ __u8 num_short_term_ref_pic_sets; -+ __u8 num_long_term_ref_pics_sps; -+ __u8 chroma_format_idc; -+ -+ __u8 padding; -+ -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) -+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) -+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) -+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) -+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) -+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) -+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) -+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) -+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) -+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) -+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) -+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) -+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) -+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) -+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) -+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) -+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) -+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) -+ -+struct v4l2_ctrl_hevc_pps { -+ /* ISO/IEC 23008-2, ITU-T Rec. 
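struct v4l2_ctrl_hevc_sps mirrors the bitstream SPS fields that FFmpeg already parses into HEVCSPS, so a request-API hwaccel mostly renames values when it builds the control. A sketch under that assumption; fill_hevc_sps() is hypothetical and the HEVCSPS field names are the usual ones from hevc_ps.h:

/* Illustrative sketch only, not part of the patch. */
static void fill_hevc_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
{
    *ctrl = (struct v4l2_ctrl_hevc_sps) {
        .pic_width_in_luma_samples         = sps->width,
        .pic_height_in_luma_samples        = sps->height,
        .bit_depth_luma_minus8             = sps->bit_depth - 8,
        .bit_depth_chroma_minus8           = sps->bit_depth - 8,
        .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
        .chroma_format_idc                 = sps->chroma_format_idc,
        /* ... the remaining members follow the same pattern ... */
    };
}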
H.265: Picture parameter set */ -+ __u8 num_extra_slice_header_bits; -+ __s8 init_qp_minus26; -+ __u8 diff_cu_qp_delta_depth; -+ __s8 pps_cb_qp_offset; -+ __s8 pps_cr_qp_offset; -+ __u8 num_tile_columns_minus1; -+ __u8 num_tile_rows_minus1; -+ __u8 column_width_minus1[20]; -+ __u8 row_height_minus1[22]; -+ __s8 pps_beta_offset_div2; -+ __s8 pps_tc_offset_div2; -+ __u8 log2_parallel_merge_level_minus2; -+ -+ __u8 padding[4]; -+ __u64 flags; -+}; -+ -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 -+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 -+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 -+ -+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 -+ -+struct v4l2_hevc_dpb_entry { -+ __u64 timestamp; -+ __u8 rps; -+ __u8 field_pic; -+ __u16 pic_order_cnt[2]; -+ __u8 padding[2]; -+}; -+ -+struct v4l2_hevc_pred_weight_table { -+ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; -+ -+ __u8 padding[6]; -+ -+ __u8 luma_log2_weight_denom; -+ __s8 delta_chroma_log2_weight_denom; -+}; -+ -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) -+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) -+ -+struct v4l2_ctrl_hevc_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u32 slice_segment_addr; -+ __u32 num_entry_point_offsets; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ __u8 nal_unit_type; -+ __u8 nuh_temporal_id_plus1; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 slice_type; -+ __u8 colour_plane_id; -+ __u16 slice_pic_order_cnt; -+ __u8 num_ref_idx_l0_active_minus1; -+ __u8 num_ref_idx_l1_active_minus1; -+ __u8 collocated_ref_idx; -+ __u8 five_minus_max_num_merge_cand; -+ __s8 slice_qp_delta; -+ __s8 slice_cb_qp_offset; -+ __s8 slice_cr_qp_offset; -+ __s8 slice_act_y_qp_offset; -+ __s8 slice_act_cb_qp_offset; -+ __s8 slice_act_cr_qp_offset; -+ __s8 slice_beta_offset_div2; -+ __s8 slice_tc_offset_div2; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ __u8 pic_struct; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ __u8 num_active_dpb_entries; -+ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ __u8 num_rps_poc_st_curr_before; -+ __u8 num_rps_poc_st_curr_after; -+ __u8 num_rps_poc_lt_curr; -+ -+ __u8 padding; -+ -+ __u32 entry_point_offset_minus1[256]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ -+ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ -+ struct v4l2_hevc_pred_weight_table pred_weight_table; -+ -+ __u64 flags; -+}; -+ -+struct v4l2_ctrl_hevc_scaling_matrix { -+ __u8 scaling_list_4x4[6][16]; -+ __u8 scaling_list_8x8[6][64]; -+ __u8 scaling_list_16x16[6][64]; -+ __u8 scaling_list_32x32[2][64]; -+ __u8 scaling_list_dc_coef_16x16[6]; -+ __u8 scaling_list_dc_coef_32x32[2]; -+}; -+ -+#endif -diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 8f1c162ace..072b7c103d 100644 ---- a/libavcodec/hevcdec.c -+++ b/libavcodec/hevcdec.c -@@ -373,14 +373,20 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ - CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ - CONFIG_HEVC_NVDEC_HWACCEL + \ -+ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ - CONFIG_HEVC_VAAPI_HWACCEL + \ - CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ -+ CONFIG_HEVC_RPI4_8_HWACCEL + \ -+ CONFIG_HEVC_RPI4_10_HWACCEL + \ - CONFIG_HEVC_VDPAU_HWACCEL) - enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; - - switch (sps->pix_fmt) { - case AV_PIX_FMT_YUV420P: - case AV_PIX_FMT_YUVJ420P: -+#if CONFIG_HEVC_RPI4_8_HWACCEL -+ *fmt++ = AV_PIX_FMT_RPI4_8; -+#endif - #if CONFIG_HEVC_DXVA2_HWACCEL - *fmt++ = AV_PIX_FMT_DXVA2_VLD; - #endif -@@ -399,9 +405,15 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL - *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV420P10: -+#if CONFIG_HEVC_RPI4_10_HWACCEL -+ *fmt++ = AV_PIX_FMT_RPI4_10; -+#endif - #if CONFIG_HEVC_DXVA2_HWACCEL - *fmt++ = AV_PIX_FMT_DXVA2_VLD; - #endif -@@ -417,6 +429,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - #if CONFIG_HEVC_NVDEC_HWACCEL - *fmt++ = AV_PIX_FMT_CUDA; -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ *fmt++ = AV_PIX_FMT_DRM_PRIME; - #endif - break; - case AV_PIX_FMT_YUV444P: -@@ -3220,7 +3235,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output, - s->ref = NULL; - ret = decode_nal_units(s, avpkt->data, avpkt->size); - if (ret < 0) -+ { -+ // Ensure that hwaccel knows this frame is over -+ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) { -+ s->avctx->hwaccel->abort_frame(s->avctx); -+ } -+ - return ret; -+ } - - if (avctx->hwaccel) { - if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { -@@ -3592,6 +3614,15 @@ AVCodec ff_hevc_decoder = { - #endif - #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL - HWACCEL_VIDEOTOOLBOX(hevc), -+#endif -+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL -+ HWACCEL_V4L2REQUEST(hevc), -+#endif -+#if CONFIG_HEVC_RPI4_8_HWACCEL -+ HWACCEL_RPI4_8(hevc), -+#endif -+#if CONFIG_HEVC_RPI4_10_HWACCEL -+ HWACCEL_RPI4_10(hevc), - #endif - NULL - }, -diff --git a/libavcodec/hwaccel.h b/libavcodec/hwaccel.h -index 3aaa92571c..22888ea414 100644 ---- a/libavcodec/hwaccel.h -+++ b/libavcodec/hwaccel.h -@@ -24,6 +24,7 @@ - - - #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) -+#define HWACCEL_CAP_MT_SAFE (1 << 1) - - - typedef struct AVCodecHWConfigInternal { -@@ -80,5 +81,11 @@ typedef struct AVCodecHWConfigInternal { - HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel) - #define HWACCEL_XVMC(codec) \ - HW_CONFIG_HWACCEL(0, 0, 1, XVMC, NONE, ff_ ## codec ## _xvmc_hwaccel) -+#define 
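Because the hevcdec.c hunks above add AV_PIX_FMT_DRM_PRIME to the candidate list handed to the format negotiation, an application opts into the V4L2 request hwaccel through the standard get_format callback. A caller-side sketch; select_drm_prime() is an illustrative name, not part of the patch:

/* Prefer the request-API hwaccel when it is offered. */
static enum AVPixelFormat select_drm_prime(AVCodecContext *avctx,
                                           const enum AVPixelFormat *fmts)
{
    for (const enum AVPixelFormat *p = fmts; *p != AV_PIX_FMT_NONE; p++) {
        if (*p == AV_PIX_FMT_DRM_PRIME)
            return *p;
    }
    return fmts[0]; /* fall back to the decoder's first choice */
}

/* ... avctx->get_format = select_drm_prime;  (set before avcodec_open2) */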
HWACCEL_V4L2REQUEST(codec) \ -+ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) -+#define HWACCEL_RPI4_8(codec) \ -+ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel) -+#define HWACCEL_RPI4_10(codec) \ -+ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel) - - #endif /* AVCODEC_HWACCEL_H */ -diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h -index 7d73da8676..4eeea9dc0d 100644 ---- a/libavcodec/hwaccels.h -+++ b/libavcodec/hwaccels.h -@@ -27,6 +27,7 @@ extern const AVHWAccel ff_h264_d3d11va_hwaccel; - extern const AVHWAccel ff_h264_d3d11va2_hwaccel; - extern const AVHWAccel ff_h264_dxva2_hwaccel; - extern const AVHWAccel ff_h264_nvdec_hwaccel; -+extern const AVHWAccel ff_h264_v4l2request_hwaccel; - extern const AVHWAccel ff_h264_vaapi_hwaccel; - extern const AVHWAccel ff_h264_vdpau_hwaccel; - extern const AVHWAccel ff_h264_videotoolbox_hwaccel; -@@ -34,6 +35,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; - extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; - extern const AVHWAccel ff_hevc_dxva2_hwaccel; - extern const AVHWAccel ff_hevc_nvdec_hwaccel; -+extern const AVHWAccel ff_hevc_v4l2request_hwaccel; - extern const AVHWAccel ff_hevc_vaapi_hwaccel; - extern const AVHWAccel ff_hevc_vdpau_hwaccel; - extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; -@@ -47,6 +49,7 @@ extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel; - extern const AVHWAccel ff_mpeg2_d3d11va2_hwaccel; - extern const AVHWAccel ff_mpeg2_nvdec_hwaccel; - extern const AVHWAccel ff_mpeg2_dxva2_hwaccel; -+extern const AVHWAccel ff_mpeg2_v4l2request_hwaccel; - extern const AVHWAccel ff_mpeg2_vaapi_hwaccel; - extern const AVHWAccel ff_mpeg2_vdpau_hwaccel; - extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel; -@@ -63,6 +66,7 @@ extern const AVHWAccel ff_vc1_vaapi_hwaccel; - extern const AVHWAccel ff_vc1_vdpau_hwaccel; - extern const AVHWAccel ff_vp8_nvdec_hwaccel; - extern const AVHWAccel ff_vp8_vaapi_hwaccel; -+extern const AVHWAccel ff_vp8_v4l2request_hwaccel; - extern const AVHWAccel ff_vp9_d3d11va_hwaccel; - extern const AVHWAccel ff_vp9_d3d11va2_hwaccel; - extern const AVHWAccel ff_vp9_dxva2_hwaccel; -@@ -74,5 +78,7 @@ extern const AVHWAccel ff_wmv3_dxva2_hwaccel; - extern const AVHWAccel ff_wmv3_nvdec_hwaccel; - extern const AVHWAccel ff_wmv3_vaapi_hwaccel; - extern const AVHWAccel ff_wmv3_vdpau_hwaccel; -+extern const AVHWAccel ff_hevc_rpi4_8_hwaccel; -+extern const AVHWAccel ff_hevc_rpi4_10_hwaccel; - - #endif /* AVCODEC_HWACCELS_H */ -diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c -index 12c63245f8..1bbb83eda3 100644 ---- a/libavcodec/libdav1d.c -+++ b/libavcodec/libdav1d.c -@@ -53,6 +53,16 @@ static const enum AVPixelFormat pix_fmt_rgb[3] = { - AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, - }; - -+static enum AVPixelFormat libdav1d_get_format(AVCodecContext *avctx, const Dav1dPicture *p) -+{ -+ enum AVPixelFormat pix_fmts[2], *fmt = pix_fmts; -+ -+ *fmt++ = pix_fmt[p->p.layout][p->seq_hdr->hbd]; -+ *fmt = AV_PIX_FMT_NONE; -+ -+ return ff_get_format(avctx, pix_fmts); -+} -+ - static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl) - { - AVCodecContext *c = opaque; -@@ -229,6 +239,7 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame) - c->profile = p->seq_hdr->profile; - c->level = ((p->seq_hdr->operating_points[0].major_level - 2) << 2) - | p->seq_hdr->operating_points[0].minor_level; -+ frame->format = c->pix_fmt = libdav1d_get_format(c, p); - frame->width = 
p->p.w; - frame->height = p->p.h; - if (c->width != p->p.w || c->height != p->p.h) { -diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 647a22ef7c..4ed35d1126 100644 ---- a/libavcodec/mmaldec.c -+++ b/libavcodec/mmaldec.c -@@ -24,6 +24,9 @@ - * MMAL Video Decoder - */ - -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" - #include - #include - #include -@@ -31,6 +34,7 @@ - #include - #include - #include -+#pragma GCC diagnostic pop - #include - - #include "avcodec.h" -diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c -index 83e537884b..305127bc94 100644 ---- a/libavcodec/mpeg12dec.c -+++ b/libavcodec/mpeg12dec.c -@@ -1156,6 +1156,9 @@ static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = { - #endif - #if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL - AV_PIX_FMT_VIDEOTOOLBOX, -+#endif -+#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL -+ AV_PIX_FMT_DRM_PRIME, - #endif - AV_PIX_FMT_YUV420P, - AV_PIX_FMT_NONE -@@ -2941,6 +2944,9 @@ AVCodec ff_mpeg2video_decoder = { - #endif - #if CONFIG_MPEG2_XVMC_HWACCEL - HWACCEL_XVMC(mpeg2), -+#endif -+#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL -+ HWACCEL_V4L2REQUEST(mpeg2), - #endif - NULL - }, -diff --git a/libavcodec/mpeg2-ctrls.h b/libavcodec/mpeg2-ctrls.h -new file mode 100644 -index 0000000000..6601455b3d ---- /dev/null -+++ b/libavcodec/mpeg2-ctrls.h -@@ -0,0 +1,82 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * These are the MPEG2 state controls for use with stateless MPEG-2 -+ * codec drivers. -+ * -+ * It turns out that these structs are not stable yet and will undergo -+ * more changes. So keep them private until they are stable and ready to -+ * become part of the official public API. -+ */ -+ -+#ifndef _MPEG2_CTRLS_H_ -+#define _MPEG2_CTRLS_H_ -+ -+#define V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (V4L2_CID_MPEG_BASE+250) -+#define V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION (V4L2_CID_MPEG_BASE+251) -+ -+/* enum v4l2_ctrl_type type values */ -+#define V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS 0x0103 -+#define V4L2_CTRL_TYPE_MPEG2_QUANTIZATION 0x0104 -+ -+#define V4L2_MPEG2_PICTURE_CODING_TYPE_I 1 -+#define V4L2_MPEG2_PICTURE_CODING_TYPE_P 2 -+#define V4L2_MPEG2_PICTURE_CODING_TYPE_B 3 -+#define V4L2_MPEG2_PICTURE_CODING_TYPE_D 4 -+ -+struct v4l2_mpeg2_sequence { -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */ -+ __u16 horizontal_size; -+ __u16 vertical_size; -+ __u32 vbv_buffer_size; -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ -+ __u16 profile_and_level_indication; -+ __u8 progressive_sequence; -+ __u8 chroma_format; -+}; -+ -+struct v4l2_mpeg2_picture { -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ -+ __u8 picture_coding_type; -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ -+ __u8 f_code[2][2]; -+ __u8 intra_dc_precision; -+ __u8 picture_structure; -+ __u8 top_field_first; -+ __u8 frame_pred_frame_dct; -+ __u8 concealment_motion_vectors; -+ __u8 q_scale_type; -+ __u8 intra_vlc_format; -+ __u8 alternate_scan; -+ __u8 repeat_first_field; -+ __u16 progressive_frame; -+}; -+ -+struct v4l2_ctrl_mpeg2_slice_params { -+ __u32 bit_size; -+ __u32 data_bit_offset; -+ __u64 backward_ref_ts; -+ __u64 forward_ref_ts; -+ -+ struct v4l2_mpeg2_sequence sequence; -+ struct v4l2_mpeg2_picture picture; -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ -+ __u32 quantiser_scale_code; -+}; -+ -+struct v4l2_ctrl_mpeg2_quantization { -+ /* ISO/IEC 13818-2, ITU-T Rec. 
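The *-ctrls.h headers added by this patch only define control payloads; at run time each payload is attached to a media request with VIDIOC_S_EXT_CTRLS before the request is queued. A sketch for the MPEG-2 case, following the standard V4L2 request API; video_fd, req_fd and the helper name are assumptions for illustration:

#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* Sketch: bind parsed MPEG-2 slice parameters to a media request so the
 * driver applies them when the request is queued. req_fd comes from
 * MEDIA_IOC_REQUEST_ALLOC, video_fd is the stateless decoder node. */
static int set_mpeg2_slice_params(int video_fd, int req_fd,
                                  struct v4l2_ctrl_mpeg2_slice_params *sp)
{
    struct v4l2_ext_control ctrl = {
        .id   = V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS,
        .size = sizeof(*sp),
        .ptr  = sp,
    };
    struct v4l2_ext_controls ctrls = {
        .which      = V4L2_CTRL_WHICH_REQUEST_VAL, /* apply at request run time */
        .request_fd = req_fd,
        .count      = 1,
        .controls   = &ctrl,
    };
    return ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &ctrls);
}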
H.262: Quant matrix extension */ -+ __u8 load_intra_quantiser_matrix; -+ __u8 load_non_intra_quantiser_matrix; -+ __u8 load_chroma_intra_quantiser_matrix; -+ __u8 load_chroma_non_intra_quantiser_matrix; -+ -+ __u8 intra_quantiser_matrix[64]; -+ __u8 non_intra_quantiser_matrix[64]; -+ __u8 chroma_intra_quantiser_matrix[64]; -+ __u8 chroma_non_intra_quantiser_matrix[64]; -+}; -+ -+#endif -diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c -index 055afabc7e..fa208660c8 100644 ---- a/libavcodec/mpeg4videodec.c -+++ b/libavcodec/mpeg4videodec.c -@@ -2662,6 +2662,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - - if (ctx->divx_version >= 0) - s->workaround_bugs |= FF_BUG_HPEL_CHROMA; -+ -+ if (ctx->num_sprite_warping_points > 1) -+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED; - } - - if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2686,6 +2689,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, - ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : ""); - -+ avctx->workaround_bugs = s->workaround_bugs; - if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && - s->codec_id == AV_CODEC_ID_MPEG4 && - avctx->idct_algo == FF_IDCT_AUTO) { -diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c -index 36ac0ac1e5..9f8dc81d38 100644 ---- a/libavcodec/pthread_frame.c -+++ b/libavcodec/pthread_frame.c -@@ -191,7 +191,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg) - - /* if the previous thread uses hwaccel then we take the lock to ensure - * the threads don't run concurrently */ -- if (avctx->hwaccel) { -+ if (avctx->hwaccel && -+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { - pthread_mutex_lock(&p->parent->hwaccel_mutex); - p->hwaccel_serializing = 1; - } -@@ -603,7 +604,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { - - if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; - -- if (avctx->hwaccel && !p->hwaccel_serializing) { -+ if (avctx->hwaccel && -+ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && -+ !p->hwaccel_serializing) { - pthread_mutex_lock(&p->parent->hwaccel_mutex); - p->hwaccel_serializing = 1; - } -diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index b6fb91c1c6..7b2770e780 100644 ---- a/libavcodec/raw.c -+++ b/libavcodec/raw.c -@@ -289,10 +289,20 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { - { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, - { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, - -+ /* RPI (Might as well define for everything) */ -+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, -+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, -+ - /* special */ - { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ - { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ - -+ /* RPI (Might as well define for everything) */ -+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, -+ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, -+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, -+ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, -+ - { AV_PIX_FMT_NONE, 0 }, - }; - -diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d181b74570..3fe2ab445f 100644 ---- a/libavcodec/rawenc.c -+++ b/libavcodec/rawenc.c -@@ -24,6 +24,7 @@ - * Raw Video Encoder - */ - -+#include "config.h" - #include "avcodec.h" - #include "raw.h" - #include "internal.h" -@@ -31,6 +32,10 @@ - #include "libavutil/intreadwrite.h" - #include "libavutil/imgutils.h" - #include 
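HWACCEL_CAP_MT_SAFE, introduced in the hwaccel.h hunk earlier in this patch, lets the pthread_frame.c changes above skip the global hwaccel_mutex for hwaccels whose callbacks are re-entrant. A sketch of a hwaccel opting in; every example_* name is hypothetical:

/* Sketch: a hwaccel with fully re-entrant callbacks may set
 * HWACCEL_CAP_MT_SAFE so frame threads no longer serialize on
 * hwaccel_mutex, as wired up in the pthread_frame.c hunk above. */
static int example_start_frame(AVCodecContext *avctx,
                               const uint8_t *buf, uint32_t size) { return 0; }
static int example_decode_slice(AVCodecContext *avctx,
                                const uint8_t *buf, uint32_t size) { return 0; }
static int example_end_frame(AVCodecContext *avctx) { return 0; }

const AVHWAccel ff_example_mt_safe_hwaccel = {
    .name          = "example_mt_safe",
    .type          = AVMEDIA_TYPE_VIDEO,
    .id            = AV_CODEC_ID_HEVC,
    .pix_fmt       = AV_PIX_FMT_DRM_PRIME,
    .start_frame   = example_start_frame,
    .decode_slice  = example_decode_slice,
    .end_frame     = example_end_frame,
    .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
};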
"libavutil/internal.h" -+#include "libavutil/avassert.h" -+#if CONFIG_SAND -+#include "libavutil/rpi_sand_fns.h" -+#endif - - static av_cold int raw_encode_init(AVCodecContext *avctx) - { -@@ -49,12 +54,95 @@ FF_ENABLE_DEPRECATION_WARNINGS - return 0; - } - -+#if CONFIG_SAND -+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3 / 2; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); -+ dst += width * height; -+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); -+ return 0; -+} -+ -+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); -+ dst += width * height * 2; -+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); -+ return 0; -+} -+ -+static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, -+ const AVFrame *frame) -+{ -+ const int width = av_frame_cropped_width(frame); -+ const int height = av_frame_cropped_height(frame); -+ const int x0 = frame->crop_left; -+ const int y0 = frame->crop_top; -+ const int size = width * height * 3; -+ uint8_t * dst; -+ int ret; -+ -+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) -+ return ret; -+ -+ dst = pkt->data; -+ -+ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); -+ dst += width * height * 2; -+ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); -+ return 0; -+} -+#endif -+ -+ - static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, - const AVFrame *frame, int *got_packet) - { -- int ret = av_image_get_buffer_size(frame->format, -- frame->width, frame->height, 1); -+ int ret; -+ -+#if CONFIG_SAND -+ if (av_rpi_is_sand_frame(frame)) { -+ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : -+ av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : -+ av_rpi_is_sand30_frame(frame) ? 
raw_sand30_as_yuv420(avctx, pkt, frame) : -1; -+ *got_packet = (ret == 0); -+ return ret; -+ } -+#endif - -+ ret = av_image_get_buffer_size(frame->format, -+ frame->width, frame->height, 1); - if (ret < 0) - return ret; - -diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c -new file mode 100644 -index 0000000000..58c094c5f8 ---- /dev/null -+++ b/libavcodec/rpi_hevc_cabac.c -@@ -0,0 +1,2257 @@ -+/* -+ * HEVC CABAC decoding -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#define UNCHECKED_BITSTREAM_READER 1 -+ -+#include "libavutil/attributes.h" -+#include "libavutil/common.h" -+ -+#include "cabac_functions.h" -+#include "rpi_hevc_data.h" -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+#include "rpi_hevc_cabac_fns.h" -+ -+#include "libavutil/rpi_sand_fns.h" -+ -+// BY22 is probably faster than simple bypass if the processor has -+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction -+// x86 has fast int divide -+// Arm doesn't have divide or general fast 64 bit, but does have the multiply -+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used -+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86) -+// Use native divide if we have a fast one - otherwise use mpy 1/x -+// x86 has a fast integer divide - arm doesn't - unsure about other -+// architectures -+#define USE_BY22_DIV ARCH_X86 -+ -+// Special case blocks with a single significant ceoff -+// Decreases the complexity of the code for a common case but increases the -+// code size. 
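The comment block above describes the BY22 idea in prose; the arithmetic core is a textbook reciprocal multiplication. Each inverse-table entry is I(r) = 2^40/r + 1, so ((uint64_t)low * I(r)) >> 32 approximates (low << 8) / r, and the extra << 1 in the peek yields the same (low / range) << 9 that the native-divide path computes. A standalone check of the error bound, not part of the patch; I(r) below mirrors the cabac_by22_inv_range[] initialiser defined a little further down:

/* Verifies that the scaled-reciprocal path tracks the true divide to
 * within 2 LSB for every value the bypass loop can produce (low stays
 * below range << 23), which is what makes CABAC_BY22_PEEK_BITS bits of
 * peeked quotient trustworthy. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    for (uint32_t r = 257; r <= 511; r++) {
        const uint32_t inv = (uint32_t)((0x10000000000ULL / r) + 1);
        for (uint32_t x = 0; x < (r << 23); x += 1000003) {
            uint32_t approx = ((uint32_t)(((uint64_t)x * inv) >> 32)) << 1;
            uint32_t exact  = (uint32_t)(((uint64_t)x << 9) / r);
            int64_t  d      = (int64_t)approx - (int64_t)exact;
            if (d < -1 || d > 2) {
                printf("bound broken: r=%u x=%u d=%lld\n", r, x, (long long)d);
                return 1;
            }
        }
    }
    puts("reciprocal peek stays within 2 LSB of the true divide");
    return 0;
}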
-+#define USE_N_END_1 1 -+ -+#if !USE_BY22_DIV -+// * 1/x @ 32 bits gets us 22 bits of accuracy -+#define CABAC_BY22_PEEK_BITS 22 -+#else -+// A real 32-bit divide gets us another bit -+// If we have a 64 bit int & a unit time divider then we should get a lot -+// of bits (55) but that is untested and it is unclear if it would give -+// us a large advantage -+#define CABAC_BY22_PEEK_BITS 23 -+#endif -+ -+#define CABAC_MAX_BIN 31 -+ -+ -+#if USE_BY22 && !USE_BY22_DIV -+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) -+ -+static const uint32_t cabac_by22_inv_range[256] = { -+ 0, I(257), I(258), I(259), -+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), -+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), -+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), -+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), -+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), -+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), -+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), -+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), -+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), -+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), -+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), -+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), -+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), -+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), -+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), -+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), -+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429), -+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), -+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449), -+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), -+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), -+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), -+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), -+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), -+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), -+ I(510), I(511) -+}; -+#undef I -+#endif // USE_BY22 -+ -+#if ARCH_ARM -+#include "arm/rpi_hevc_cabac.h" -+#endif -+ -+/** -+ * number of bin by SyntaxElement. 
-+ */ -+static const int8_t num_bins_in_se[] = { -+ 1, // sao_merge_flag -+ 1, // sao_type_idx -+ 0, // sao_eo_class -+ 0, // sao_band_position -+ 0, // sao_offset_abs -+ 0, // sao_offset_sign -+ 0, // end_of_slice_flag -+ 3, // split_coding_unit_flag -+ 1, // cu_transquant_bypass_flag -+ 3, // skip_flag -+ 3, // cu_qp_delta -+ 1, // pred_mode -+ 4, // part_mode -+ 0, // pcm_flag -+ 1, // prev_intra_luma_pred_mode -+ 0, // mpm_idx -+ 0, // rem_intra_luma_pred_mode -+ 2, // intra_chroma_pred_mode -+ 1, // merge_flag -+ 1, // merge_idx -+ 5, // inter_pred_idc -+ 2, // ref_idx_l0 -+ 2, // ref_idx_l1 -+ 2, // abs_mvd_greater0_flag -+ 2, // abs_mvd_greater1_flag -+ 0, // abs_mvd_minus2 -+ 0, // mvd_sign_flag -+ 1, // mvp_lx_flag -+ 1, // no_residual_data_flag -+ 3, // split_transform_flag -+ 2, // cbf_luma -+ 4, // cbf_cb, cbf_cr -+ 2, // transform_skip_flag[][] -+ 2, // explicit_rdpcm_flag[][] -+ 2, // explicit_rdpcm_dir_flag[][] -+ 18, // last_significant_coeff_x_prefix -+ 18, // last_significant_coeff_y_prefix -+ 0, // last_significant_coeff_x_suffix -+ 0, // last_significant_coeff_y_suffix -+ 4, // significant_coeff_group_flag -+ 44, // significant_coeff_flag -+ 24, // coeff_abs_level_greater1_flag -+ 6, // coeff_abs_level_greater2_flag -+ 0, // coeff_abs_level_remaining -+ 0, // coeff_sign_flag -+ 8, // log2_res_scale_abs -+ 2, // res_scale_sign_flag -+ 1, // cu_chroma_qp_offset_flag -+ 1, // cu_chroma_qp_offset_idx -+}; -+ -+/** -+ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement. -+ */ -+static const int elem_offset[sizeof(num_bins_in_se)] = { -+ 0, // sao_merge_flag -+ 1, // sao_type_idx -+ 2, // sao_eo_class -+ 2, // sao_band_position -+ 2, // sao_offset_abs -+ 2, // sao_offset_sign -+ 2, // end_of_slice_flag -+ 2, // split_coding_unit_flag -+ 5, // cu_transquant_bypass_flag -+ 6, // skip_flag -+ 9, // cu_qp_delta -+ 12, // pred_mode -+ 13, // part_mode -+ 17, // pcm_flag -+ 17, // prev_intra_luma_pred_mode -+ 18, // mpm_idx -+ 18, // rem_intra_luma_pred_mode -+ 18, // intra_chroma_pred_mode -+ 20, // merge_flag -+ 21, // merge_idx -+ 22, // inter_pred_idc -+ 27, // ref_idx_l0 -+ 29, // ref_idx_l1 -+ 31, // abs_mvd_greater0_flag -+ 33, // abs_mvd_greater1_flag -+ 35, // abs_mvd_minus2 -+ 35, // mvd_sign_flag -+ 35, // mvp_lx_flag -+ 36, // no_residual_data_flag -+ 37, // split_transform_flag -+ 40, // cbf_luma -+ 42, // cbf_cb, cbf_cr -+ 46, // transform_skip_flag[][] -+ 48, // explicit_rdpcm_flag[][] -+ 50, // explicit_rdpcm_dir_flag[][] -+ 52, // last_significant_coeff_x_prefix -+ 70, // last_significant_coeff_y_prefix -+ 88, // last_significant_coeff_x_suffix -+ 88, // last_significant_coeff_y_suffix -+ 88, // significant_coeff_group_flag -+ 92, // significant_coeff_flag -+ 136, // coeff_abs_level_greater1_flag -+ 160, // coeff_abs_level_greater2_flag -+ 166, // coeff_abs_level_remaining -+ 166, // coeff_sign_flag -+ 166, // log2_res_scale_abs -+ 174, // res_scale_sign_flag -+ 176, // cu_chroma_qp_offset_flag -+ 177, // cu_chroma_qp_offset_idx -+}; -+ -+#define CNU 154 -+/** -+ * Indexed by init_type -+ */ -+static const uint8_t init_values[3][HEVC_CONTEXTS] = { -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 200, -+ // split_coding_unit_flag -+ 139, 141, 157, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ CNU, CNU, CNU, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ CNU, -+ // part_mode -+ 184, CNU, CNU, CNU, -+ // prev_intra_luma_pred_mode -+ 184, -+ // intra_chroma_pred_mode -+ 63, 139, -+ // merge_flag -+ CNU, -+ // merge_idx -+ 
CNU, -+ // inter_pred_idc -+ CNU, CNU, CNU, CNU, CNU, -+ // ref_idx_l0 -+ CNU, CNU, -+ // ref_idx_l1 -+ CNU, CNU, -+ // abs_mvd_greater1_flag -+ CNU, CNU, -+ // abs_mvd_greater1_flag -+ CNU, CNU, -+ // mvp_lx_flag -+ CNU, -+ // no_residual_data_flag -+ CNU, -+ // split_transform_flag -+ 153, 138, 138, -+ // cbf_luma -+ 111, 141, -+ // cbf_cb, cbf_cr -+ 94, 138, 182, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, -+ 79, 108, 123, 63, -+ // last_significant_coeff_y_prefix -+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, -+ 79, 108, 123, 63, -+ // significant_coeff_group_flag -+ 91, 171, 134, 141, -+ // significant_coeff_flag -+ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, -+ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, -+ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, -+ 141, 111, -+ // coeff_abs_level_greater1_flag -+ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, -+ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, -+ // coeff_abs_level_greater2_flag -+ 138, 153, 136, 167, 152, 152, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 185, -+ // split_coding_unit_flag -+ 107, 139, 126, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ 197, 185, 201, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ 149, -+ // part_mode -+ 154, 139, 154, 154, -+ // prev_intra_luma_pred_mode -+ 154, -+ // intra_chroma_pred_mode -+ 152, 139, -+ // merge_flag -+ 110, -+ // merge_idx -+ 122, -+ // inter_pred_idc -+ 95, 79, 63, 31, 31, -+ // ref_idx_l0 -+ 153, 153, -+ // ref_idx_l1 -+ 153, 153, -+ // abs_mvd_greater1_flag -+ 140, 198, -+ // abs_mvd_greater1_flag -+ 140, 198, -+ // mvp_lx_flag -+ 168, -+ // no_residual_data_flag -+ 79, -+ // split_transform_flag -+ 124, 138, 94, -+ // cbf_luma -+ 153, 111, -+ // cbf_cb, cbf_cr -+ 149, 107, 167, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, -+ 94, 108, 123, 108, -+ // last_significant_coeff_y_prefix -+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, -+ 94, 108, 123, 108, -+ // significant_coeff_group_flag -+ 121, 140, 61, 154, -+ // significant_coeff_flag -+ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, -+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, -+ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, -+ 140, 140, -+ // coeff_abs_level_greater1_flag -+ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, -+ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, -+ // coeff_abs_level_greater2_flag -+ 107, 167, 91, 122, 107, 167, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+ { // sao_merge_flag -+ 153, -+ // sao_type_idx -+ 160, -+ // split_coding_unit_flag -+ 107, 139, 126, -+ // cu_transquant_bypass_flag -+ 154, -+ // skip_flag -+ 197, 185, 
201, -+ // cu_qp_delta -+ 154, 154, 154, -+ // pred_mode -+ 134, -+ // part_mode -+ 154, 139, 154, 154, -+ // prev_intra_luma_pred_mode -+ 183, -+ // intra_chroma_pred_mode -+ 152, 139, -+ // merge_flag -+ 154, -+ // merge_idx -+ 137, -+ // inter_pred_idc -+ 95, 79, 63, 31, 31, -+ // ref_idx_l0 -+ 153, 153, -+ // ref_idx_l1 -+ 153, 153, -+ // abs_mvd_greater1_flag -+ 169, 198, -+ // abs_mvd_greater1_flag -+ 169, 198, -+ // mvp_lx_flag -+ 168, -+ // no_residual_data_flag -+ 79, -+ // split_transform_flag -+ 224, 167, 122, -+ // cbf_luma -+ 153, 111, -+ // cbf_cb, cbf_cr -+ 149, 92, 167, 154, -+ // transform_skip_flag -+ 139, 139, -+ // explicit_rdpcm_flag -+ 139, 139, -+ // explicit_rdpcm_dir_flag -+ 139, 139, -+ // last_significant_coeff_x_prefix -+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, -+ 79, 108, 123, 93, -+ // last_significant_coeff_y_prefix -+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, -+ 79, 108, 123, 93, -+ // significant_coeff_group_flag -+ 121, 140, 61, 154, -+ // significant_coeff_flag -+ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, -+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, -+ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, -+ 140, 140, -+ // coeff_abs_level_greater1_flag -+ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, -+ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, -+ // coeff_abs_level_greater2_flag -+ 107, 167, 91, 107, 107, 167, -+ // log2_res_scale_abs -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ // res_scale_sign_flag -+ 154, 154, -+ // cu_chroma_qp_offset_flag -+ 154, -+ // cu_chroma_qp_offset_idx -+ 154, -+ }, -+}; -+ -+static const uint8_t scan_1x1[1] = { -+ 0, -+}; -+ -+static const uint8_t horiz_scan2x2_x[4] = { -+ 0, 1, 0, 1, -+}; -+ -+static const uint8_t horiz_scan2x2_y[4] = { -+ 0, 0, 1, 1 -+}; -+ -+static const uint8_t horiz_scan4x4_x[16] = { -+ 0, 1, 2, 3, -+ 0, 1, 2, 3, -+ 0, 1, 2, 3, -+ 0, 1, 2, 3, -+}; -+ -+static const uint8_t horiz_scan4x4_y[16] = { -+ 0, 0, 0, 0, -+ 1, 1, 1, 1, -+ 2, 2, 2, 2, -+ 3, 3, 3, 3, -+}; -+ -+static const uint8_t horiz_scan8x8_inv[8][8] = { -+ { 0, 1, 2, 3, 16, 17, 18, 19, }, -+ { 4, 5, 6, 7, 20, 21, 22, 23, }, -+ { 8, 9, 10, 11, 24, 25, 26, 27, }, -+ { 12, 13, 14, 15, 28, 29, 30, 31, }, -+ { 32, 33, 34, 35, 48, 49, 50, 51, }, -+ { 36, 37, 38, 39, 52, 53, 54, 55, }, -+ { 40, 41, 42, 43, 56, 57, 58, 59, }, -+ { 44, 45, 46, 47, 60, 61, 62, 63, }, -+}; -+ -+static const uint8_t diag_scan2x2_x[4] = { -+ 0, 0, 1, 1, -+}; -+ -+static const uint8_t diag_scan2x2_y[4] = { -+ 0, 1, 0, 1, -+}; -+ -+static const uint8_t diag_scan2x2_inv[2][2] = { -+ { 0, 2, }, -+ { 1, 3, }, -+}; -+ -+static const uint8_t diag_scan4x4_inv[4][4] = { -+ { 0, 2, 5, 9, }, -+ { 1, 4, 8, 12, }, -+ { 3, 7, 11, 14, }, -+ { 6, 10, 13, 15, }, -+}; -+ -+static const uint8_t diag_scan8x8_inv[8][8] = { -+ { 0, 2, 5, 9, 14, 20, 27, 35, }, -+ { 1, 4, 8, 13, 19, 26, 34, 42, }, -+ { 3, 7, 12, 18, 25, 33, 41, 48, }, -+ { 6, 11, 17, 24, 32, 40, 47, 53, }, -+ { 10, 16, 23, 31, 39, 46, 52, 57, }, -+ { 15, 22, 30, 38, 45, 51, 56, 60, }, -+ { 21, 29, 37, 44, 50, 55, 59, 62, }, -+ { 28, 36, 43, 49, 54, 58, 61, 63, }, -+}; -+ -+ -+typedef struct -+{ -+ uint16_t coeff; -+ uint16_t scale; -+} xy_off_t; -+ -+#define XYT_C(x,y,t) ((x) + ((y) << (t))) -+#define SCALE_TRAFO(t) ((t) > 3 ? 
3 : (t)) -+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) -+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) -+ -+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} -+ -+#define OFF_DIAG(t) {\ -+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ -+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ -+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ -+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ -+} -+ -+#define OFF_HORIZ(t) {\ -+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ -+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ -+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ -+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ -+} -+ -+#define OFF_VERT(t) {\ -+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ -+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ -+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ -+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ -+} -+ -+static const xy_off_t off_xys[3][4][16] = -+{ -+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, -+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, -+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} -+}; -+ -+ -+// Helper fns -+#ifndef hevc_mem_bits32 -+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) -+{ -+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); -+} -+#endif -+ -+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) -+#define hevc_clz32 hevc_clz32_builtin -+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) -+{ -+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long -+ return __builtin_clz(x) - (sizeof(int) * 8 - 32); -+} -+#endif -+ -+// It is unlikely that we will ever need this but include for completeness -+#ifndef hevc_clz32 -+static inline unsigned int hevc_clz32(unsigned int x) -+{ -+ unsigned int n = 1; -+ if ((x & 0xffff0000) == 0) { -+ n += 16; -+ x <<= 16; -+ } -+ if ((x & 0xff000000) == 0) { -+ n += 8; -+ x <<= 8; -+ } -+ if ((x & 0xf0000000) == 0) { -+ n += 4; -+ x <<= 4; -+ } -+ if ((x & 0xc0000000) == 0) { -+ n += 2; -+ x <<= 2; -+ } -+ return n - ((x >> 31) & 1); -+} -+#endif -+ -+static inline int cabac_overflow(const CABACContext * const cc) -+{ -+ av_assert0(cc->bytestream >= cc->bytestream_start); -+ return cc->bytestream >= cc->bytestream_end + 4; -+} -+ -+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc) -+{ -+ return cabac_overflow(&lc->cc); -+} -+ -+#if !USE_BY22 -+// If no by22 then _by22 functions will revert to normal and so _peek/_flush -+// will no longer be called but the setup calls will still exist and we want -+// to null them out -+#define bypass_start(s) -+#define bypass_finish(s) -+#else -+// Use BY22 for residual bypass block -+ -+#define bypass_start(cc) get_cabac_by22_start(cc) -+#define bypass_finish(cc) get_cabac_by22_finish(cc) -+ -+// BY22 notes that bypass is simply a divide into the bitstream and so we -+// can peek out large quantities of bits at once and treat the result as if -+// it was VLC. 
In many cases this will lead to O(1) processing rather than -+// O(n) though the setup and teardown is sufficiently expensive that it is -+// only worth using if we expect to be dealing with more than a few bits -+// The definition of "a few bits" will vary from platform to platform but -+// tests on ARM show that it probably isn't worth it for a single coded -+// residual, but is for >1 - it also seems likely that if there are -+// more residuals then they are likely to be bigger and this will make the -+// O(1) nature of the code more worthwhile. -+ -+ -+// Bypass block start -+// Must be called before _by22_peek is used as it sets the CABAC environment -+// into the correct state. _by22_finish must be called to return to 'normal' -+// (i.e. non-bypass) cabac decoding -+#ifndef get_cabac_by22_start -+static inline void get_cabac_by22_start(CABACContext * const c) -+{ -+ const unsigned int bits = __builtin_ctz(c->low); -+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); -+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); -+#if !USE_BY22_DIV -+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; -+#endif -+ -+ c->bytestream -= (CABAC_BITS / 8); -+ c->by22.bits = bits; -+#if !USE_BY22_DIV -+ c->by22.range = c->range; -+ c->range = inv; -+#endif -+ c->low = x; -+} -+#endif -+ -+// Bypass block finish -+// Must be called at the end of the bypass block to return to normal operation -+static inline void get_cabac_by22_finish(CABACContext * const c) -+{ -+ unsigned int used = c->by22.bits; -+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); -+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7); -+ -+ c->bytestream += bytes_used + (CABAC_BITS / 8); -+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used; -+#if !USE_BY22_DIV -+ c->range = c->by22.range; -+#endif -+} -+ -+// Peek bypass bits -+// _by22_start must be called before _by22_peek is called and _by22_flush -+// must be called afterwards to flush any used bits -+// The actual number of valid bits returned is -+// min(, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS -+// will be at least 22 which should be long enough for any prefix or suffix -+// though probably not long enough for the worst case combination -+#ifndef get_cabac_by22_peek -+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) -+{ -+#if USE_BY22_DIV -+ return ((unsigned int)c->low / (unsigned int)c->range) << 9; -+#else -+ uint32_t x = c->low & ~1U; -+ const uint32_t inv = c->range; -+ -+ if (inv != 0) -+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); -+ -+ return x << 1; -+#endif -+} -+#endif -+ -+// Flush bypass bits peeked by _by22_peek -+// Flush n bypass bits. 
n must be >= 1 to guarantee correct operation -+// val is an unmodified copy of whatever _by22_peek returned -+#ifndef get_cabac_by22_flush -+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) -+{ -+ // Subtract the bits used & reshift up to the top of the word -+#if USE_BY22_DIV -+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); -+#else -+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); -+#endif -+ -+ // and refill lower bits -+ // We will probably OR over some existing bits but that doesn't matter -+ c->by22.bits += n; -+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); -+} -+#endif -+ -+#endif // USE_BY22 -+ -+ -+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc) -+{ -+ memcpy(s->cabac_save->rice, lc->stat_coeff, 4); -+ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS); -+} -+ -+static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ memcpy(lc->stat_coeff, s->cabac_save->rice, 4); -+ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS); -+} -+ -+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc) -+{ -+ GetBitContext * const gb = &lc->gb; -+ skip_bits(gb, 1); -+ align_get_bits(gb); -+ return ff_init_cabac_decoder(&lc->cc, -+ gb->buffer + get_bits_count(gb) / 8, -+ (get_bits_left(gb) + 7) / 8); -+} -+ -+static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int init_type = 2 - s->sh.slice_type; -+ int i; -+ -+ if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) -+ init_type ^= 3; -+ -+ for (i = 0; i < HEVC_CONTEXTS; i++) { -+ int init_value = init_values[init_type][i]; -+ int m = (init_value >> 4) * 5 - 45; -+ int n = ((init_value & 15) << 3) - 16; -+ int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127; -+ -+ pre ^= pre >> 31; -+ if (pre > 124) -+ pre = 124 + (pre & 1); -+ lc->cabac_state[i] = pre; -+ } -+ -+ for (i = 0; i < 4; i++) -+ lc->stat_coeff[i] = 0; -+} -+ -+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags) -+{ -+ if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0) -+ { -+ lc->qPy_pred = s->sh.slice_qp; -+ cabac_init_state(s, lc); -+ } -+ else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0) -+ { -+ lc->qPy_pred = s->sh.slice_qp; -+ load_states(s, lc); -+ } -+ lc->cabac_init_req = 0; -+} -+ -+#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) -+ -+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state) -+{ -+ return get_cabac_inline(c, state); -+} -+ -+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c) -+{ -+ return get_cabac_terminate(c); -+} -+ -+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc) -+{ -+ if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX])) -+ return 0; -+ -+ if (!get_cabac_bypass(&lc->cc)) -+ return SAO_BAND; -+ return SAO_EDGE; -+} -+ -+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 0; i < 4; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int i = 0; -+ int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; -+ -+ while (i < length 
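cabac_init_state() above is the HEVC context-initialisation formula: the slope m and offset n are unpacked from the 8-bit init value, and pre folds down to a 6-bit probability state plus MPS bit. A standalone worked example, not part of the patch, showing why CNU (154) lands on the same near-equiprobable state at every slice QP:

#include <stdio.h>

static int init_state(int init_value, int qp)
{
    int m   = (init_value >> 4) * 5 - 45;    /* slope */
    int n   = ((init_value & 15) << 3) - 16; /* offset */
    int pre = 2 * (((m * qp) >> 4) + n) - 127;
    pre ^= pre >> 31;                        /* map negative pre to -pre - 1 */
    if (pre > 124)
        pre = 124 + (pre & 1);               /* clamp, keeping the MPS bit */
    return pre;                              /* 6-bit pState + MPS bit */
}

int main(void)
{
    /* 154 -> m = 0, n = 64, so pre = 2*64 - 127 = 1 regardless of QP */
    for (int qp = 0; qp <= 51; qp += 17)
        printf("CNU qp=%2d -> state %d\n", qp, init_state(154, qp));
    return 0;
}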
&& get_cabac_bypass(&lc->cc)) -+ i++; -+ return i; -+} -+ -+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc) -+{ -+ return get_cabac_bypass(&lc->cc); -+} -+ -+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret = get_cabac_bypass(&lc->cc) << 1; -+ ret |= get_cabac_bypass(&lc->cc); -+ return ret; -+} -+ -+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc) -+{ -+ int val = 1; -+ -+ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0) -+ return 0; -+ -+ while (val < 5 && -+ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0) -+ val++; -+ -+ if (val >= 5) { -+ unsigned int k = 0; -+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { -+ val += 1 << k; -+ k++; -+ } -+// if (k == CABAC_MAX_BIN) -+// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); -+ -+ while (k--) -+ val += get_cabac_bypass(&lc->cc) << k; -+ } -+ return get_cabac_bypass(&lc->cc) ? -val : val; -+} -+ -+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1); -+ int i = 0; -+ -+ while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) -+ i++; -+ -+ return i; -+} -+ -+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size) -+{ -+ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1 -+ return PART_2Nx2N; -+ if (log2_cb_size == s->ps.sps->log2_min_cb_size) { -+ if (lc->cu.pred_mode == MODE_INTRA) // 0 -+ return PART_NxN; -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 -+ return PART_2NxN; -+ if (log2_cb_size == 3) // 00 -+ return PART_Nx2N; -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001 -+ return PART_Nx2N; -+ return PART_NxN; // 000 -+ } -+ -+ if (!s->ps.sps->amp_enabled_flag) { -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 -+ return PART_2NxN; -+ return PART_Nx2N; -+ } -+ -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011 -+ return PART_2NxN; -+ if (get_cabac_bypass(&lc->cc)) // 0101 -+ return PART_2NxnD; -+ return PART_2NxnU; // 0100 -+ } -+ -+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001 -+ return PART_Nx2N; -+ if (get_cabac_bypass(&lc->cc)) // 0001 -+ return PART_nRx2N; -+ return PART_nLx2N; // 0000 -+} -+ -+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i = 0; -+ while (i < 2 && get_cabac_bypass(&lc->cc)) -+ i++; -+ return i; -+} -+ -+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ int i; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 0; i < 4; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret; -+ if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) -+ return 4; -+ -+ ret = get_cabac_bypass(&lc->cc) << 1; -+ ret |= get_cabac_bypass(&lc->cc); -+ return ret; -+} -+ -+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ int i = GET_CABAC_LC(elem_offset[MERGE_IDX]); -+ -+ if (i != 0) { -+ while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc)) -+ i++; -+ } -+ return i; -+} -+ -+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH) -+{ -+ if (nPbW + nPbH == 12) -+ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); -+ if 
(GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth)) -+ return PRED_BI; -+ -+ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); -+} -+ -+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx) -+{ -+ int i = 0; -+ int max = num_ref_idx_lx - 1; -+ int max_ctx = FFMIN(max, 2); -+ -+ while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i)) -+ i++; -+ if (i == 2) { -+ while (i < max && get_cabac_bypass(&lc->cc)) -+ i++; -+ } -+ -+ return i; -+} -+ -+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]); -+} -+ -+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); -+} -+ -+#if !USE_BY22 -+static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc) -+{ -+ int ret = 2; -+ int k = 1; -+ -+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { -+ ret += 1U << k; -+ k++; -+ } -+ if (k == CABAC_MAX_BIN) { -+ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); -+ return 0; -+ } -+ -+ while (k--) -+ ret += get_cabac_bypass(&lc->cc) << k; -+ return get_cabac_bypass_sign(&lc->cc, -ret); -+} -+#endif -+ -+static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return get_cabac_bypass_sign(&lc->cc, -1); -+} -+ -+static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); -+} -+ -+static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); -+} -+ -+static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) -+{ -+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); -+} -+ -+ -+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) { -+ int i =0; -+ -+ while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) -+ i++; -+ -+ return i; -+} -+ -+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, -+ int log2_size, int *last_scx_prefix, int *last_scy_prefix) -+{ -+ int i = 0; -+ int max = (log2_size << 1) - 1; -+ int ctx_offset, ctx_shift; -+ -+ if (!c_idx_nz) { -+ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); -+ ctx_shift = (log2_size + 1) >> 2; -+ } else { -+ ctx_offset = 15; -+ ctx_shift = log2_size - 2; -+ } -+ while (i < max && -+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) -+ i++; -+ *last_scx_prefix = i; -+ -+ i = 0; -+ while (i < max && -+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) -+ i++; -+ *last_scy_prefix = i; -+} -+ -+static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc, -+ int last_significant_coeff_prefix) -+{ -+ int i; -+ int length = (last_significant_coeff_prefix >> 1) - 1; -+ int value = get_cabac_bypass(&lc->cc); -+ -+ for (i = 1; i < length; i++) -+ value = (value << 1) | get_cabac_bypass(&lc->cc); -+ return value; -+} -+ -+static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg) -+{ -+ int inc; -+ -+ inc = (ctx_cg != 0) + (c_idx_nz << 1); -+ -+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); -+} -+ -+static 
av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset) -+{ -+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); -+} -+ -+#if !USE_BY22 -+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) -+#endif -+ -+ -+#ifndef coeff_abs_level_remaining_decode_bypass -+static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param) -+{ -+ uint32_t y; -+ unsigned int prefix; -+ unsigned int last_coeff_abs_level_remaining; -+ unsigned int n; -+ -+ y = get_cabac_by22_peek(c); -+ prefix = hevc_clz32(~y); -+ // y << prefix will always have top bit 0 -+ -+ if (prefix < 3) { -+ const unsigned int suffix = (y << prefix) >> (31 - rice_param); -+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; -+ n = prefix + 1 + rice_param; -+ } -+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) -+ { -+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); -+ -+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; -+ n = prefix * 2 + rice_param - 2; -+ } -+ else { -+ unsigned int suffix; -+ -+ get_cabac_by22_flush(c, prefix, y); -+ y = get_cabac_by22_peek(c); -+ -+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); -+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; -+ n = prefix + rice_param - 2; -+ } -+ -+ get_cabac_by22_flush(c, n, y); -+ -+ return last_coeff_abs_level_remaining; -+} -+#endif -+ -+static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param) -+{ -+ int prefix = 0; -+ int suffix = 0; -+ int last_coeff_abs_level_remaining; -+ int i; -+ -+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) -+ prefix++; -+ if (prefix == CABAC_MAX_BIN) { -+// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); -+ return 0; -+ } -+ -+ if (prefix < 3) { -+ for (i = 0; i < rc_rice_param; i++) -+ suffix = (suffix << 1) | get_cabac_bypass(c); -+ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; -+ } else { -+ int prefix_minus3 = prefix - 3; -+ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) -+ suffix = (suffix << 1) | get_cabac_bypass(c); -+ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) -+ << rc_rice_param) + suffix; -+ } -+ -+ return last_coeff_abs_level_remaining; -+} -+ -+#if !USE_BY22 -+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode -+static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb) -+{ -+ unsigned int i; -+ uint32_t ret = 0; -+ -+ for (i = 0; i < nb; i++) -+ ret = (ret << 1) | get_cabac_bypass(c); -+ -+ return ret << (32 - nb); -+} -+#endif -+ -+#ifndef coeff_sign_flag_decode_bypass -+static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb) -+{ -+ uint32_t y; -+ y = get_cabac_by22_peek(c); -+ get_cabac_by22_flush(c, nb, y); -+ return y & ~(0xffffffffU >> nb); -+} -+#endif -+ -+ -+#ifndef get_cabac_greater1_bits -+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, -+ uint8_t * const state0) -+{ -+ unsigned int i; -+ unsigned int rv = 0; -+ for (i = 0; i != n; ++i) { -+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; -+ const unsigned int b = get_cabac(c, state0 + idx); -+ rv = (rv << 1) | b; -+ } -+ return rv; -+} -+#endif -+ -+ -+// N.B. levels returned are the values assuming coeff_abs_level_remaining -+// is uncoded, so 1 must be added if it is coded. 
sum_abs also reflects -+// this version of events. -+static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels, -+ int * const pprev_subset_coded, int * const psum, -+ const unsigned int idx0_gt1, const unsigned int idx_gt2) -+{ -+ CABACContext * const c = &lc->cc; -+ uint8_t * const state0 = lc->cabac_state + idx0_gt1; -+ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2; -+ unsigned int rv; -+ unsigned int i; -+ const unsigned int n = FFMIN(n_end, 8); -+ -+ // Really this is i != n but the simple unconditional loop is cheaper -+ // and faster -+ for (i = 0; i != 8; ++i) -+ levels[i] = 1; -+ -+ rv = get_cabac_greater1_bits(c, n, state0); -+ -+ *pprev_subset_coded = 0; -+ *psum = n; -+ -+ rv <<= (32 - n); -+ if (rv != 0) -+ { -+ *pprev_subset_coded = 1; -+ *psum = n + 1; -+ i = hevc_clz32(rv); -+ levels[i] = 2; -+ if (get_cabac(c, state_gt2) == 0) -+ { -+ // Unset first coded bit -+ rv &= ~(0x80000000U >> i); -+ } -+ } -+ -+ if (n_end > 8) { -+ const unsigned int g8 = n_end - 8; -+ rv |= ((1 << g8) - 1) << (24 - g8); -+ for (i = 0; i != g8; ++i) { -+ levels[i + 8] = 0; -+ } -+ } -+ -+ return rv; -+} -+ -+// extended_precision_processing_flag must be false given we are -+// putting the result into a 16-bit array -+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) -+// scale_m is uint8_t -+// -+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12) -+// or it can be 2 (if we have transquant_bypass) -+// shift is set to one less than we really want but would normally be -+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? -+// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6 -+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient) -+// to achieve it -+ -+#ifndef trans_scale_sat -+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) -+{ -+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); -+} -+#endif -+ -+ -+#ifndef update_rice -+static inline void update_rice(uint8_t * const stat_coeff, -+ const unsigned int last_coeff_abs_level_remaining, -+ const unsigned int c_rice_param) -+{ -+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param; -+ if (x >= 6) -+ (*stat_coeff)++; -+ else if (x == 0 && *stat_coeff > 0) -+ (*stat_coeff)--; -+} -+#endif -+ -+ -+// n must be > 0 on entry -+#ifndef get_cabac_sig_coeff_flag_idxs -+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, -+ unsigned int n, -+ const uint8_t const * ctx_map, -+ uint8_t * p) -+{ -+ do { -+ if (get_cabac(c, state0 + ctx_map[n])) -+ *p++ = n; -+ } while (--n != 0); -+ return p; -+} -+#endif -+ -+ -+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, -+ unsigned int n, -+ const uint8_t * ctx_map, // const ptr here but not in asm -+ uint8_t * const flag_idx) -+{ -+ int rv; -+ -+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx; -+ -+ return rv; -+} -+ -+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ -+ x0, x1, x2, x3,\ -+ x4, x5, x6, x7,\ -+ x8, x9, x10, x11,\ -+ x12, x13, x14, x15} -+ -+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ -+ x0, x4, x8, x12,\ -+ x1, x5, x9,
x13,\ -+ x2, x6, x10, x14,\ -+ x3, x7, x11, x15} -+ -+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ -+ x0, x4, x1, x8,\ -+ x5, x2, x12, x9,\ -+ x6, x3, x13, x10,\ -+ x7, x14, x11, x15} -+ -+ -+static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz, -+ uint8_t * const significant_coeff_group_flag, -+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, -+ int * const pPrev_sig) -+{ -+ while (--i >= 0) { -+ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; -+ const unsigned int x_cg = scan_x_cg[i]; -+ -+ // For the flag decode we only care about Z/NZ but -+ // we use the full Right * 2 + Down when calculating -+ // significant coeff flags so we obtain it here. -+ // -+ // The group flag array is one longer than it needs to -+ // be so we don't need to check for y_cg limits -+ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); -+ -+ if (i == 0 || -+ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig)) -+ { -+ gf_y[0] |= (1 << x_cg); -+ *pPrev_sig = prev_sig; -+ break; -+ } -+ } -+ -+ return i; -+} -+ -+static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb, -+ const unsigned int log2_trafo_size, const unsigned int c_idx, -+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) -+{ -+ const AVFrame * const frame = s->frame; -+ const unsigned int stride = frame_stride1(s->frame, c_idx); -+ const unsigned int x = x0 >> ctx_hshift(s, c_idx); -+ const unsigned int y = y0 >> ctx_vshift(s, c_idx); -+ const int is_sliced = 1; // av_rpi_is_sand_frame(frame); -+ uint8_t * const dst = !is_sliced ? -+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(frame, x, y) : -+ av_rpi_sand_frame_pos_c(frame, x, y); -+ -+ const unsigned int i = jb->intra.n; -+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; -+ -+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && -+ pc->ta.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->ta.stride == stride); -+ -+ pc->type = RPI_PRED_ADD_RESIDUAL_C; -+ } -+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && -+ pc->dc.dst == dst) -+ { -+ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->dc.stride == stride); -+ -+ // Rewrite as add residual - must rewrite all fields as different union member -+ pc->type = RPI_PRED_ADD_RESIDUAL_V; -+ pc->ta.buf = coeffs; -+ pc->ta.dst = dst; -+ pc->ta.stride = stride; -+ pc->ta.dc = dc; -+ } -+ else -+ { -+ HEVCPredCmd * const cmd = pc + 1; -+ jb->intra.n = i + 1; -+ -+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); -+ cmd->size = log2_trafo_size; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; -+ cmd->ta.dc = 0; -+ } -+} -+ -+ -+static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const unsigned int log2_trafo_size, const unsigned int c_idx, -+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) -+{ -+ const AVFrame * const frame = s->frame; -+ const unsigned int stride = frame_stride1(s->frame, c_idx); -+ const unsigned int x = x0 >> ctx_hshift(s, c_idx); -+ const unsigned int y = y0 >> ctx_vshift(s, c_idx); -+ const int is_sliced = 1; -+ uint8_t * const dst = !is_sliced ? 
-+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(frame, x, y) : -+ av_rpi_sand_frame_pos_c(frame, x, y); -+ -+ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); -+ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); -+ -+ const unsigned int i = jb->intra.n; -+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; -+ -+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && -+ pc->ta.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->ta.stride == stride); -+ -+ pc->ta.dc = (int16_t)coeff; -+ } -+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && -+ pc->dc.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->dc.stride == stride && -+ (pc->dc.dc & ~0xffff) == 0); -+ -+ pc->dc.dc |= (coeff << 16); -+ } -+ else -+ { -+ HEVCPredCmd * const cmd = pc + 1; -+ jb->intra.n = i + 1; -+ -+ cmd->type = RPI_PRED_ADD_DC + c_idx; -+ cmd->size = log2_trafo_size; -+ cmd->dc.dst = dst; -+ cmd->dc.stride = stride; -+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; -+ } -+} -+ -+ -+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, -+ const int log2_trafo_size, const enum ScanType scan_idx, -+ const int c_idx) -+{ -+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; -+ -+ int last_significant_coeff_x, last_significant_coeff_y; -+ int num_coeff = 0; -+ int prev_subset_coded = 0; -+ -+ int num_last_subset; -+ int x_cg_last_sig, y_cg_last_sig; -+ -+ const uint8_t *scan_x_cg, *scan_y_cg; -+ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; -+ -+ int use_vpu; -+#if RPI_COMPRESS_COEFFS -+ int num_nonzero = 0; -+ int use_compress = 0; -+ int *coeffs32; -+#endif -+ int use_dc = 0; -+ int16_t *coeffs; -+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero -+ int explicit_rdpcm_flag = 0; -+ int explicit_rdpcm_dir_flag; -+ -+ int i; -+ int shift,scale; -+ const uint8_t *scale_matrix = NULL; -+ uint8_t dc_scale; -+ const int c_idx_nz = (c_idx != 0); -+ const int pred_mode_intra = c_idx_nz ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ int prev_sig = 0; -+ int may_hide_sign; -+ -+ int16_t dummy_coeffs[16]; -+ -+ // Derive QP for dequant -+ if (!lc->cu.cu_transquant_bypass_flag) { -+ may_hide_sign = s->ps.pps->sign_data_hiding_flag; -+ -+ if (s->ps.pps->transform_skip_enabled_flag && -+ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { -+ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz); -+ if (transform_skip_flag) { -+ trans_skip_or_bypass = 1; -+ if (lc->cu.pred_mode == MODE_INTRA && -+ s->ps.sps->implicit_rdpcm_enabled_flag && -+ (pred_mode_intra == 10 || pred_mode_intra == 26)) { -+ may_hide_sign = 0; -+ } -+ } -+ } -+ -+ { -+ static const uint8_t level_scale[8] = { -+ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8 -+ }; -+ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y]; -+ -+ // Shift is set to one less than will actually occur as the scale -+ // and saturate step adds 1 and then shifts right again -+ scale = level_scale[qp6 & 7]; -+// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3); -+ shift = log2_trafo_size - (qp6 >> 3); -+ -+ if (shift < 0) { -+ scale <<= -shift; -+ shift = 0; -+ } -+ } -+ -+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { -+ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ? -+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; -+ const unsigned int matrix_id = -+ lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx; -+ -+ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; -+ dc_scale = scale_matrix[0]; -+ if (log2_trafo_size >= 4) -+ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; -+ } -+ else -+ { -+ static const uint8_t sixteen_scale[64] = { -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16, -+ 16, 16, 16, 16, 16, 16, 16, 16 -+ }; -+ scale_matrix = sixteen_scale; -+ dc_scale = 16; -+ } -+ } else { -+ static const uint8_t unit_scale[64] = { -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ 1, 1, 1, 1, 1, 1, 1, 1, -+ }; -+ scale_matrix = unit_scale; -+ shift = 0; -+ scale = 2; // We will shift right to kill this -+ dc_scale = 1; -+ -+ may_hide_sign = 0; -+ } -+ -+ -+ -+ -+ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && -+ trans_skip_or_bypass) { -+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz); -+ if (explicit_rdpcm_flag) { -+ may_hide_sign = 0; -+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz); -+ } -+ } -+ -+ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size, -+ &last_significant_coeff_x, &last_significant_coeff_y); -+ -+ if (last_significant_coeff_x > 3) { -+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x); -+ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * -+ (2 + (last_significant_coeff_x & 1)) + -+ suffix; -+ } -+ -+ if (last_significant_coeff_y > 3) { -+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y); -+ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * -+ (2 + (last_significant_coeff_y & 1)) + -+ suffix; -+ } -+ -+ if (scan_idx == SCAN_VERT) -+ FFSWAP(int, 
last_significant_coeff_x, last_significant_coeff_y); -+ -+ x_cg_last_sig = last_significant_coeff_x >> 2; -+ y_cg_last_sig = last_significant_coeff_y >> 2; -+ -+ switch (scan_idx) { -+ case SCAN_DIAG: { -+ int last_x_c = last_significant_coeff_x & 3; -+ int last_y_c = last_significant_coeff_y & 3; -+ -+ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; -+ -+ switch (log2_trafo_size) { -+ case 2: -+ scan_x_cg = scan_1x1; -+ scan_y_cg = scan_1x1; -+ break; -+ case 3: -+ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = diag_scan2x2_x; -+ scan_y_cg = diag_scan2x2_y; -+ break; -+ case 4: -+ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x; -+ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y; -+ break; -+ case 5: -+ default: -+ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; -+ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x; -+ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y; -+ break; -+ } -+ break; -+ } -+ case SCAN_HORIZ: -+ scan_x_cg = horiz_scan2x2_x; -+ scan_y_cg = horiz_scan2x2_y; -+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; -+ break; -+ default: //SCAN_VERT -+ scan_x_cg = horiz_scan2x2_y; -+ scan_y_cg = horiz_scan2x2_x; -+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; -+ break; -+ } -+ num_coeff++; -+ num_last_subset = (num_coeff - 1) >> 4; -+ -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant -+ -+ { -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing -+ use_vpu = 0; -+ use_dc = (num_coeff == 1) && !special && -+ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); -+ -+ if (use_dc) { -+ // Just need a little empty space -+ coeffs = dummy_coeffs; -+ // No need to clear -+ } -+ else -+ { -+ use_vpu = !special && log2_trafo_size >= 4; -+#if RPI_COMPRESS_COEFFS -+ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed; -+#endif -+ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount); -+#if RPI_COMPRESS_COEFFS -+ coeffs32 = (int*)coeffs; -+ if (!use_compress) -+#endif -+#if HAVE_NEON -+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); -+#else -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+#endif -+ } -+ } -+ -+ i = num_last_subset; -+ do { -+ int implicit_non_zero_coeff = 0; -+ int n_end; -+ -+ uint8_t significant_coeff_flag_idx[16]; -+ unsigned int nb_significant_coeff_flag = 0; -+ -+ if (i == num_last_subset) { -+ // First time through -+ int last_scan_pos = num_coeff - (i << 4) - 1; -+ n_end = last_scan_pos - 1; -+ significant_coeff_flag_idx[0] = last_scan_pos; -+ nb_significant_coeff_flag = 1; -+ } else { -+ n_end = 15; -+ implicit_non_zero_coeff = (i != 0); -+ } -+ -+ if (n_end >= 0) { -+ static const uint8_t ctx_idx_maps_ts2[3][16] = { -+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 -+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 -+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 -+ }; -+ // N.B. 
prev_sig = Right * 2 + Down -+ static const uint8_t ctx_idx_maps[3][4][16] = { -+ { -+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ }, -+ { -+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ }, -+ { -+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 -+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 -+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default -+ } -+ }; -+ const uint8_t *ctx_idx_map_p; -+ int scf_offset = 0; -+ -+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { -+ ctx_idx_map_p = ctx_idx_maps[0][3]; -+ scf_offset = 40 + c_idx_nz; -+ } else { -+ if (c_idx_nz != 0) -+ scf_offset = 27; -+ -+ if (log2_trafo_size == 2) { -+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; -+ } else { -+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; -+ if (!c_idx_nz) { -+ if (i != 0) -+ scf_offset += 3; -+ -+ if (log2_trafo_size == 3) { -+ scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; -+ } else { -+ scf_offset += 21; -+ } -+ } else { -+ if (log2_trafo_size == 3) -+ scf_offset += 9; -+ else -+ scf_offset += 12; -+ } -+ } -+ } -+ -+ if (n_end > 0) { -+ int cnt = get_sig_coeff_flag_idxs(&lc->cc, -+ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, -+ n_end, ctx_idx_map_p, -+ significant_coeff_flag_idx + nb_significant_coeff_flag); -+ -+ nb_significant_coeff_flag += cnt; -+ if (cnt != 0) { -+ implicit_non_zero_coeff = 0; -+ } -+ } -+ -+ if (implicit_non_zero_coeff == 0) { -+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { -+ scf_offset = 42 + c_idx_nz; -+ } else { -+ if (i == 0) { -+ scf_offset = c_idx_nz ? 27 : 0; -+ } else { -+ scf_offset = 2 + scf_offset; -+ } -+ } -+ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) { -+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; -+ nb_significant_coeff_flag++; -+ } -+ } else { -+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; -+ nb_significant_coeff_flag++; -+ } -+ } -+#if RPI_COMPRESS_COEFFS -+ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full! -+ int16_t temp[32*32]; -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0; -+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer -+ memcpy(temp, coeffs, sizeof(int)*num_nonzero); -+ coeffs32 = (int *)temp; -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+ num_nonzero--; -+ while (num_nonzero >= 0) { -+ const unsigned int res = coeffs32[num_nonzero]; -+ const unsigned int offset = res & 0xffff; -+ coeffs[ offset ] = res >> 16; -+ num_nonzero--; -+ } -+ use_compress = 0; -+ } -+#endif -+ -+ if (nb_significant_coeff_flag != 0) { -+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | -+ ((i != 0 && !c_idx_nz) ? 
2 : 0) | -+ prev_subset_coded; -+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + -+ (gt1_idx_delta << 2); -+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + -+ gt1_idx_delta; -+ -+ const unsigned int x_cg = scan_x_cg[i]; -+ const unsigned int y_cg = scan_y_cg[i]; -+ int16_t * const blk_coeffs = coeffs + -+ ((x_cg + (y_cg << log2_trafo_size)) << 2); -+ // This calculation is 'wrong' for log2_trafo_size == 2 -+ // but that doesn't matter as in this case x_cg & y_cg -+ // are always 0 so result is correct (0) anyway -+ const uint8_t * const blk_scale = scale_matrix + -+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size))); -+ -+ // * The following code block doesn't deal with these flags: -+ // (nor did the one it replaces) -+ // -+ // cabac_bypass_alignment_enabled_flag -+ // This should be easy but I can't find a test case -+ // extended_precision_processing_flag -+ // This can extend the required precision past 16 bits -+ // so is probably tricky - also no example found yet -+ -+#if USE_N_END_1 -+ if (nb_significant_coeff_flag == 1) { -+ // There is a small gain to be had from special casing the single -+ // transform coefficient case. The reduction in complexity -+ // makes up for the code duplication. -+ -+ int trans_coeff_level = 1; -+ int coeff_sign_flag; -+ int coded_val = 0; -+ -+ // initialize first elem of coeff_abs_level_greater1_flag -+ prev_subset_coded = 0; -+ -+ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) { -+ trans_coeff_level = 2; -+ prev_subset_coded = 1; -+ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2); -+ } -+ -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&lc->cc); -+ -+ if (coded_val) -+ { -+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { -+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0); -+ } else { -+ uint8_t * const stat_coeff = -+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); -+ const unsigned int c_rice_param = *stat_coeff >> 2; -+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param); -+ -+ trans_coeff_level = 3 + last_coeff_abs_level_remaining; -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ } -+ } -+ -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; -+ const int res = trans_scale_sat( -+ (trans_coeff_level ^ k) - k, // Apply sign -+ scale, -+ i == 0 && xy_off->coeff == 0 ?
dc_scale : scale_m, -+ shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) -+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); -+ else -+#endif -+ blk_coeffs[xy_off->coeff] = res; -+ } -+ } -+ else -+#endif -+ { -+ int sign_hidden = may_hide_sign; -+ int levels[16]; // Should be able to get away with int16_t but that fails some tests -+ uint32_t coeff_sign_flags; -+ uint32_t coded_vals = 0; -+ // Sum(abs(level[])) -+ // In fact we only need the bottom bit and in some future -+ // version that may be all we calculate -+ unsigned int sum_abs; -+ -+ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels, -+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); -+ -+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) -+ sign_hidden = 0; -+ -+ // -- Start bypass block -+ -+ bypass_start(&lc->cc); -+ -+ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden); -+ -+ if (coded_vals != 0) -+ { -+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; -+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : -+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); -+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2; -+ int * level = levels - 1; -+ -+ do { -+ { -+ const unsigned int z = hevc_clz32(coded_vals) + 1; -+ level += z; -+ coded_vals <<= z; -+ } -+ -+ { -+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param); -+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; -+ -+ sum_abs += last_coeff_abs_level_remaining + 1; -+ *level = trans_coeff_level; -+ -+ if (stat_coeff != NULL) -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ stat_coeff = NULL; -+ -+ if (trans_coeff_level > (3 << c_rice_param) && -+ (c_rice_param < 4 || rice_adaptation_enabled)) -+ ++c_rice_param; -+ } -+ } while (coded_vals != 0); -+ } -+ -+ // sign_hidden = 0 or 1 so we can combine the tests -+ if ((sign_hidden & sum_abs) != 0) { -+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; -+ } -+ -+ bypass_finish(&lc->cc); -+ -+ // -- Finish bypass block -+ -+ // Scale loop -+ { -+ int m = nb_significant_coeff_flag - 1; -+ -+ // Deal with DC component (if any) first -+ if (i == 0 && significant_coeff_flag_idx[m] == 0) -+ { -+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; -+ const int res = trans_scale_sat( -+ (levels[m] ^ k) - k, scale, dc_scale, shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) -+ { -+ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); -+ } -+ else -+#endif -+ { -+ blk_coeffs[0] = res; -+ } -+ --m; -+ } -+ -+#if !USE_N_END_1 -+ // If N_END_1 set then m was at least 1 initially -+ if (m >= 0) -+#endif -+ { -+ do { -+ const xy_off_t * const xy_off = scan_xy_off + -+ significant_coeff_flag_idx[m]; -+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; -+ const int res = trans_scale_sat( -+ (levels[m] ^ k) - k, -+ scale, -+ blk_scale[xy_off->scale], -+ shift); -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) { -+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); -+ } else -+#endif -+ blk_coeffs[xy_off->coeff] = res; -+ } while (--m >= 0); -+ } -+ } -+ -+ } -+ } -+ } while ((i = next_subset(lc, i, c_idx_nz, -+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 && -+ !cabac_overflow(&lc->cc)); -+ -+ if 
(lc->cu.cu_transquant_bypass_flag) { -+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -+ (pred_mode_intra == 10 || pred_mode_intra == 26))) { -+ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag; -+ -+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); -+ } -+ } else { -+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass -+ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && -+ log2_trafo_size == 2 && -+ lc->cu.pred_mode == MODE_INTRA; -+ if (rot) { -+ for (i = 0; i < 8; i++) -+ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); -+ } -+ -+ s->hevcdsp.dequant(coeffs, log2_trafo_size); -+ -+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -+ lc->cu.pred_mode == MODE_INTRA && -+ (pred_mode_intra == 10 || pred_mode_intra == 26))) { -+ int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26); -+ -+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); -+ } -+ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { -+ s->hevcdsp.transform_4x4_luma(coeffs); -+ } -+ else if (!use_vpu) -+ { -+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); -+ if (max_xy == 0) -+ { -+ if (use_dc) -+ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); -+ else -+ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); -+ } -+ else { -+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; -+ if (max_xy < 4) -+ col_limit = FFMIN(4, col_limit); -+ else if (max_xy < 8) -+ col_limit = FFMIN(8, col_limit); -+ else if (max_xy < 12) -+ col_limit = FFMIN(24, col_limit); -+ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); -+ } -+ } -+ } -+ -+#if 0 -+ // Mildly rotted - we support no mode where cross is valid -+ if (lc->tu.cross_pf) { -+ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; -+ const int ccount = 1 << (log2_trafo_size * 2); -+ -+ for (i = 0; i < ccount; i++) { -+ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); -+ } -+ } -+#endif -+ -+ if (!use_dc) { -+#if RPI_COMPRESS_COEFFS -+ if (use_compress) { -+ coeffs32[num_nonzero] = 0; -+ } -+#endif -+ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); -+ } -+} -+ -+#if !USE_BY22 -+// Stores results to lc -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) -+{ -+ int x = abs_mvd_greater0_flag_decode(lc); -+ int y = abs_mvd_greater0_flag_decode(lc); -+ -+ if (x) -+ x += abs_mvd_greater1_flag_decode(lc); -+ if (y) -+ y += abs_mvd_greater1_flag_decode(lc); -+ -+ switch (x) { -+ case 2: x = mvd_decode(lc); break; -+ case 1: x = mvd_sign_flag_decode(lc); break; -+ case 0: x = 0; break; -+ } -+ -+ switch (y) { -+ case 2: y = mvd_decode(lc); break; -+ case 1: y = mvd_sign_flag_decode(lc); break; -+ case 0: y = 0; break; -+ } -+ return MV_XY(x,y); -+} -+#else -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) -+{ -+ int x = abs_mvd_greater0_flag_decode(lc); -+ int y = abs_mvd_greater0_flag_decode(lc); -+ -+ if ((x | y) == 0) -+ return 0; -+ -+ if (x != 0) -+ x += abs_mvd_greater1_flag_decode(lc); -+ if (y != 0) -+ y += abs_mvd_greater1_flag_decode(lc); -+ -+ if ((x | y) == 1) -+ { -+ // Not worth starting BY22 -+ if (x != 0) -+ x = mvd_sign_flag_decode(lc); -+ if (y != 0) -+ y = mvd_sign_flag_decode(lc); -+ } -+ else -+ { -+ CABACContext * const cc = &lc->cc; -+ uint32_t val; -+ uint32_t b; -+ unsigned int n = 0; -+ -+ bypass_start(cc); -+ b = val = 
get_cabac_by22_peek(cc); -+ -+ if (x == 1) { -+ x = ((int32_t)b >> 31) | 1; -+ n = 1; -+ b <<= 1; -+ } -+ else if (x == 2) { -+ // EG1 so we have (leading one bits + 1) of suffix -+ // This makes prefix & suffix lengths the same -+ const unsigned int k = hevc_clz32(~b) + 1; -+ int s; -+ -+ av_assert2(k <= 15); -+ -+ b <<= k; -+ n = 2 * k + 1; // Includes suffix & sign -+ -+ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked -+ // if we are going to do this without a flush -+ if (k > CABAC_BY22_PEEK_BITS / 2 - 1) -+ { -+ // Need too many bits - flush -+ // n = k -+ get_cabac_by22_flush(cc, k, val); -+ b = val = get_cabac_by22_peek(cc); -+ n = k + 1; -+ } -+ -+ x = (b >> (32 - k)) + (1 << k); -+ b <<= k; -+ s = (int32_t)b >> 31; -+ x = (x ^ s) - s; -+ b <<= 1; -+ -+ // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits) -+ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15) -+ { -+ get_cabac_by22_flush(cc, n, val); -+ b = val = get_cabac_by22_peek(cc); -+ n = 0; -+ } -+ } -+ -+ if (y == 1) { -+ y = ((int32_t)b >> 31) | 1; -+ ++n; -+ // don't care about b anymore -+ } -+ else if (y == 2) { -+ const unsigned int k = hevc_clz32(~b) + 1; -+ int s; -+ -+ av_assert2(k <= 15); -+ -+ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked -+ // if we are going to do this without a flush -+ b <<= k; -+ n += 2 * k + 1; -+ -+ if (n > CABAC_BY22_PEEK_BITS) -+ { -+ // Need too many bits - flush -+ get_cabac_by22_flush(cc, n - (k + 1), val); -+ b = val = get_cabac_by22_peek(cc); -+ n = k + 1; -+ } -+ -+ y = (b >> (32 - k)) + (1 << k); -+ s = (int32_t)(b << k) >> 31; -+ y = (y ^ s) - s; -+ // don't care about b anymore -+ } -+ -+ get_cabac_by22_flush(cc, n, val); -+ bypass_finish(cc); -+ } -+ -+ return MV_XY(x, y); -+} -+#endif -diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h -new file mode 100644 -index 0000000000..ca191f00d9 ---- /dev/null -+++ b/libavcodec/rpi_hevc_cabac_fns.h -@@ -0,0 +1,217 @@ -+/* -+ * HEVC CABAC decoding -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2018 John Cox -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
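The two ff_hevc_rpi_hls_mvd_coding variants deleted above decode each large motion-vector-delta component as first-order Exp-Golomb: a run of 1 bins, an equally long suffix, then a sign bin; the BY22 path merely batches those bypass bins via peek/flush. As a reading aid only, here is a minimal bin-by-bin sketch of that layout in plain C. The BypassReader type and read_bin helper are hypothetical stand-ins for the CABAC bypass engine (and the CABAC_MAX_BIN overrun guard is omitted); none of this is part of the patch.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the CABAC bypass engine: an MSB-first
 * bit reader over a plain buffer. */
typedef struct {
    const uint8_t *buf;
    size_t pos;
} BypassReader;

static int read_bin(BypassReader *br)
{
    const int b = (br->buf[br->pos >> 3] >> (7 - (br->pos & 7))) & 1;
    br->pos++;
    return b;
}

/* Mirrors mvd_decode(): called when |mvd| >= 2, so the value starts
 * at 2.  Each leading 1 bin doubles the base (the EG1 prefix), then
 * an equal number of suffix bins and a sign bin follow.  A sign bin
 * of 1 is taken here to mean negative. */
static int decode_mvd_ge2(BypassReader *br)
{
    int ret = 2;
    int k = 1;

    while (read_bin(br)) {      /* prefix: count the 1 bins */
        ret += 1 << k;
        k++;
    }
    while (k--)                 /* suffix: k bins, MSB first */
        ret += read_bin(br) << k;

    return read_bin(br) ? -ret : ret;
}

int main(void)
{
    /* Bins 1,0 (prefix 1 -> base 4), then 1,1 (suffix 3), then 0 (positive) */
    const uint8_t bits[] = { 0xb0 };
    BypassReader br = { bits, 0 };
    printf("%d\n", decode_mvd_ge2(&br));    /* prints 7 */
    return 0;
}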
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+ -+#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H -+#define AVCODEC_RPI_HEVC_CABAC_FNS_H -+ -+#include "config.h" -+#include "rpi_hevcdec.h" -+ -+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc); -+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags); -+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size); -+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH); -+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx); -+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx); -+ -+//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); -+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, -+ const int log2_trafo_size, const enum ScanType scan_idx, -+ const int c_idx); -+ -+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); -+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); -+ -+#define HEVC_BIN_SAO_MERGE_FLAG 0 -+#define HEVC_BIN_SAO_TYPE_IDX 1 -+#define HEVC_BIN_SAO_EO_CLASS 2 -+#define HEVC_BIN_SAO_BAND_POSITION 2 -+#define HEVC_BIN_SAO_OFFSET_ABS 2 -+#define HEVC_BIN_SAO_OFFSET_SIGN 2 -+#define HEVC_BIN_END_OF_SLICE_FLAG 2 -+#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2 -+#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5 -+#define HEVC_BIN_SKIP_FLAG 6 -+#define HEVC_BIN_CU_QP_DELTA 9 -+#define HEVC_BIN_PRED_MODE 12 -+#define HEVC_BIN_PART_MODE 13 -+#define HEVC_BIN_PCM_FLAG 17 -+#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17 -+#define HEVC_BIN_MPM_IDX 18 -+#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18 -+#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18 -+#define HEVC_BIN_MERGE_FLAG 20 -+#define HEVC_BIN_MERGE_IDX 21 -+#define HEVC_BIN_INTER_PRED_IDC 22 -+#define HEVC_BIN_REF_IDX_L0 27 -+#define HEVC_BIN_REF_IDX_L1 29 -+#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31 -+#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33 -+#define HEVC_BIN_ABS_MVD_MINUS2 35 -+#define HEVC_BIN_MVD_SIGN_FLAG 35 -+#define HEVC_BIN_MVP_LX_FLAG 35 -+#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36 -+#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37 -+#define HEVC_BIN_CBF_LUMA 40 -+#define 
HEVC_BIN_CBF_CB_CR 42 -+#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46 -+#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48 -+#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88 -+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88 -+#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88 -+#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92 -+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136 -+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160 -+#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166 -+#define HEVC_BIN_COEFF_SIGN_FLAG 166 -+#define HEVC_BIN_LOG2_RES_SCALE_ABS 166 -+#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174 -+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176 -+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177 -+ -+ -+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state); -+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c); -+ -+static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) { -+ const uint8_t *ptr = c->bytestream; -+ -+ if (c->low & 0x1) -+ ptr--; -+#if CABAC_BITS == 16 -+ if (c->low & 0x1FF) -+ ptr--; -+#endif -+ if ((int) (c->bytestream_end - ptr) < n) -+ return NULL; -+ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0) -+ return NULL; -+ -+ return ptr; -+} -+ -+static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG); -+} -+ -+static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int ct_depth, -+ const unsigned int x0, const unsigned int y0) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG + -+ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + -+ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth)); -+} -+ -+static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, const int x_cb, const int y_cb) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + -+ (s->cabac_stash_left[y0 >> 3] & 1) + -+ (s->cabac_stash_up[x0 >> 3] & 1)); -+} -+ -+static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE); -+} -+ -+static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac_terminate(&lc->cc); -+} -+ -+static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE); -+} -+ -+static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG); -+} -+ -+static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return 
ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG); -+} -+ -+static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG); -+} -+ -+static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth); -+} -+ -+static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth); -+} -+ -+static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size); -+} -+ -+static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx) -+{ -+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx); -+} -+ -+ -+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c -new file mode 100644 -index 0000000000..341bb77d9d ---- /dev/null -+++ b/libavcodec/rpi_hevc_data.c -@@ -0,0 +1,75 @@ -+/* -+ * HEVC shared tables -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include <stdint.h> -+ -+#include "rpi_hevc_data.h" -+ -+const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = { -+ 0, 0, 1, 0, -+ 1, 2, 0, 1, -+ 2, 3, 1, 2, -+ 3, 2, 3, 3, -+}; -+ -+const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = { -+ 0, 1, 0, 2, -+ 1, 0, 3, 2, -+ 1, 0, 3, 2, -+ 1, 3, 2, 3, -+}; -+ -+const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = { -+ 0, 0, 1, 0, -+ 1, 2, 0, 1, -+ 2, 3, 0, 1, -+ 2, 3, 4, 0, -+ 1, 2, 3, 4, -+ 5, 0, 1, 2, -+ 3, 4, 5, 6, -+ 0, 1, 2, 3, -+ 4, 5, 6, 7, -+ 1, 2, 3, 4, -+ 5, 6, 7, 2, -+ 3, 4, 5, 6, -+ 7, 3, 4, 5, -+ 6, 7, 4, 5, -+ 6, 7, 5, 6, -+ 7, 6, 7, 7, -+}; -+ -+const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = { -+ 0, 1, 0, 2, -+ 1, 0, 3, 2, -+ 1, 0, 4, 3, -+ 2, 1, 0, 5, -+ 4, 3, 2, 1, -+ 0, 6, 5, 4, -+ 3, 2, 1, 0, -+ 7, 6, 5, 4, -+ 3, 2, 1, 0, -+ 7, 6, 5, 4, -+ 3, 2, 1, 7, -+ 6, 5, 4, 3, -+ 2, 7, 6, 5, -+ 4, 3, 7, 6, -+ 5, 4, 7, 6, -+ 5, 7, 6, 7, -+}; -diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h -new file mode 100644 -index 0000000000..0aee673d8b ---- /dev/null -+++ b/libavcodec/rpi_hevc_data.h -@@ -0,0 +1,31 @@ -+/* -+ * HEVC shared data tables -+ * -+ * This file is part of FFmpeg.
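The diagonal scan tables deleted just above follow mechanically from the HEVC up-right diagonal order: each anti-diagonal x + y == d of the n x n block is walked with x ascending. A small stand-alone generator, offered only as a cross-check and not part of the patch, reproduces ff_hevc_rpi_diag_scan4x4_x/_y and ff_hevc_rpi_diag_scan8x8_x/_y:

#include <stdio.h>

/* Print the x (or y) coordinates of the up-right diagonal scan for
 * an n x n block; for n == 4 and n == 8 the output matches the
 * tables in rpi_hevc_data.c. */
static void print_diag_scan(const int n, const int want_y)
{
    for (int d = 0; d <= 2 * n - 2; d++) {
        const int x_lo = d < n ? 0 : d - n + 1;   /* clip to the block */
        const int x_hi = d < n ? d : n - 1;
        for (int x = x_lo; x <= x_hi; x++)
            printf("%d, ", want_y ? d - x : x);
    }
    printf("\n");
}

int main(void)
{
    print_diag_scan(4, 0);   /* 0, 0, 1, 0, 1, 2, 0, ... */
    print_diag_scan(4, 1);   /* 0, 1, 0, 2, 1, 0, 3, ... */
    print_diag_scan(8, 0);
    print_diag_scan(8, 1);
    return 0;
}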
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_DATA_H -+#define AVCODEC_RPI_HEVC_DATA_H -+ -+#include <stdint.h> -+ -+extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16]; -+extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16]; -+extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64]; -+extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64]; -+ -+#endif /* AVCODEC_RPI_HEVC_DATA_H */ -diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c -new file mode 100644 -index 0000000000..c8a22bd3d8 ---- /dev/null -+++ b/libavcodec/rpi_hevc_filter.c -@@ -0,0 +1,1210 @@ -+/* -+ * HEVC video decoder -+ * -+ * Originally by: -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 Seppo Tomperi -+ * Copyright (C) 2013 Wassim Hamidouche -+ * -+ * Substantially rewritten: -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details.
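The tctablex/betatablex deblocking tables that open rpi_hevc_filter.c below rely on a padding idiom: each array is over-allocated with zeros on the left and the working name (#define tctable (tctablex + 12 + 6*8)) points into the middle, so a lookup index formed from a QP plus slice tc/beta offsets may run somewhat negative or past the live range and simply read a harmless zero instead of needing a clamp. A minimal sketch of the same idiom, with made-up table contents:

#include <stdio.h>

/* Real entries live at logical indices 0..3; logical indices -8..-1
 * land in the zero padding rather than out of bounds. */
static const unsigned char padded_table[8 + 4] = {
    0, 0, 0, 0, 0, 0, 0, 0,    /* padding for indices -8..-1 */
    10, 20, 30, 40,            /* entries for indices 0..3 */
};
#define table (padded_table + 8)

int main(void)
{
    printf("%d %d\n", table[-3], table[2]);    /* prints "0 30" */
    return 0;
}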
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+//#define DISABLE_SAO -+//#define DISABLE_DEBLOCK -+//#define DISABLE_STRENGTHS -+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) -+//#define DISABLE_DEBLOCK_NONREF -+ -+#include "libavutil/common.h" -+#include "libavutil/internal.h" -+ -+#include "rpi_hevcdec.h" -+ -+#include "bit_depth_template.c" -+ -+#include "rpi_qpu.h" -+#include "rpi_zc.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#define LUMA 0 -+#define CB 1 -+#define CR 2 -+ -+// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2 -+// so -12,75 overall -+static const uint8_t tctablex[] = { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18 -+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37 -+ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53 -+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..75 -+}; -+#define tctable (tctablex + 12 + 6*8) -+ -+static const uint8_t betatablex[] = { -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -+ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18 -+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37 -+ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51 -+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..73 -+}; -+#define betatable (betatablex + 12 + 6*8) -+ -+static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y, -+ const int c_idx, const int tc_offset) -+{ -+ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2]; -+} -+ -+static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int xBase, const unsigned int yBase) -+{ -+ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; -+ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size; -+ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask; -+ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask; -+ const unsigned int min_cb_width = s->ps.sps->min_cb_width; -+ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; -+ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; -+ const int qPy_pred = lc->qPy_pred; -+ -+ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred : -+ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) + -+ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred : -+ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1; -+} -+ -+// * Only called from bitstream decode in foreground -+// so should be safe -+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase) -+{ -+ const int qp_y = get_qPy_pred(s, lc, xBase, yBase); -+ -+ if (lc->tu.cu_qp_delta != 0) { -+ // ?? 
I suspect that the -bd_offset here leads to us adding it elsewhere -+ int off = s->ps.sps->qp_bd_offset; -+ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off, -+ 52 + off) - off; -+ } else -+ lc->qp_y = qp_y; -+} -+ -+static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx) -+{ -+ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; -+} -+ -+// "DSP" these? -+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) -+{ -+ switch (pixel_shift) -+ { -+ case 2: -+ *(uint32_t *)dst = *(uint32_t *)src; -+ break; -+ case 1: -+ *(uint16_t *)dst = *(uint16_t *)src; -+ break; -+ default: -+ *dst = *src; -+ break; -+ } -+} -+ -+static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src, -+ ptrdiff_t stride_src, int x, int y, int width, int height, -+ int c_idx, int x_ctb, int y_ctb) -+{ -+ const unsigned int sh = pixel_shift(s, c_idx); -+ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx); -+ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx); -+ -+ /* copy horizontal edges */ -+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), -+ src, width << sh); -+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), -+ src + stride_src * (height - 1), width << sh); -+ -+ /* copy vertical edges */ -+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); -+ -+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); -+} -+ -+// N.B. Src & dst are swapped as this is a restore! -+// x0 & y0 are in luma coords -+// Width & height are in Y/C pels as appropriate -+// * Clear scope for optimisation here but not used enough to be worth it -+static void restore_tqb_pixels(const HEVCRpiContext * const s, -+ uint8_t *src1, const uint8_t *dst1, -+ const ptrdiff_t stride_src, const ptrdiff_t stride_dst, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int width, const int height, -+ const int c_idx) -+{ -+ if (s->ps.pps->transquant_bypass_enable_flag || -+ s->ps.sps->pcm.loop_filter_disable_flag) -+ { -+ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width; -+ int blks_y = height >> (c_idx == 0 ? 3 : 2); -+ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand -+ const unsigned int bheight = (c_idx == 0) ? 8 : 4; -+ const unsigned int sh = ((x0 >> 3) & 7); -+ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 
3 : 2))) - 1; -+ -+ do { -+ unsigned int m = (*pcm >> sh) & mask; -+ uint8_t * bd = src1; -+ const uint8_t * bs = dst1; -+ while (m != 0) { -+ if ((m & 1) != 0) { -+ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); -+ } -+ m >>= 1; -+ bs += bwidth; -+ bd += bwidth; -+ } -+ src1 += stride_src * bheight; -+ dst1 += stride_dst * bheight; -+ pcm += s->ps.sps->pcm_width; -+ } while (--blks_y > 0); -+ } -+} -+ -+#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) -+ -+static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y) -+{ -+#if SAO_FILTER_N == 5 -+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; -+#elif SAO_FILTER_N == 6 -+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; -+#else -+#error Confused by size of sao fn array -+#endif -+ int c_idx; -+ int edges[4]; // 0 left 1 top 2 right 3 bottom -+ int x_ctb = x >> s->ps.sps->log2_ctb_size; -+ int y_ctb = y >> s->ps.sps->log2_ctb_size; -+ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb; -+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs]; -+ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb); -+ // flags indicating unfilterable edges -+ uint8_t vert_edge[] = { 0, 0 }; -+ uint8_t horiz_edge[] = { 0, 0 }; -+ uint8_t diag_edge[] = { 0, 0, 0, 0 }; -+ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb); -+ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag && -+ !s->ps.pps->loop_filter_across_tiles_enabled_flag; -+ uint8_t restore = no_tile_filter || !lfase; -+ uint8_t left_tile_edge = 0; -+ uint8_t right_tile_edge = 0; -+ uint8_t up_tile_edge = 0; -+ uint8_t bottom_tile_edge = 0; -+ const int sliced = 1; -+ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 
3 : 1); -+ -+ edges[0] = x_ctb == 0; -+ edges[1] = y_ctb == 0; -+ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; -+ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; -+ -+#ifdef DISABLE_SAO -+ return; -+#endif -+ -+ if (restore) { -+ if (!edges[0]) { -+ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -+ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge; -+ } -+ if (!edges[2]) { -+ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]]; -+ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge; -+ } -+ if (!edges[1]) { -+ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]; -+ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge; -+ } -+ if (!edges[3]) { -+ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]]; -+ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge; -+ } -+ if (!edges[0] && !edges[1]) { -+ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge; -+ } -+ if (!edges[1] && !edges[2]) { -+ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge; -+ } -+ if (!edges[2] && !edges[3]) { -+ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge; -+ } -+ if (!edges[0] && !edges[3]) { -+ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge; -+ } -+ } -+ -+ for (c_idx = 0; c_idx < plane_count; c_idx++) { -+ const unsigned int vshift = ctx_vshift(s, c_idx); -+ const unsigned int hshift = ctx_hshift(s, c_idx); -+ const int x0 = x >> hshift; -+ const int y0 = y >> vshift; -+ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx); -+ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift; -+ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift; -+ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0); -+ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0); -+ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; -+ ptrdiff_t stride_dst; -+ uint8_t *dst; -+ -+ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); -+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; -+ uint8_t * const src = !sliced ? -+ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0, y0); -+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : -+ !sliced ? src - (1 << sh) : -+ c_idx == 0 ? 
-+ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); -+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : -+ !sliced ? src + (width << sh) : -+ c_idx == 0 ? -+ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : -+ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); -+ -+ if (sliced && c_idx > 1) { -+ break; -+ } -+ -+// if (c_idx == 1) -+// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr); -+ -+ switch (sao->type_idx[c_idx]) { -+ case SAO_BAND: -+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, -+ x_ctb, y_ctb); -+ if (s->ps.pps->transquant_bypass_enable_flag || -+ s->ps.sps->pcm.loop_filter_disable_flag) -+ { -+ // Can't use the edge buffer here as it may be in use by the foreground -+ DECLARE_ALIGNED(64, uint8_t, dstbuf) -+ [2*MAX_PB_SIZE*MAX_PB_SIZE]; -+ dst = dstbuf; -+ stride_dst = 2*MAX_PB_SIZE; -+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); -+ if (sliced && c_idx != 0) -+ { -+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, -+ sao->offset_val[1], sao->band_position[1], -+ sao->offset_val[2], sao->band_position[2], -+ width, height); -+ } -+ else -+ { -+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, -+ sao->offset_val[c_idx], sao->band_position[c_idx], -+ width, height); -+ } -+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, -+ x, y, width, height, c_idx); -+ } else { -+ if (sliced && c_idx != 0) -+ { -+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, -+ sao->offset_val[1], sao->band_position[1], -+ sao->offset_val[2], sao->band_position[2], -+ width, height); -+ } -+ else -+ { -+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, -+ sao->offset_val[c_idx], sao->band_position[c_idx], -+ width, height); -+ } -+ } -+ sao->type_idx[c_idx] = SAO_APPLIED; -+ break; -+ case SAO_EDGE: -+ { -+ const int w = s->ps.sps->width >> hshift; -+ const int h = s->ps.sps->height >> vshift; -+ int top_edge = edges[1]; -+ int bottom_edge = edges[3]; -+ // Can't use the edge buffer here as it may be in use by the foreground -+ DECLARE_ALIGNED(64, uint8_t, dstbuf) -+ [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64]; -+ -+ stride_dst = RPI_HEVC_SAO_BUF_STRIDE; -+ dst = dstbuf + stride_dst + 32; -+ -+ if (!top_edge) { -+ uint8_t *dst1; -+ int src_idx; -+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); -+ -+ dst1 = dst - stride_dst; -+ -+ if (src_l != NULL) { -+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); -+ } -+ -+ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); -+ -+ if (src_r != NULL) { -+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); -+ } -+ } -+ if (!bottom_edge) { -+ uint8_t * const dst1 = dst + height * stride_dst; -+ int src_idx; -+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); -+ const unsigned int hoff = height * stride_src; -+ -+ if (src_l != NULL) { -+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 - (1 << sh), src_idx ? 
src_spb - (1 << sh) : src_l + hoff, sh); -+ } -+ -+ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); -+ -+ if (src_r != NULL) { -+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == -+ SAO_APPLIED); -+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); -+ } -+ } -+ if (src_l != NULL) { -+ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { -+ ff_hevc_rpi_copy_vert(dst - (1 << sh), -+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), -+ sh, height, stride_dst, 1 << sh); -+ } else { -+ ff_hevc_rpi_copy_vert(dst - (1 << sh), -+ src_l, -+ sh, height, stride_dst, stride_src); -+ } -+ } -+ if (src_r != NULL) { -+ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { -+ ff_hevc_rpi_copy_vert(dst + (width << sh), -+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), -+ sh, height, stride_dst, 1 << sh); -+ } else { -+ ff_hevc_rpi_copy_vert(dst + (width << sh), -+ src_r, -+ sh, height, stride_dst, stride_src); -+ } -+ } -+ -+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); -+ -+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, -+ x_ctb, y_ctb); -+ if (sliced && c_idx != 0) -+ { -+ // Class always the same for both U & V (which is just as well :-)) -+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, -+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], -+ width, height); -+ s->hevcdsp.sao_edge_restore_c[restore](src, dst, -+ stride_src, stride_dst, -+ sao, -+ edges, width, -+ height, c_idx, -+ vert_edge, -+ horiz_edge, -+ diag_edge); -+ } -+ else -+ { -+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], -+ sao->eo_class[c_idx], width, height); -+ s->hevcdsp.sao_edge_restore[restore](src, dst, -+ stride_src, stride_dst, -+ sao, -+ edges, width, -+ height, c_idx, -+ vert_edge, -+ horiz_edge, -+ diag_edge); -+ } -+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, -+ x, y, width, height, c_idx); -+ sao->type_idx[c_idx] = SAO_APPLIED; -+ break; -+ } -+ } -+ } -+ -+#if RPI_ZC_SAND_8_IN_10_BUF -+ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && -+ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) -+ { -+ const unsigned int stride1 = frame_stride1(s->frame, 1); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); -+ const unsigned int xoff = (x >> 8) * stride2 * stride1; -+ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); -+ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; -+ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; -+ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; -+ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; -+ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); -+ const unsigned int hy = !edges[3] ? 
ctb_size : s->ps.sps->height - y; -+ -+// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); -+ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); -+ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); -+ } -+#endif -+} -+ -+// When bits are delivered to deblock we want them -+//#define TL 1 -+//#define TR 2 -+//#define BL 4 -+//#define BR 8 -+ -+// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br -+// so we need to rearrange before passing on -+ -+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; -+ return (pcm[0] | -+ (pcm[1] << 8) | -+ (pcm[s->ps.sps->pcm_width] << 16) | -+ (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7); -+} -+ -+static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; -+ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); -+} -+ -+// We cast away const here as we want this to work for both get and set -+static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) -+{ -+ return (uint32_t *)(bs + -+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 -+#warning Unexpected masks -+ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes -+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & -+ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + -+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 -+#error Stride1 < return size -+#endif -+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + -+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); -+} -+ -+static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) -+{ -+ return (uint8_t *)(bs + -+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & -+ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + -+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + -+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); -+} -+ -+ -+// Get block strength -+// Given how we call we will always get within the 32bit boundries -+static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2, -+ unsigned int xl, unsigned int xr, const unsigned int y) -+{ -+ if (xr <= xl) { -+ return 0; -+ } -+ else -+ { -+#if HAVE_ARMV6T2_INLINE -+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 -+#error This case not yet handled in bs_get32 -+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 -+#error Stride1 < return size -+#endif -+ uint32_t tmp; -+ __asm__ ( -+ "lsr %[tmp], %[xl], %[xl_shift] \n\t" -+ "rsb %[xr], %[xl], %[xr] \n\t" -+ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t" -+ "add %[xr], %[xr], #7 \n\t" -+ "lsr %[bs], %[y], %[y_shift1] \n\t" -+ "bic %[xr], %[xr], #7 \n\t" -+ "ubfx %[xl], %[xl], #1, #5 \n\t" -+ "lsr %[xr], %[xr], #1 \n\t" -+ "cmp %[xr], #32 \n\t" -+ "mvn %[tmp], #0 \n\t" -+ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" -+ "lsl %[tmp], %[tmp], %[xr] \n\t" -+ "lsr %[xl], %[bs], %[xl] \n\t" -+ "it ne \n\t" -+ "bicne %[bs], %[xl], %[tmp] \n\t" -+ : // Outputs -+ [bs]"+r"(bs), -+ [stride2]"+r"(stride2), -+ [xl]"+r"(xl), -+ [xr]"+r"(xr), -+ 
[tmp]"=&r"(tmp) -+ : // Inputs -+ [y]"r"(y), -+ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT), -+ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR), -+ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+ : // Clobbers -+ "cc" -+ ); -+ return (uint32_t) bs; -+#else -+ const uint32_t a = *bs_ptr32(bs, stride2, xl, y); -+ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; -+ -+ return n == 32 ? a : -+ (a >> ((xl >> 1) & 31)) & ~(~0U << n); -+#endif -+ } -+} -+ -+static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) -+{ -+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); -+ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); -+} -+ -+static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) -+{ -+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); -+ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); -+} -+ -+ -+static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ const unsigned int cb_r = FFMIN(bounds.x + bounds.w, s->ps.sps->width) - (end_x ? 0 : 1); -+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; -+ const DBParams * cb_dbp = s->deblock + ctb_n; -+ const unsigned int b_b = FFMIN(bounds.y + bounds.h, s->ps.sps->height) - (end_y ? 0 : 8); -+ -+ unsigned int cb_x; -+ -+ // Do in CTB-shaped blocks -+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp) -+ { -+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); -+ const unsigned int bv_l = FFMAX(cb_x, 8); -+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9; -+ const unsigned int bh_l = bv_l - 8; -+ unsigned int y; -+ -+ // Main body -+ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) -+ { -+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); -+ -+ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp; -+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ -+ if (vbs != 0) -+ { -+ const uint8_t * const tcv = tctable + dbp->tc_offset; -+ const uint8_t * const betav = betatable + dbp->beta_offset; -+ unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+ unsigned int x; -+ -+ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) -+ { -+ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) -+ { -+ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betav[qp], -+ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | -+ (((vbs & 0xc) == 0 ? 
0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), -+ pcmfa & 3, -+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); -+ } -+ } -+ } -+ -+ if (y != 0) -+ { -+ uint32_t hbs; -+ -+ // H left - mostly separated out so we only need a uint32_t hbs -+ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) -+ { -+ const unsigned int x = bh_l; -+ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const DBParams * const dbph = dbp - 1; -+ const uint8_t * const tc = tctable + dbph->tc_offset + qp; -+ -+ av_assert2(cb_x - bh_l == 8); -+ -+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betatable[qp + dbph->beta_offset], -+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | -+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), -+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); -+ } -+ -+ // H -+ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop -+ { -+ unsigned int x; -+ unsigned int pcmfa = pcm4(s, cb_x, y - 1); -+ -+ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) -+ { -+ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) -+ { -+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + dbp->tc_offset + qp; -+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ frame_stride1(s->frame, LUMA), -+ betatable[qp + dbp->beta_offset], -+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | -+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), -+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); -+ } -+ } -+ } -+ } -+ -+ } -+ } -+} -+ -+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) -+{ -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1; -+} -+ -+static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ const unsigned int cb_r = FFMIN(bounds.x + bounds.w, s->ps.sps->width) - (end_x ? 0 : 8); -+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; -+ const DBParams * dbp = s->deblock + ctb_n; -+ const unsigned int b_b = FFMIN(bounds.y + bounds.h, s->ps.sps->height) - (end_y ? 
0 : 8); -+ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; -+ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; -+ -+ unsigned int cb_x; -+ -+ av_assert1((bounds.x & (ctb_size - 1)) == 0); -+ av_assert1((bounds.y & (ctb_size - 1)) == 0); -+ av_assert1(bounds.h <= ctb_size); -+ -+ // Do in CTB-shaped blocks -+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) { -+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); -+ const unsigned int bv_l = FFMAX(cb_x, 16); -+ unsigned int y; -+ -+ // V above -+ if (bounds.y != 0) { -+ // Deblock V up 8 -+ // CTB above current -+ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm -+ const unsigned int y = bounds.y - 8; -+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; -+ -+ if (vbs != 0) -+ { -+ unsigned int pcmfa = pcm2(s, bv_l - 1, y); -+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; -+ unsigned int x; -+ -+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) -+ { -+ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) -+ { -+ const int qp0 = q2h(s, x, y); -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ pcmfa & 3); -+ } -+ } -+ } -+ } -+ -+ for (y = bounds.y; y < b_b; y += 16) -+ { -+ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | -+ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); -+ -+ // V -+ if (vbs != 0) -+ { -+ unsigned int x; -+ unsigned int pcmfa = -+ (y + 16 > b_b ? -+ pcm2(s, bv_l - 1, y) | 0xffff0000 : -+ pcm4(s, bv_l - 1, y)); -+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; -+ -+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) -+ { -+ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) -+ { -+ const int qp0 = q2h(s, x, y); -+ const int qp1 = q2h(s, x, y + 8); -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); -+ } -+ } -+ } -+ -+ // H -+ if (y != 0) -+ { -+ uint32_t hbs; -+ const unsigned int bh_l = bv_l - 16; -+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; -+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; -+ -+ // H left - mostly separated out so we only need a uint32_t hbs -+ // Stub is width 8 to the left of bounds, but width 16 internally -+ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) -+ { -+ unsigned int pcmfa = pcm4(s, bh_l, y - 1); -+ -+ // Chop off bits we don't want... 
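-+ // pcm4 packs its flags as b0 = TL, b1 = TR, b16 = BL, b17 = BR, so -+ // OR-ing in 0x10001 below marks the TL & BL cells of the stub as PCM, -+ // and clearing the low strength bits zeroes BS for the 8 pels that sit -+ // left of bounds.x, keeping us from filtering outside the current bounds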
-+ if (bh_l < bounds.x) { -+ pcmfa |= 0x10001; // TL|BL pre rearrangement -+ hbs &= ~3; // Make BS 0 -+ } -+ -+ // Double check we still want this -+ if (hbs != 0 && (~pcmfa & 0x30003) != 0) -+ { -+ const unsigned int x = bh_l; -+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; -+ -+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); -+ } -+ } -+ -+ // H main -+ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0) -+ { -+ unsigned int x; -+ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it -+ -+ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2) -+ { -+ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) -+ { -+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; -+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; -+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; -+ -+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ frame_stride1(s->frame, 1), -+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | -+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), -+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); -+ } -+ } -+ } -+ } -+ } -+ } -+} -+ -+static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n) -+{ -+ return x & ~(~0U << log2_n); -+} -+ -+static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) -+{ -+ av_assert2((y & 7) == 0); -+ -+ // This doesn't have the same simultaneous update issues that bsf_stash -+ // does (other threads will have a different y) so we can do it the easy way -+ if ((bsf &= mask) != 0) -+ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31); -+} -+ -+ -+static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) -+{ -+ // We arrange this in a slightly odd fashion but it lines up with -+ // how we are going to use it in the actual deblock code & it is easier -+ // to do the contortions here than there -+ // -+ // Arrange (LE) {x0y0, x0y4, x8y0, x8y4}, {x16y0, x16y4, x24y0, x24y4},...
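-+ // -+ // Each byte thus holds four 2-bit strengths covering a 16x8 pel patch; -+ // sh = ((x & 8) | (y & 4)) >> 1 below picks the lane within the byte: -+ // (x+0,y+0) -> bit 0, (x+0,y+4) -> bit 2, (x+8,y+0) -> bit 4, (x+8,y+4) -> bit 6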
-+ -+ av_assert2((x & 7) == 0); -+ -+ if ((bsf &= mask) != 0) -+ { -+ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y); -+ const unsigned int sh = ((x & 8) | (y & 4)) >> 1; -+ -+ if (mask <= 0xf) -+ { -+ *p |= (bsf << sh); -+ } -+ else -+ { -+ do { -+ *p |= (bsf & 0xf) << sh; -+ p += HEVC_RPI_BS_STRIDE1_BYTES; -+ } while ((bsf >>= 4) != 0); -+ } -+ } -+} -+ -+static inline uint32_t bsf_mv(const HEVCRpiContext * const s, -+ const unsigned int rep, const unsigned int dup, -+ const unsigned int mvf_stride0, -+ const unsigned int mvf_stride1, -+ const RefPicList * const rpl_p, const RefPicList * const rpl_q, -+ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q) -+{ -+ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, -+ mvf_p, mvf_q, -+ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, -+ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1); -+} -+ -+ -+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, -+ const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_trafo_size, -+ const int is_coded_block) -+{ -+ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0); -+ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE; -+ const RefPicList * const rpl = s->refPicList; -+ // Rep count for bsf_mv when running with min_pu chuncks -+ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size; -+ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags; -+ const unsigned int trafo_size = (1U << log2_trafo_size); -+ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1; -+ const uint32_t bsf_cbf = (bsf_mask & 0x55555555); -+ -+ // Do we cover a pred split line? -+ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split; -+ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split; -+ -+ uint32_t bsf_h; -+ uint32_t bsf_v; -+ -+#ifdef DISABLE_STRENGTHS -+ return; -+#endif -+ -+ // We are always on a size boundary -+ av_assert2((x0 & (trafo_size - 1)) == 0); -+ av_assert2((y0 & (trafo_size - 1)) == 0); -+ // log2_trafo_size not really a transform size; we can have to deal -+ // with size 2^6 blocks -+ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6); -+ -+ // Retrieve and update coded (b0), intra (b1) bs flags -+ // -+ // Store on min width (rather than uint32_t) to avoid possible issues -+ // with another thread on another core running wpp using the same -+ // memory (min CTB = 16 pels = 4 bsf els = 8 bits) -+ // -+ // In bsf BS=2 is represented by 3 as it is much easier to test & set -+ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and -+ // 3 will work the same -+ { -+ // Given where we are called from is_cbf_luma & is_intra will be constant over the block -+ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? 
bsf_cbf : 0; -+ uint8_t *const p = s->bsf_stash_up + (x0 >> 4); -+ uint8_t *const q = s->bsf_stash_left + (y0 >> 4); -+ -+ switch (log2_trafo_size) -+ { -+ case 2: -+ case 3: -+ { -+ const unsigned int sh_h = (x0 >> 1) & 7; -+ const unsigned int sh_v = (y0 >> 1) & 7; -+ bsf_h = *p; -+ bsf_v = *q; -+ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h); -+ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v); -+ bsf_h >>= sh_h; -+ bsf_v >>= sh_v; -+ break; -+ } -+ case 4: -+ bsf_h = *p; -+ bsf_v = *q; -+ *p = bsf0; -+ *q = bsf0; -+ break; -+ case 5: -+ bsf_h = *(uint16_t *)p; -+ bsf_v = *(uint16_t *)q; -+ *(uint16_t *)p = bsf0; -+ *(uint16_t *)q = bsf0; -+ break; -+ case 6: -+ default: -+ bsf_h = *(uint32_t *)p; -+ bsf_v = *(uint32_t *)q; -+ *(uint32_t *)p = bsf0; -+ *(uint32_t *)q = bsf0; -+ break; -+ } -+ -+ bsf_h |= bsf0; -+ bsf_v |= bsf0; -+ } -+ -+ // Do Horizontal -+ if ((y0 & 7) == 0) -+ { -+ // Boundary upper -+ if (y0 != 0 && -+ (off_boundary(y0, s->ps.sps->log2_ctb_size) || -+ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0)) -+ { -+ // Look at MVs (BS=1) if we don't already has a full set of bs bits -+ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split)) -+ { -+ // If we aren't on the top boundary we must be in the middle -+ // and in that case we know where mvf can change -+ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0; -+ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ? -+ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] : -+ rpl; -+ -+ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ trafo_size >> (log2_min_pu_size + log2_rep), -+ trafo_size >> (log2_min_pu_size + log2_rep), -+ rpl, rpl_top, -+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1)); -+ } -+ -+ // Finally put the results into bs -+ hbs_set(s, x0, y0, bsf_mask, bsf_h); -+ } -+ -+ // Max of 1 pu internal split - ignore if not on 8pel boundary -+ if (has_y_split && !off_boundary(lc->cu.y_split, 3)) -+ { -+ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split); -+ // If we have the x split as well then it must be in the middle -+ const unsigned int log2_rep = has_x_split ? 1 : 0; -+ -+ hbs_set(s, x0, lc->cu.y_split, bsf_mask, -+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ trafo_size >> (log2_min_pu_size + log2_rep), -+ trafo_size >> (log2_min_pu_size + log2_rep), -+ rpl, rpl, -+ mvf, mvf - MVF_STASH_WIDTH_PU)); -+ } -+ } -+ -+ // And again for vertical - same logic as horizontal just in the other direction -+ if ((x0 & 7) == 0) -+ { -+ // Boundary left -+ if (x0 != 0 && -+ (off_boundary(x0, s->ps.sps->log2_ctb_size) || -+ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)) -+ { -+ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split)) -+ { -+ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0; -+ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ? 
-+ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] : -+ rpl; -+ -+ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ rpl, rpl_left, -+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0)); -+ } -+ -+ vbs_set(s, x0, y0, bsf_mask, bsf_v); -+ } -+ -+ if (has_x_split && !off_boundary(lc->cu.x_split, 3)) -+ { -+ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0); -+ const unsigned int log2_rep = has_y_split ? 1 : 0; -+ -+ vbs_set(s, lc->cu.x_split, y0, bsf_mask, -+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), -+ rpl, rpl, -+ mvf, mvf - 1)); -+ } -+ } -+} -+ -+#undef LUMA -+#undef CB -+#undef CR -+ -+static inline unsigned int ussub(const unsigned int a, const unsigned int b) -+{ -+ return a < b ? 0 : a - b; -+} -+ -+static inline int cache_boundry(const AVFrame * const frame, const unsigned int x) -+{ -+ return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0; -+} -+ -+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot) -+{ -+ const int ctb_size = (1 << s->ps.sps->log2_ctb_size); -+ int x, y; -+ -+ const unsigned int br = FFMIN(bounds.x + bounds.w, s->ps.sps->width); -+ const unsigned int bb = FFMIN(bounds.y + bounds.h, s->ps.sps->height); -+ -+ const int x_end = (br >= s->ps.sps->width); -+ const int y_end = (bb >= s->ps.sps->height); -+ -+ // Deblock may not touch the edges of the bound as they are still needed -+ // for Intra pred -+ // -+ // Deblock is disabled with a per-slice flag -+ // Given that bounds may cover multiple slices & we dblock outside bounds -+ // anyway we can't avoid deblock using that flag - about the only thing we -+ // could do is have a "no deblock seen yet" flag but it doesn't really -+ // seem worth the effort -+ -+ deblock_y_blk(s, bounds, x_end, y_end); -+ deblock_uv_blk(s, bounds, x_end, y_end); -+ -+ // SAO needs -+ // (a) CTB alignment -+ // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel -+ { -+ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1)); -+ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1)); -+ const unsigned int yt = ussub(bounds.y, yo); -+ const unsigned int yb = y_end ? bb : ussub(bb, yo); -+ const unsigned int xl = ussub(bounds.x, xo); -+ const unsigned int xr = x_end ? br : ussub(br, xo); -+ -+ if (s->ps.sps->sao_enabled) -+ { -+ for (y = yt; y < yb; y += ctb_size) { -+ for (x = xl; x < xr; x += ctb_size) { -+ sao_filter_CTB(s, x, y); -+ } -+ } -+ } -+ -+ // Cache invalidate -+ y = 0; -+ if (xr != 0 && yb != 0) -+ { -+ const unsigned int llen = -+ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame)); -+ const unsigned int mask = ~(llen - 1); -+ const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask; -+ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask; -+ const unsigned int it = ussub(yt, 1); -+ const unsigned int ib = y_end ? 
bb : yb - 1; -+ -+ if (il < ir) { -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ il, it, ir - il, ib - it, -+ ctx_vshift(s, 1), 1, 1); -+ -+ // If we have to commit the right hand tile boundary due to -+ // cache boundary considerations then at EoTile we must commit -+ // that boundary to bottom of tile (bounds) -+ if (ib != bb && ir == br && eot) { -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ br - 1, ib, 1, bb - ib, -+ ctx_vshift(s, 1), 1, 1); -+ } -+ -+ rpi_cache_flush_finish(rfe); -+ -+ if (x_end) -+ y = y_end ? INT_MAX : ib; -+ -+// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1); -+ } -+ } -+ } -+ -+ return y; -+} -+ -diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h -new file mode 100644 -index 0000000000..6b36f5e737 ---- /dev/null -+++ b/libavcodec/rpi_hevc_mv.h -@@ -0,0 +1,71 @@ -+#ifndef AVCODEC_RPI_HEVC_MV_H -+#define AVCODEC_RPI_HEVC_MV_H -+ -+#include "config.h" -+ -+typedef int32_t MvXY; -+ -+typedef struct HEVCRpiMvField { -+ MvXY xy[2]; -+ int8_t ref_idx[2]; -+ int8_t pred_flag; -+ int8_t dummy; // To 12 bytes -+} HEVCRpiMvField; -+ -+ -+#define MV_X(xy) (((xy) << 16) >> 16) -+#define MV_Y(xy) ((xy) >> 16) -+#define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16)) -+ -+#if ARCH_ARM -+#include "arm/rpi_hevc_mv_arm.h" -+#endif -+ -+#ifndef mvxy_add -+static inline MvXY mvxy_add(const MvXY a, const MvXY b) -+{ -+ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); -+} -+#endif -+ -+ -+#ifndef mv_scale_xy -+static inline MvXY mv_scale_xy(const MvXY src, int td, int tb) -+{ -+ int tx, scale_factor; -+ -+ td = td == 0 ? 1 : av_clip_int8(td); -+ tb = av_clip_int8(tb); -+ tx = (0x4000 + (abs(td) >> 1)) / td; -+ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); -+ return MV_XY( -+ av_clip_int16((scale_factor * MV_X(src) + 127 + -+ (scale_factor * MV_X(src) < 0)) >> 8), -+ av_clip_int16((scale_factor * MV_Y(src) + 127 + -+ (scale_factor * MV_Y(src) < 0)) >> 8)); -+} -+#endif -+ -+// 8.3.1 states that the bitstream may not contain poc diffs that do not -+// fit in 16 bits, so given that we don't care about the high bits we only -+// store the low 16 + LT & Inter flags -+ -+#define COL_POC_INTRA 0 -+#define COL_POC_INTER (1 << 16) -+#define COL_POC_LT (1 << 17) -+#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y))) -+#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff)) -+#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0) -+ -+typedef struct ColMv_s { -+ int32_t poc; -+ int32_t xy; -+} ColMv; -+ -+typedef struct ColMvField_s { -+ ColMv L[2]; -+} ColMvField; -+ -+ -+ -+#endif // AVCODEC_RPI_HEVC_MV_H -diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c -new file mode 100644 -index 0000000000..27a9f69525 ---- /dev/null -+++ b/libavcodec/rpi_hevc_mvs.c -@@ -0,0 +1,487 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 Anand Meher Kotra -+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version.
-+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+ -+static av_always_inline int -+is_eq_mer(const unsigned int plevel, -+ const unsigned int xN, const unsigned int yN, -+ const unsigned int xP, const unsigned int yP) -+{ -+ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; -+} -+ -+// check if the mv's and refidx are the same between A and B -+static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) -+{ -+ return a->pred_flag == b->pred_flag && -+ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) && -+ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1])); -+ return 0; -+} -+ -+/* -+ * 8.5.3.1.7 temporal luma motion vector prediction -+ */ -+static int temporal_luma_motion_vector(const HEVCRpiContext * const s, -+ const HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int nPbW, const int nPbH, const int refIdxLx, -+ MvXY * const mvLXCol, const int X) -+{ -+ int x, y; -+ const ColMv * cmv = NULL; -+ -+ HEVCRpiFrame * const col_ref = s->ref->collocated_ref; -+ const RefPicList * const refPicList = s->refPicList + X; -+ const int cur_lt = refPicList->isLongTerm[refIdxLx]; -+ -+ *mvLXCol = 0; -+ // Unlikely but we might have a col_ref IDR frame! -+ if (col_ref->col_mvf == NULL) -+ return 0; -+ -+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH); -+ -+ //bottom right collocated motion vector -+ x = x0 + nPbW; -+ y = y0 + nPbH; -+ -+ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && -+ y < s->ps.sps->height && -+ x < s->ps.sps->width) -+ { -+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + -+ (y >> 4) * s->col_mvf_stride; -+ -+ if (col->L[0].poc != COL_POC_INTRA && -+ (col->L[1].poc == COL_POC_INTRA || -+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) -+ { -+ cmv = col->L + 0; -+ } -+ else if (col->L[1].poc != COL_POC_INTRA) -+ { -+ cmv = col->L + 1; -+ } -+ } -+ -+ // derive center collocated motion vector -+ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt) -+ { -+ cmv = NULL; -+ x = x0 + (nPbW >> 1); -+ y = y0 + (nPbH >> 1); -+ -+ { -+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + -+ (y >> 4) * s->col_mvf_stride; -+ -+ if (col->L[0].poc != COL_POC_INTRA && -+ (col->L[1].poc == COL_POC_INTRA || -+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) -+ { -+ cmv = col->L + 0; -+ } -+ else if (col->L[1].poc != COL_POC_INTRA) -+ { -+ cmv = col->L + 1; -+ } -+ } -+ } -+ -+ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc)) -+ return 0; -+ -+ { -+ const int col_poc = col_ref->poc; -+ const int ref_poc = refPicList->list[refIdxLx]; -+ -+ *mvLXCol = (cur_lt || -+ cmv->poc == col_poc || -+ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ? 
-+ cmv->xy : -+ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc); -+ } -+ -+ return cmv != NULL; -+} -+ -+static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) -+{ -+ return b != NULL && compare_mv_ref_idx(a, b); -+} -+ -+ -+ -+/* -+ * 8.5.3.1.2 Derivation process for spatial merging candidates -+ */ -+static inline const HEVCRpiMvField * -+derive_spatial_merge_candidates( -+ const HEVCRpiContext * const s, -+ const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ const unsigned int part_idx, -+ const unsigned int merge_idx, -+ HEVCRpiMvField * const mvf_t) -+{ -+ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); -+ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); -+ -+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); -+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); -+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); -+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; -+ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level; -+ const unsigned int part_mode = lc->cu.part_mode; -+ -+ const HEVCRpiMvField * perm[4]; -+ unsigned int nb_merge_cand = 0; -+ -+ // singleMCLFlag => part_idx == 0 so no need to test for it -+ if ((avail & AVAIL_L) == 0 || -+ (part_idx == 1 && -+ ((parts_a1 >> part_mode) & 1) != 0 || -+ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || -+ mvf_a1->pred_flag == PF_INTRA) -+ { -+ mvf_a1 = NULL; -+ } -+ else -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_a1; -+ perm[nb_merge_cand++] = mvf_a1; -+ } -+ -+ if ((avail & AVAIL_U) == 0 || -+ (part_idx == 1 && -+ ((parts_b1 >> part_mode) & 1) != 0 || -+ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || -+ mvf_b1->pred_flag == PF_INTRA) -+ { -+ mvf_b1 = NULL; -+ } -+ else if (!mvf_eq(mvf_b1, mvf_a1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b1; -+ perm[nb_merge_cand++] = mvf_b1; -+ } -+ -+ // above right spatial merge candidate -+ // Never need mvf_b0 again so don't bother zeroing if navail -+ if ((avail & AVAIL_UR) != 0 && -+ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) && -+ mvf_b0->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_b0, mvf_b1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b0; -+ perm[nb_merge_cand++] = mvf_b0; -+ } -+ -+ // left bottom spatial merge candidate -+ // Never need mvf_a0 again so don't bother zeroing if navail -+ if ((avail & AVAIL_DL) != 0 && -+ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) && -+ mvf_a0->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_a0, mvf_a1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_a0; -+ perm[nb_merge_cand++] = mvf_a0; -+ } -+ -+ // above left spatial merge candidate -+ if (nb_merge_cand != 4 && -+ (avail & AVAIL_UL) != 0 && -+ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) -+ { -+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL -+ -+ if (mvf_b2->pred_flag != PF_INTRA && -+ !mvf_eq(mvf_b2, mvf_a1) && -+ !mvf_eq(mvf_b2, mvf_b1)) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_b2; -+ perm[nb_merge_cand++] = mvf_b2; -+ } -+ } -+ -+ // temporal motion vector candidate -+ if (s->sh.slice_temporal_mvp_enabled_flag) -+ { -+ static const HEVCRpiMvField mvf_z = {{0}}; -+ -+ *mvf_t = mvf_z; -+ -+ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, -+ 0, mvf_t->xy + 0, 0)) -+ 
mvf_t->pred_flag = PF_L0; -+ -+ if (s->sh.slice_type == HEVC_SLICE_B && -+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, -+ 0, mvf_t->xy + 1, 1)) -+ mvf_t->pred_flag |= PF_L1; -+ -+ if (mvf_t->pred_flag != 0) -+ { -+ if (merge_idx == nb_merge_cand) -+ return mvf_t; -+ perm[nb_merge_cand++] = mvf_t; -+ } -+ } -+ -+ // combined bi-predictive merge candidates (applies for B slices) -+ if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1) -+ { -+ unsigned int comb_idx = 0; -+ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1); -+ const RefPicList * const refPicList = s->refPicList; -+ -+ for (comb_idx = 0; comb_idx < cand_count; comb_idx++) -+ { -+ static const uint8_t l0_l1_cand_idx[12][2] = { -+ { 0, 1, }, -+ { 1, 0, }, -+ { 0, 2, }, -+ { 2, 0, }, -+ { 1, 2, }, -+ { 2, 1, }, -+ { 0, 3, }, -+ { 3, 0, }, -+ { 1, 3, }, -+ { 3, 1, }, -+ { 2, 3, }, -+ { 3, 2, }, -+ }; -+ -+ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0]; -+ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1]; -+ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx]; -+ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx]; -+ -+ if ((mvf_c0->pred_flag & PF_L0) != 0 && -+ (mvf_c1->pred_flag & PF_L1) != 0 && -+ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] || -+ mvf_c0->xy[0] != mvf_c1->xy[1])) -+ { -+ if (merge_idx == nb_merge_cand++) -+ { -+ // Need to be a bit careful as we will construct mvf_t and we -+ // may already be using that as one of our candidates -+ // so build & copy rather than build in place -+ const HEVCRpiMvField mvf_m = { -+ .xy = { -+ mvf_c0->xy[0], -+ mvf_c1->xy[1]}, -+ .ref_idx = { -+ mvf_c0->ref_idx[0], -+ mvf_c1->ref_idx[1]}, -+ .pred_flag = PF_BI -+ }; -+ *mvf_t = mvf_m; -+ return mvf_t; -+ } -+ } -+ } -+ } -+ -+ // "append" Zero motion vector candidates -+ { -+ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ? -+ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0]; -+ const unsigned int zero_idx = merge_idx - nb_merge_cand; -+ -+ const HEVCRpiMvField mvf_m = { -+ .xy = {0, 0}, -+ .ref_idx = { -+ zero_idx < nb_refs ? zero_idx : 0, -+ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0}, -+ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0 -+ }; -+ -+ *mvf_t = mvf_m; -+ return mvf_t; -+ } -+} -+ -+ -+// 8.5.3.1.1 Derivation process of luma Mvs for merge mode -+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, HEVCRpiMvField * const mv) -+{ -+ const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ? 
-+ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8, -+ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8), -+ 0, merge_idx, mv) : -+ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, -+ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), -+ part_idx, merge_idx, mv); -+ -+ if (mvf_m != mv) -+ *mv = *mvf_m; -+ -+ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) -+ mv->pred_flag = PF_L0; -+} -+ -+ -+static av_always_inline const MvXY * -+mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf) -+{ -+ if (mvf != NULL) -+ { -+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) -+ return mvf->xy + pfi0; -+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) -+ return mvf->xy + pfi1; -+ } -+ return NULL; -+} -+ -+static av_always_inline const MvXY * -+mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, -+ const int islt0, const int poc0, const int poc_cur, -+ MvXY * const mv_t, const HEVCRpiMvField * const mvf) -+{ -+ if (mvf != NULL) -+ { -+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0) -+ { -+ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]]; -+ if (islt0 || poc1 == poc0) { -+ return mvf->xy + pfi0; -+ } -+ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0); -+ return mv_t; -+ } -+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0) -+ { -+ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]]; -+ if (islt0 || poc1 == poc0) { -+ return mvf->xy + pfi1; -+ } -+ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0); -+ return mv_t; -+ } -+ } -+ return NULL; -+} -+ -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ HEVCRpiMvField * const mv, -+ const unsigned int mvp_lx_flag, const unsigned int LX) -+{ -+ const unsigned int pfi0 = LX; -+ const unsigned int pfi1 = LX == 0 ? 
1 : 0; -+ const RefPicList * const rpl = s->refPicList; -+ const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; -+ const int poc_cur = s->poc; -+ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]]; -+ -+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); -+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); -+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL -+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); -+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; -+ const MvXY * mva = NULL; -+ const MvXY * mvb; -+ MvXY * const mv_rv = mv->xy + LX; -+ MvXY mvt_a, mvt_b; -+ -+ *mv_rv = 0; -+ -+ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA) -+ mvf_a0 = NULL; -+ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0) -+ goto use_mva; -+ -+ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA) -+ mvf_a1 = NULL; -+ -+ if (mva == NULL && -+ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL && -+ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL) -+ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); -+ -+ if (mvp_lx_flag == 0 && mva != NULL) -+ goto use_mva; -+ -+ if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA) -+ mvf_b0 = NULL; -+ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA) -+ mvf_b1 = NULL; -+ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA) -+ mvf_b2 = NULL; -+ -+ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL && -+ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL) -+ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2); -+ -+ if (mvf_a0 == NULL && mvf_a1 == NULL) { -+ mva = mvb; -+ if (mvp_lx_flag == 0 && mva != NULL) -+ goto use_mva; -+ -+ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL && -+ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL) -+ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2); -+ } -+ -+ if (mva == NULL) { -+ mva = mvb; -+ mvb = NULL; -+ } -+ -+ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B -+ mvb = NULL; -+ -+ if (mvp_lx_flag == 0 && mva != NULL) { -+ goto use_mva; -+ } -+ else if (mvp_lx_flag != 0 && mvb != NULL) { -+ *mv_rv = *mvb; -+ } -+ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { -+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, -+ nPbH, mv->ref_idx[LX], -+ mv_rv, LX); -+ } -+ return; -+ -+use_mva: -+ *mv_rv = *mva; -+ return; -+} -+ -diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c -new file mode 100644 -index 0000000000..e58a59ce5e ---- /dev/null -+++ b/libavcodec/rpi_hevc_parse.c -@@ -0,0 +1,143 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "bytestream.h" -+#include "h2645_parse.h" -+#include "hevc.h" -+#include "rpi_hevc_parse.h" -+ -+static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps, -+ HEVCSEIContext *sei, int is_nalff, int nal_length_size, -+ int err_recognition, int apply_defdispwin, void *logctx) -+{ -+ int i; -+ int ret = 0; -+ H2645Packet pkt = { 0 }; -+ -+ ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff, -+ nal_length_size, AV_CODEC_ID_HEVC, 1, 0); -+ if (ret < 0) { -+ goto done; -+ } -+ -+ for (i = 0; i < pkt.nb_nals; i++) { -+ H2645NAL *nal = &pkt.nals[i]; -+ -+ /* ignore everything except parameter sets and VCL NALUs */ -+ switch (nal->type) { -+ case HEVC_NAL_VPS: -+ ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_SPS: -+ ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_PPS: -+ ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps); -+ if (ret < 0) -+ goto done; -+ break; -+ case HEVC_NAL_SEI_PREFIX: -+ case HEVC_NAL_SEI_SUFFIX: -+ ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type); -+ if (ret < 0) -+ goto done; -+ break; -+ default: -+ av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type); -+ break; -+ } -+ } -+ -+done: -+ ff_h2645_packet_uninit(&pkt); -+ if (err_recognition & AV_EF_EXPLODE) -+ return ret; -+ -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, -+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, -+ int err_recognition, int apply_defdispwin, void *logctx) -+{ -+ int ret = 0; -+ GetByteContext gb; -+ -+ bytestream2_init(&gb, data, size); -+ -+ if (size > 3 && (data[0] || data[1] || data[2] > 1)) { -+ /* It seems the extradata is encoded as hvcC format. -+ * Temporarily, we support configurationVersion==0 until 14496-15 3rd -+ * is finalized. When finalized, configurationVersion will be 1 and we -+ * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */ -+ int i, j, num_arrays, nal_len_size; -+ -+ *is_nalff = 1; -+ -+ bytestream2_skip(&gb, 21); -+ nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; -+ num_arrays = bytestream2_get_byte(&gb); -+ -+ /* nal units in the hvcC always have length coded with 2 bytes, -+ * so put a fake nal_length_size = 2 while parsing them */ -+ *nal_length_size = 2; -+ -+ /* Decode nal units from hvcC. 
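-+ * Each array begins with a byte whose low 6 bits give the NAL type,
-+ * then a 16-bit unit count, then that many NAL units, each prefixed
-+ * by a 16-bit length (hence the fake nal_length_size of 2 set above).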
*/ -+ for (i = 0; i < num_arrays; i++) { -+ int type = bytestream2_get_byte(&gb) & 0x3f; -+ int cnt = bytestream2_get_be16(&gb); -+ -+ for (j = 0; j < cnt; j++) { -+ // +2 for the nal size field -+ int nalsize = bytestream2_peek_be16(&gb) + 2; -+ if (bytestream2_get_bytes_left(&gb) < nalsize) { -+ av_log(logctx, AV_LOG_ERROR, -+ "Invalid NAL unit size in extradata.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff, -+ *nal_length_size, err_recognition, apply_defdispwin, -+ logctx); -+ if (ret < 0) { -+ av_log(logctx, AV_LOG_ERROR, -+ "Decoding nal unit %d %d from hvcC failed\n", -+ type, i); -+ return ret; -+ } -+ bytestream2_skip(&gb, nalsize); -+ } -+ } -+ -+ /* Now store right nal length size, that will be used to parse -+ * all other nals */ -+ *nal_length_size = nal_len_size; -+ } else { -+ *is_nalff = 0; -+ ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size, -+ err_recognition, apply_defdispwin, logctx); -+ if (ret < 0) -+ return ret; -+ } -+ -+ return ret; -+} -diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h -new file mode 100644 -index 0000000000..4b4d032a16 ---- /dev/null -+++ b/libavcodec/rpi_hevc_parse.h -@@ -0,0 +1,36 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * H.265 parser code -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_PARSE_H -+#define AVCODEC_RPI_HEVC_PARSE_H -+ -+#include -+ -+#include "rpi_hevc_ps.h" -+#include "rpi_hevc_sei.h" -+ -+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, -+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, -+ int err_recognition, int apply_defdispwin, void *logctx); -+ -+#endif /* AVCODEC_RPI_HEVC_PARSE_H */ -diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c -new file mode 100644 -index 0000000000..f4e31f7d1d ---- /dev/null -+++ b/libavcodec/rpi_hevc_ps.c -@@ -0,0 +1,1938 @@ -+/* -+ * HEVC Parameter Set decoding -+ * -+ * Copyright (C) 2012 - 2103 Guillaume Martres -+ * Copyright (C) 2012 - 2103 Mickael Raulet -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2013 Vittorio Giovara -+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/imgutils.h" -+#include "golomb.h" -+#include "rpi_hevc_data.h" -+#include "rpi_hevc_ps.h" -+#include "rpi_hevcdec.h" -+ -+static const uint8_t default_scaling_list_intra[] = { -+ 16, 16, 16, 16, 17, 18, 21, 24, -+ 16, 16, 16, 16, 17, 19, 22, 25, -+ 16, 16, 17, 18, 20, 22, 25, 29, -+ 16, 16, 18, 21, 24, 27, 31, 36, -+ 17, 17, 20, 24, 30, 35, 41, 47, -+ 18, 19, 22, 27, 35, 44, 54, 65, -+ 21, 22, 25, 31, 41, 54, 70, 88, -+ 24, 25, 29, 36, 47, 65, 88, 115 -+}; -+ -+static const uint8_t default_scaling_list_inter[] = { -+ 16, 16, 16, 16, 17, 18, 20, 24, -+ 16, 16, 16, 17, 18, 20, 24, 25, -+ 16, 16, 17, 18, 20, 24, 25, 28, -+ 16, 17, 18, 20, 24, 25, 28, 33, -+ 17, 18, 20, 24, 25, 28, 33, 41, -+ 18, 20, 24, 25, 28, 33, 41, 54, -+ 20, 24, 25, 28, 33, 41, 54, 71, -+ 24, 25, 28, 33, 41, 54, 71, 91 -+}; -+ -+static const AVRational vui_sar[] = { -+ { 0, 1 }, -+ { 1, 1 }, -+ { 12, 11 }, -+ { 10, 11 }, -+ { 16, 11 }, -+ { 40, 33 }, -+ { 24, 11 }, -+ { 20, 11 }, -+ { 32, 11 }, -+ { 80, 33 }, -+ { 18, 11 }, -+ { 15, 11 }, -+ { 64, 33 }, -+ { 160, 99 }, -+ { 4, 3 }, -+ { 3, 2 }, -+ { 2, 1 }, -+}; -+ -+ -+// pps_cb_qp_offset: -12,+12 -+// slice_cb_qp_offset: -12,+12 also -+// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive." -+// cr_qp_offset_list[n]: -12,+12 -+// So worst case total offset: -24,+24 -+ -+#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6) -+#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n)) -+#define M(B,n) C(B,(-n)) -+ -+// Sizeof the QP_START_BLOCK -+#define QP_OFFSET_0 (8*6 + 12*2) -+#define QP_START(B) \ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ -+\ -+ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\ -+ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\ -+ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\ -+ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\ -+ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\ -+ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\ -+ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\ -+ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1) -+#define QP_END(B) \ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51) -+ -+#define T1(B)\ -+{\ -+ QP_START(B),\ -+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ -+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ -+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ -+ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\ -+ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\ -+ C(B,44), C(B,45),\ -+ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\ -+ QP_END(B)\ -+} -+#define T0(B)\ -+{\ -+ QP_START(B),\ -+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ -+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), 
C(B,18), C(B,19),\ -+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ -+ C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\ -+ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\ -+ C(B,50), C(B,51),\ -+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ -+ QP_END(B)\ -+} -+ -+#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2) -+ -+static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)}; -+static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)}; -+ -+#undef T -+#undef C -+#undef QP_END -+ -+#define C(B,n) ((n)<0?0:(n)>51?51:(n)) -+// We do need a lot of -ve padding to cope with high bit depths that give -ve qps -+#define QP_DBLK_OFFSET_0 QP_OFFSET_0 -+#define QP_END(B)\ -+ 51, 51, 51, 51, 51, 51 -+ -+// These don't need all the padding we have here (12 top/bottom would be enough) -+static const uint8_t qp_c_dblk_0[] = T0(0); -+static const uint8_t qp_c_dblk_1[] = T1(0); -+ -+#undef T -+#undef M -+#undef C -+#undef QP_END -+#undef QP_START -+ -+ -+static void remove_pps(HEVCRpiParamSets * const s, const int id) -+{ -+ if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data) -+ s->pps = NULL; -+ av_buffer_unref(&s->pps_list[id]); -+} -+ -+static void remove_sps(HEVCRpiParamSets * const s, const int id) -+{ -+ int i; -+ if (s->sps_list[id]) { -+ if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data) -+ s->sps = NULL; -+ -+ /* drop all PPS that depend on this SPS */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) -+ if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id) -+ remove_pps(s, i); -+ -+ av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); -+ } -+ av_buffer_unref(&s->sps_list[id]); -+} -+ -+static void remove_vps(HEVCRpiParamSets * const s, const int id) -+{ -+ int i; -+ if (s->vps_list[id]) { -+ if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data) -+ s->vps = NULL; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++) -+ if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id) -+ remove_sps(s, i); -+ } -+ av_buffer_unref(&s->vps_list[id]); -+} -+ -+int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx, -+ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header) -+{ -+ uint8_t rps_predict = 0; -+ int delta_poc; -+ int k0 = 0; -+ int k1 = 0; -+ int k = 0; -+ int i; -+ -+ if (rps != sps->st_rps && sps->nb_st_rps) -+ rps_predict = get_bits1(gb); -+ -+ if (rps_predict) { -+ const ShortTermRPS *rps_ridx; -+ int delta_rps; -+ unsigned abs_delta_rps; -+ uint8_t use_delta_flag = 0; -+ uint8_t delta_rps_sign; -+ -+ if (is_slice_header) { -+ unsigned int delta_idx = get_ue_golomb_long(gb) + 1; -+ if (delta_idx > sps->nb_st_rps) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_idx in slice header RPS: %d > %d.\n", -+ delta_idx, sps->nb_st_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx]; -+ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs; -+ } else -+ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; -+ -+ delta_rps_sign = get_bits1(gb); -+ abs_delta_rps = get_ue_golomb_long(gb) + 1; -+ if (abs_delta_rps < 1 || abs_delta_rps > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of abs_delta_rps: %d\n", -+ abs_delta_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ 
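-+ /* Fold sign and magnitude into a signed offset: delta_rps_sign of 1
-+ * selects -abs_delta_rps, 0 selects +abs_delta_rps; every POC delta
-+ * inherited from rps_ridx below is shifted by this amount. */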
delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps; -+ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { -+ int used = rps->used[k] = get_bits1(gb); -+ -+ if (!used) -+ use_delta_flag = get_bits1(gb); -+ -+ if (used || use_delta_flag) { -+ if (i < rps_ridx->num_delta_pocs) -+ delta_poc = delta_rps + rps_ridx->delta_poc[i]; -+ else -+ delta_poc = delta_rps; -+ rps->delta_poc[k] = delta_poc; -+ if (delta_poc < 0) -+ k0++; -+ else -+ k1++; -+ k++; -+ } -+ } -+ -+ if (k >= FF_ARRAY_ELEMS(rps->used)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid num_delta_pocs: %d\n", k); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ rps->num_delta_pocs = k; -+ rps->num_negative_pics = k0; -+ // sort in increasing order (smallest first) -+ if (rps->num_delta_pocs != 0) { -+ int used, tmp; -+ for (i = 1; i < rps->num_delta_pocs; i++) { -+ delta_poc = rps->delta_poc[i]; -+ used = rps->used[i]; -+ for (k = i - 1; k >= 0; k--) { -+ tmp = rps->delta_poc[k]; -+ if (delta_poc < tmp) { -+ rps->delta_poc[k + 1] = tmp; -+ rps->used[k + 1] = rps->used[k]; -+ rps->delta_poc[k] = delta_poc; -+ rps->used[k] = used; -+ } -+ } -+ } -+ } -+ if ((rps->num_negative_pics >> 1) != 0) { -+ int used; -+ k = rps->num_negative_pics - 1; -+ // flip the negative values to largest first -+ for (i = 0; i < rps->num_negative_pics >> 1; i++) { -+ delta_poc = rps->delta_poc[i]; -+ used = rps->used[i]; -+ rps->delta_poc[i] = rps->delta_poc[k]; -+ rps->used[i] = rps->used[k]; -+ rps->delta_poc[k] = delta_poc; -+ rps->used[k] = used; -+ k--; -+ } -+ } -+ } else { -+ unsigned int prev, nb_positive_pics; -+ rps->num_negative_pics = get_ue_golomb_long(gb); -+ nb_positive_pics = get_ue_golomb_long(gb); -+ -+ if (rps->num_negative_pics >= HEVC_MAX_REFS || -+ nb_positive_pics >= HEVC_MAX_REFS) { -+ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics; -+ if (rps->num_delta_pocs) { -+ prev = 0; -+ for (i = 0; i < rps->num_negative_pics; i++) { -+ delta_poc = get_ue_golomb_long(gb) + 1; -+ if (delta_poc < 1 || delta_poc > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_poc: %d\n", -+ delta_poc); -+ return AVERROR_INVALIDDATA; -+ } -+ prev -= delta_poc; -+ rps->delta_poc[i] = prev; -+ rps->used[i] = get_bits1(gb); -+ } -+ prev = 0; -+ for (i = 0; i < nb_positive_pics; i++) { -+ delta_poc = get_ue_golomb_long(gb) + 1; -+ if (delta_poc < 1 || delta_poc > 32768) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid value of delta_poc: %d\n", -+ delta_poc); -+ return AVERROR_INVALIDDATA; -+ } -+ prev += delta_poc; -+ rps->delta_poc[rps->num_negative_pics + i] = prev; -+ rps->used[rps->num_negative_pics + i] = get_bits1(gb); -+ } -+ } -+ } -+ return 0; -+} -+ -+ -+static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx, -+ PTLCommon * const ptl) -+{ -+ int i; -+ -+ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) -+ return -1; -+ -+ ptl->profile_space = get_bits(gb, 2); -+ ptl->tier_flag = get_bits1(gb); -+ ptl->profile_idc = get_bits(gb, 5); -+ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN) -+ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10) -+ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE) -+ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n"); -+ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT) -+ av_log(avctx, 
AV_LOG_DEBUG, "Range Extension profile bitstream\n"); -+ else -+ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc); -+ -+ for (i = 0; i < 32; i++) { -+ ptl->profile_compatibility_flag[i] = get_bits1(gb); -+ -+ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) -+ ptl->profile_idc = i; -+ } -+ ptl->progressive_source_flag = get_bits1(gb); -+ ptl->interlaced_source_flag = get_bits1(gb); -+ ptl->non_packed_constraint_flag = get_bits1(gb); -+ ptl->frame_only_constraint_flag = get_bits1(gb); -+ -+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15] -+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31] -+ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43] -+ -+ return 0; -+} -+ -+static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx, -+ PTL * const ptl, const int max_num_sub_layers) -+{ -+ int i; -+ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 || -+ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { -+ av_log(avctx, AV_LOG_ERROR, "PTL information too short\n"); -+ return -1; -+ } -+ -+ ptl->general_ptl.level_idc = get_bits(gb, 8); -+ -+ for (i = 0; i < max_num_sub_layers - 1; i++) { -+ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb); -+ ptl->sub_layer_level_present_flag[i] = get_bits1(gb); -+ } -+ -+ if (max_num_sub_layers - 1> 0) -+ for (i = max_num_sub_layers - 1; i < 8; i++) -+ skip_bits(gb, 2); // reserved_zero_2bits[i] -+ for (i = 0; i < max_num_sub_layers - 1; i++) { -+ if (ptl->sub_layer_profile_present_flag[i] && -+ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "PTL information for sublayer %i too short\n", i); -+ return -1; -+ } -+ if (ptl->sub_layer_level_present_flag[i]) { -+ if (get_bits_left(gb) < 8) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Not enough data for sublayer %i level_idc\n", i); -+ return -1; -+ } else -+ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8); -+ } -+ } -+ -+ return 0; -+} -+ -+static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb, -+ const int subpic_params_present) -+{ -+ int i; -+ -+ for (i = 0; i < nb_cpb; i++) { -+ get_ue_golomb_long(gb); // bit_rate_value_minus1 -+ get_ue_golomb_long(gb); // cpb_size_value_minus1 -+ -+ if (subpic_params_present) { -+ get_ue_golomb_long(gb); // cpb_size_du_value_minus1 -+ get_ue_golomb_long(gb); // bit_rate_du_value_minus1 -+ } -+ skip_bits1(gb); // cbr_flag -+ } -+} -+ -+static int decode_hrd(GetBitContext * const gb, const int common_inf_present, -+ const int max_sublayers) -+{ -+ int nal_params_present = 0, vcl_params_present = 0; -+ int subpic_params_present = 0; -+ int i; -+ -+ if (common_inf_present) { -+ nal_params_present = get_bits1(gb); -+ vcl_params_present = get_bits1(gb); -+ -+ if (nal_params_present || vcl_params_present) { -+ subpic_params_present = get_bits1(gb); -+ -+ if (subpic_params_present) { -+ skip_bits(gb, 8); // tick_divisor_minus2 -+ skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1 -+ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag -+ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1 -+ } -+ -+ skip_bits(gb, 4); // bit_rate_scale -+ skip_bits(gb, 4); // cpb_size_scale -+ -+ if (subpic_params_present) -+ skip_bits(gb, 4); // cpb_size_du_scale -+ -+ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1 -+ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1 -+ skip_bits(gb, 5); // dpb_output_delay_length_minus1 -+ } -+ } -+ -+ for (i = 0; i < 
max_sublayers; i++) { -+ int low_delay = 0; -+ unsigned int nb_cpb = 1; -+ int fixed_rate = get_bits1(gb); -+ -+ if (!fixed_rate) -+ fixed_rate = get_bits1(gb); -+ -+ if (fixed_rate) -+ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1 -+ else -+ low_delay = get_bits1(gb); -+ -+ if (!low_delay) { -+ nb_cpb = get_ue_golomb_long(gb) + 1; -+ if (nb_cpb < 1 || nb_cpb > 32) { -+ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ if (nal_params_present) -+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); -+ if (vcl_params_present) -+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); -+ } -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiParamSets * const ps) -+{ -+ int i,j; -+ int vps_id = 0; -+ ptrdiff_t nal_size; -+ HEVCRpiVPS *vps; -+ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps)); -+ -+ if (!vps_buf) -+ return AVERROR(ENOMEM); -+ vps = (HEVCRpiVPS*)vps_buf->data; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(vps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(vps->data)); -+ vps->data_size = sizeof(vps->data); -+ } else { -+ vps->data_size = nal_size; -+ } -+ memcpy(vps->data, gb->buffer, vps->data_size); -+ -+ vps_id = get_bits(gb, 4); -+ if (vps_id >= HEVC_MAX_VPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id); -+ goto err; -+ } -+ -+ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits -+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n"); -+ goto err; -+ } -+ -+ vps->vps_max_layers = get_bits(gb, 6) + 1; -+ vps->vps_max_sub_layers = get_bits(gb, 3) + 1; -+ vps->vps_temporal_id_nesting_flag = get_bits1(gb); -+ -+ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits -+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n"); -+ goto err; -+ } -+ -+ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) { -+ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n", -+ vps->vps_max_sub_layers); -+ goto err; -+ } -+ -+ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0) -+ goto err; -+ -+ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb); -+ -+ i = vps->vps_sub_layer_ordering_info_present_flag ? 
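-+ /* when the flag is 0 only the highest sub-layer's values are present
-+ * in the bitstream, so the loop below starts there */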
0 : vps->vps_max_sub_layers - 1; -+ for (; i < vps->vps_max_sub_layers; i++) { -+ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1; -+ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb); -+ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1; -+ -+ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) { -+ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n", -+ vps->vps_max_dec_pic_buffering[i] - 1); -+ goto err; -+ } -+ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { -+ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n", -+ vps->vps_num_reorder_pics[i]); -+ if (avctx->err_recognition & AV_EF_EXPLODE) -+ goto err; -+ } -+ } -+ -+ vps->vps_max_layer_id = get_bits(gb, 6); -+ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1; -+ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 || -+ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { -+ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n"); -+ goto err; -+ } -+ -+ for (i = 1; i < vps->vps_num_layer_sets; i++) -+ for (j = 0; j <= vps->vps_max_layer_id; j++) -+ skip_bits(gb, 1); // layer_id_included_flag[i][j] -+ -+ vps->vps_timing_info_present_flag = get_bits1(gb); -+ if (vps->vps_timing_info_present_flag) { -+ vps->vps_num_units_in_tick = get_bits_long(gb, 32); -+ vps->vps_time_scale = get_bits_long(gb, 32); -+ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb); -+ if (vps->vps_poc_proportional_to_timing_flag) -+ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1; -+ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb); -+ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) { -+ av_log(avctx, AV_LOG_ERROR, -+ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters); -+ goto err; -+ } -+ for (i = 0; i < vps->vps_num_hrd_parameters; i++) { -+ int common_inf_present = 1; -+ -+ get_ue_golomb_long(gb); // hrd_layer_set_idx -+ if (i) -+ common_inf_present = get_bits1(gb); -+ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); -+ } -+ } -+ get_bits1(gb); /* vps_extension_flag */ -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread VPS by %d bits\n", -get_bits_left(gb)); -+ if (ps->vps_list[vps_id]) -+ goto err; -+ } -+ -+ if (ps->vps_list[vps_id] && -+ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) { -+ av_buffer_unref(&vps_buf); -+ } else { -+ remove_vps(ps, vps_id); -+ ps->vps_list[vps_id] = vps_buf; -+ } -+ -+ return 0; -+ -+err: -+ av_buffer_unref(&vps_buf); -+ return AVERROR_INVALIDDATA; -+} -+ -+static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx, -+ const int apply_defdispwin, HEVCRpiSPS * const sps) -+{ -+ VUI backup_vui, * const vui = &sps->vui; -+ GetBitContext backup; -+ int sar_present, alt = 0; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n"); -+ -+ sar_present = get_bits1(gb); -+ if (sar_present) { -+ uint8_t sar_idx = get_bits(gb, 8); -+ if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) -+ vui->sar = vui_sar[sar_idx]; -+ else if (sar_idx == 255) { -+ vui->sar.num = get_bits(gb, 16); -+ vui->sar.den = get_bits(gb, 16); -+ } else -+ av_log(avctx, AV_LOG_WARNING, -+ "Unknown SAR index: %u.\n", sar_idx); -+ } -+ -+ vui->overscan_info_present_flag = get_bits1(gb); -+ if (vui->overscan_info_present_flag) -+ vui->overscan_appropriate_flag = get_bits1(gb); -+ -+ vui->video_signal_type_present_flag = 
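-+ /* video_format, full-range flag and colour description follow; these
-+ * may remap pix_fmt (YUVJ420P for full range, GBRP for RGB matrices) */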
get_bits1(gb); -+ if (vui->video_signal_type_present_flag) { -+ vui->video_format = get_bits(gb, 3); -+ vui->video_full_range_flag = get_bits1(gb); -+ vui->colour_description_present_flag = get_bits1(gb); -+ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) -+ sps->pix_fmt = AV_PIX_FMT_YUVJ420P; -+ if (vui->colour_description_present_flag) { -+ vui->colour_primaries = get_bits(gb, 8); -+ vui->transfer_characteristic = get_bits(gb, 8); -+ vui->matrix_coeffs = get_bits(gb, 8); -+ -+ // Set invalid values to "unspecified" -+ if (!av_color_primaries_name(vui->colour_primaries)) -+ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; -+ if (!av_color_transfer_name(vui->transfer_characteristic)) -+ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED; -+ if (!av_color_space_name(vui->matrix_coeffs)) -+ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; -+ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { -+ switch (sps->pix_fmt) { -+ case AV_PIX_FMT_YUV444P: -+ sps->pix_fmt = AV_PIX_FMT_GBRP; -+ break; -+ case AV_PIX_FMT_YUV444P10: -+ sps->pix_fmt = AV_PIX_FMT_GBRP10; -+ break; -+ case AV_PIX_FMT_YUV444P12: -+ sps->pix_fmt = AV_PIX_FMT_GBRP12; -+ break; -+ } -+ } -+ } -+ } -+ -+ vui->chroma_loc_info_present_flag = get_bits1(gb); -+ if (vui->chroma_loc_info_present_flag) { -+ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb); -+ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb); -+ } -+ -+ vui->neutra_chroma_indication_flag = get_bits1(gb); -+ vui->field_seq_flag = get_bits1(gb); -+ vui->frame_field_info_present_flag = get_bits1(gb); -+ -+ // Backup context in case an alternate header is detected -+ memcpy(&backup, gb, sizeof(backup)); -+ memcpy(&backup_vui, vui, sizeof(backup_vui)); -+ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { -+ vui->default_display_window_flag = 0; -+ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n"); -+ } else -+ vui->default_display_window_flag = get_bits1(gb); -+ -+ if (vui->default_display_window_flag) { -+ int vert_mult = 1 + (sps->chroma_format_idc < 2); -+ int horiz_mult = 1 + (sps->chroma_format_idc < 3); -+ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; -+ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; -+ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult; -+ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; -+ -+ if (apply_defdispwin && -+ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "discarding vui default display window, " -+ "original values are l:%u r:%u t:%u b:%u\n", -+ vui->def_disp_win.left_offset, -+ vui->def_disp_win.right_offset, -+ vui->def_disp_win.top_offset, -+ vui->def_disp_win.bottom_offset); -+ -+ vui->def_disp_win.left_offset = -+ vui->def_disp_win.right_offset = -+ vui->def_disp_win.top_offset = -+ vui->def_disp_win.bottom_offset = 0; -+ } -+ } -+ -+timing_info: -+ vui->vui_timing_info_present_flag = get_bits1(gb); -+ -+ if (vui->vui_timing_info_present_flag) { -+ if( get_bits_left(gb) < 66 && !alt) { -+ // The alternate syntax seem to have timing info located -+ // at where def_disp_win is normally located -+ av_log(avctx, AV_LOG_WARNING, -+ "Strange VUI timing information, retrying...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+ vui->vui_num_units_in_tick = get_bits_long(gb, 32); -+ vui->vui_time_scale = get_bits_long(gb, 32); -+ if (alt) { -+ av_log(avctx, AV_LOG_INFO, 
"Retry got %"PRIu32"/%"PRIu32" fps\n", -+ vui->vui_time_scale, vui->vui_num_units_in_tick); -+ } -+ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb); -+ if (vui->vui_poc_proportional_to_timing_flag) -+ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb); -+ vui->vui_hrd_parameters_present_flag = get_bits1(gb); -+ if (vui->vui_hrd_parameters_present_flag) -+ decode_hrd(gb, 1, sps->max_sub_layers); -+ } -+ -+ vui->bitstream_restriction_flag = get_bits1(gb); -+ if (vui->bitstream_restriction_flag) { -+ if (get_bits_left(gb) < 8 && !alt) { -+ av_log(avctx, AV_LOG_WARNING, -+ "Strange VUI bitstream restriction information, retrying" -+ " from timing information...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+ vui->tiles_fixed_structure_flag = get_bits1(gb); -+ vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb); -+ vui->restricted_ref_pic_lists_flag = get_bits1(gb); -+ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb); -+ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb); -+ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb); -+ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb); -+ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb); -+ } -+ -+ if (get_bits_left(gb) < 1 && !alt) { -+ // XXX: Alternate syntax when sps_range_extension_flag != 0? -+ av_log(avctx, AV_LOG_WARNING, -+ "Overread in VUI, retrying from timing information...\n"); -+ memcpy(vui, &backup_vui, sizeof(backup_vui)); -+ memcpy(gb, &backup, sizeof(backup)); -+ alt = 1; -+ goto timing_info; -+ } -+} -+ -+static void set_default_scaling_list_data(ScalingList * const sl) -+{ -+ int matrixId; -+ -+ for (matrixId = 0; matrixId < 6; matrixId++) { -+ // 4x4 default is 16 -+ memset(sl->sl[0][matrixId], 16, 16); -+ sl->sl_dc[0][matrixId] = 16; // default for 16x16 -+ sl->sl_dc[1][matrixId] = 16; // default for 32x32 -+ } -+ -+ memcpy(sl->sl[1][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[1][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[1][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[1][3], default_scaling_list_inter, 64); -+ memcpy(sl->sl[1][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[1][5], default_scaling_list_inter, 64); -+ -+ memcpy(sl->sl[2][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[2][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[2][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[2][3], default_scaling_list_inter, 64); -+ memcpy(sl->sl[2][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[2][5], default_scaling_list_inter, 64); -+ -+ memcpy(sl->sl[3][0], default_scaling_list_intra, 64); -+ memcpy(sl->sl[3][1], default_scaling_list_intra, 64); -+ memcpy(sl->sl[3][2], default_scaling_list_intra, 64); -+ -+ memcpy(sl->sl[3][3], default_scaling_list_inter, 64); -+ memcpy(sl->sl[3][4], default_scaling_list_inter, 64); -+ memcpy(sl->sl[3][5], default_scaling_list_inter, 64); -+} -+ -+static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl, -+ const HEVCRpiSPS * const sps) -+{ -+ uint8_t scaling_list_pred_mode_flag; -+ int32_t scaling_list_dc_coef[2][6]; -+ int size_id, matrix_id, pos; -+ int i; -+ -+ for (size_id = 0; size_id < 4; size_id++) -+ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 
3 : 1)) { -+ scaling_list_pred_mode_flag = get_bits1(gb); -+ if (!scaling_list_pred_mode_flag) { -+ unsigned int delta = get_ue_golomb_long(gb); -+ /* Only need to handle non-zero delta. Zero means default, -+ * which should already be in the arrays. */ -+ if (delta) { -+ // Copy from previous array. -+ delta *= (size_id == 3) ? 3 : 1; -+ if (matrix_id < delta) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Invalid delta in scaling list data: %d.\n", delta); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ memcpy(sl->sl[size_id][matrix_id], -+ sl->sl[size_id][matrix_id - delta], -+ size_id > 0 ? 64 : 16); -+ if (size_id > 1) -+ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta]; -+ } -+ } else { -+ int next_coef, coef_num; -+ int32_t scaling_list_delta_coef; -+ -+ next_coef = 8; -+ coef_num = FFMIN(64, 1 << (4 + (size_id << 1))); -+ if (size_id > 1) { -+ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8; -+ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id]; -+ sl->sl_dc[size_id - 2][matrix_id] = next_coef; -+ } -+ for (i = 0; i < coef_num; i++) { -+ if (size_id == 0) -+ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] + -+ ff_hevc_rpi_diag_scan4x4_x[i]; -+ else -+ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] + -+ ff_hevc_rpi_diag_scan8x8_x[i]; -+ -+ scaling_list_delta_coef = get_se_golomb(gb); -+ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256; -+ sl->sl[size_id][matrix_id][pos] = next_coef; -+ } -+ } -+ } -+ -+ if (sps->chroma_format_idc == 3) { -+ for (i = 0; i < 64; i++) { -+ sl->sl[3][1][i] = sl->sl[2][1][i]; -+ sl->sl[3][2][i] = sl->sl[2][2][i]; -+ sl->sl[3][4][i] = sl->sl[2][4][i]; -+ sl->sl[3][5][i] = sl->sl[2][5][i]; -+ } -+ sl->sl_dc[1][1] = sl->sl_dc[0][1]; -+ sl->sl_dc[1][2] = sl->sl_dc[0][2]; -+ sl->sl_dc[1][4] = sl->sl_dc[0][4]; -+ sl->sl_dc[1][5] = sl->sl_dc[0][5]; -+ } -+ -+ -+ return 0; -+} -+ -+static int map_pixel_format(HEVCRpiSPS * const sps) -+{ -+ const int cfmt = sps->chroma_format_idc; -+ -+ sps->pix_fmt = AV_PIX_FMT_NONE; -+ switch (sps->bit_depth) { -+ case 8: -+ if (cfmt == 1) -+ sps->pix_fmt = AV_PIX_FMT_SAND128; -+ break; -+ case 10: -+ if (cfmt == 1) -+ sps->pix_fmt = AV_PIX_FMT_SAND64_10; -+ break; -+ default: -+ break; -+ } -+ -+ sps->hshift[0] = sps->vshift[0] = 0; -+ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4 -+ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2 -+ -+ sps->pixel_shift = sps->bit_depth > 8 ? 
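-+ /* log2 of the sample storage width in bytes: >8-bit depths use 2 */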
1 : 0; -+ -+ return 0; -+} -+ -+static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id, -+ const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx) -+{ -+ HEVCRpiWindow *ow; -+ int ret = 0; -+ int log2_diff_max_min_transform_block_size; -+ int bit_depth_chroma, start, vui_present, sublayer_ordering_info; -+ int i; -+ -+ // Coded parameters -+ -+ sps->vps_id = get_bits(gb, 4); -+ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (vps_list && !vps_list[sps->vps_id]) { -+ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n", -+ sps->vps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->max_sub_layers = get_bits(gb, 3) + 1; -+ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) { -+ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n", -+ sps->max_sub_layers); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->temporal_id_nesting_flag = get_bits(gb, 1); -+ -+ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0) -+ return ret; -+ -+ *sps_id = get_ue_golomb_long(gb); -+ if (*sps_id >= HEVC_MAX_SPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->chroma_format_idc = get_ue_golomb_long(gb); -+ if (sps->chroma_format_idc > 3U) { -+ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->chroma_format_idc == 3) -+ sps->separate_colour_plane_flag = get_bits1(gb); -+ -+ if (sps->separate_colour_plane_flag) -+ sps->chroma_format_idc = 0; -+ -+ sps->width = get_ue_golomb_long(gb); -+ sps->height = get_ue_golomb_long(gb); -+ if ((ret = av_image_check_size(sps->width, -+ sps->height, 0, avctx)) < 0) -+ return ret; -+ -+ if (get_bits1(gb)) { // pic_conformance_flag -+ int vert_mult = 1 + (sps->chroma_format_idc < 2); -+ int horiz_mult = 1 + (sps->chroma_format_idc < 3); -+ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; -+ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; -+ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult; -+ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; -+ -+ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "discarding sps conformance window, " -+ "original values are l:%u r:%u t:%u b:%u\n", -+ sps->pic_conf_win.left_offset, -+ sps->pic_conf_win.right_offset, -+ sps->pic_conf_win.top_offset, -+ sps->pic_conf_win.bottom_offset); -+ -+ sps->pic_conf_win.left_offset = -+ sps->pic_conf_win.right_offset = -+ sps->pic_conf_win.top_offset = -+ sps->pic_conf_win.bottom_offset = 0; -+ } -+ sps->output_window = sps->pic_conf_win; -+ } -+ -+ sps->bit_depth = get_ue_golomb_long(gb) + 8; -+ bit_depth_chroma = get_ue_golomb_long(gb) + 8; -+ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Luma bit depth (%d) is different from chroma bit depth (%d), " -+ "this is unsupported.\n", -+ sps->bit_depth, bit_depth_chroma); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ ret = map_pixel_format(sps); -+ if (ret < 0) -+ return ret; -+ -+ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4; -+ if (sps->log2_max_poc_lsb > 16) { -+ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n", -+ sps->log2_max_poc_lsb - 4); -+ return 
AVERROR_INVALIDDATA; -+ } -+ -+ sublayer_ordering_info = get_bits1(gb); -+ start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1; -+ for (i = start; i < sps->max_sub_layers; i++) { -+ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1; -+ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb); -+ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1; -+ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) { -+ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n", -+ sps->temporal_layer[i].max_dec_pic_buffering - 1U); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) { -+ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n", -+ sps->temporal_layer[i].num_reorder_pics); -+ if (avctx->err_recognition & AV_EF_EXPLODE || -+ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) { -+ return AVERROR_INVALIDDATA; -+ } -+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1; -+ } -+ } -+ -+ if (!sublayer_ordering_info) { -+ for (i = 0; i < start; i++) { -+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering; -+ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics; -+ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase; -+ } -+ } -+ -+ sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3; -+ sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb); -+ sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2; -+ log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb); -+ sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size + -+ sps->log2_min_tb_size; -+ -+ if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->log2_diff_max_min_coding_block_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ { -+ const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size; -+ // Not a bitstream limitation, but all profiles -+ if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // Inferred parameters -+ sps->log2_ctb_size = CtbLog2SizeY; -+// sps->log2_min_pu_size = sps->log2_min_cb_size - 1; -+ } -+ -+ sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb); -+ 
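-+ /* maximum transform-tree splitting depth for inter and intra coded
-+ * blocks; both values are range-checked against
-+ * log2_ctb_size - log2_min_tb_size further down */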
sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb); -+ -+ sps->scaling_list_enable_flag = get_bits1(gb); -+ if (sps->scaling_list_enable_flag) { -+ set_default_scaling_list_data(&sps->scaling_list); -+ -+ if (get_bits1(gb)) { -+ ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps); -+ if (ret < 0) -+ return ret; -+ } -+ } -+ -+ sps->amp_enabled_flag = get_bits1(gb); -+ sps->sao_enabled = get_bits1(gb); -+ -+ // Set pcm defaults (0) so we don't have to test _enabled when we -+ // want to use them -+ memset(&sps->pcm, 0, sizeof(sps->pcm)); -+ -+ if (get_bits1(gb)) // pcm_enabled_flag -+ { -+ const unsigned int limit_max_pcm = FFMIN(5, -+ sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size); -+ sps->pcm.bit_depth = get_bits(gb, 4) + 1; -+ sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1; -+ sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3; -+ sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size + -+ get_ue_golomb_long(gb); -+ if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) { -+ av_log(avctx, AV_LOG_ERROR, -+ "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n", -+ sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size || -+ sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) { -+ av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)", -+ sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sps->pcm.loop_filter_disable_flag = get_bits1(gb); -+ } -+ -+ // Could be based on min_pcm_cb_size but much easier logic if we just stick -+ // with 8 (and costs us little) -+ sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up -+ sps->pcm_height = (sps->height + 7) >> 3; -+ -+ sps->nb_st_rps = get_ue_golomb_long(gb); -+ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) { -+ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n", -+ sps->nb_st_rps); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < sps->nb_st_rps; i++) { -+ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i], -+ sps, 0)) < 0) -+ return ret; -+ } -+ -+ sps->long_term_ref_pics_present_flag = get_bits1(gb); -+ if (sps->long_term_ref_pics_present_flag) { -+ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); -+ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { -+ av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n", -+ sps->num_long_term_ref_pics_sps); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) { -+ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb); -+ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb); -+ } -+ } -+ -+ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb); -+ sps->intra_filters_disable = get_bits1(gb) ? 
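-+ /* sps_strong_intra_smoothing_enable_flag folded into a disable mask:
-+ * flag set leaves everything enabled (0), flag clear disables the
-+ * strong smoothing filter */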
0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag -+ sps->vui.sar = (AVRational){0, 1}; -+ vui_present = get_bits1(gb); -+ if (vui_present) -+ decode_vui(gb, avctx, apply_defdispwin, sps); -+ -+ if (get_bits1(gb)) { // sps_extension_flag -+ int sps_extension_flag[1]; -+ for (i = 0; i < 1; i++) -+ sps_extension_flag[i] = get_bits1(gb); -+ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); -+ if (sps_extension_flag[0]) { -+ int extended_precision_processing_flag; -+ int cabac_bypass_alignment_enabled_flag; -+ -+ sps->transform_skip_rotation_enabled_flag = get_bits1(gb); -+ sps->transform_skip_context_enabled_flag = get_bits1(gb); -+ sps->implicit_rdpcm_enabled_flag = get_bits1(gb); -+ -+ sps->explicit_rdpcm_enabled_flag = get_bits1(gb); -+ -+ extended_precision_processing_flag = get_bits1(gb); -+ if (extended_precision_processing_flag) -+ av_log(avctx, AV_LOG_WARNING, -+ "extended_precision_processing_flag not yet implemented\n"); -+ -+ if (get_bits1(gb)) // sps->intra_smoothing_disabled_flag -+ sps->intra_filters_disable |= FILTER_EITHER; -+ sps->high_precision_offsets_enabled_flag = get_bits1(gb); -+ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); -+ -+ cabac_bypass_alignment_enabled_flag = get_bits1(gb); -+ if (cabac_bypass_alignment_enabled_flag) -+ av_log(avctx, AV_LOG_WARNING, -+ "cabac_bypass_alignment_enabled_flag not yet implemented\n"); -+ } -+ } -+ if (apply_defdispwin) { -+ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset; -+ sps->output_window.right_offset += sps->vui.def_disp_win.right_offset; -+ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset; -+ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset; -+ } -+ -+ ow = &sps->output_window; -+ if (ow->left_offset >= INT_MAX - ow->right_offset || -+ ow->top_offset >= INT_MAX - ow->bottom_offset || -+ ow->left_offset + ow->right_offset >= sps->width || -+ ow->top_offset + ow->bottom_offset >= sps->height) { -+ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n", -+ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset); -+ if (avctx->err_recognition & AV_EF_EXPLODE) { -+ return AVERROR_INVALIDDATA; -+ } -+ av_log(avctx, AV_LOG_WARNING, -+ "Displaying the whole video surface.\n"); -+ memset(ow, 0, sizeof(*ow)); -+ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win)); -+ } -+ -+ // Inferred parameters -+ -+ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; -+ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; -+ sps->ctb_size = sps->ctb_width * sps->ctb_height; -+ -+ sps->min_cb_width = sps->width >> sps->log2_min_cb_size; -+ sps->min_cb_height = sps->height >> sps->log2_min_cb_size; -+ sps->min_tb_width = sps->width >> sps->log2_min_tb_size; -+ sps->min_tb_height = sps->height >> sps->log2_min_tb_size; -+ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE; -+ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE; -+ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; -+ -+ sps->qp_bd_offset = 6 * (sps->bit_depth - 8); -+ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? 
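-+ /* weighted-prediction offset half-range: 2^(bit_depth - 1) with
-+ * high-precision offsets enabled, otherwise the 8-bit 2^7 */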
sps->bit_depth - 1 : 7)); -+ -+ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || -+ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) { -+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n", -+ sps->max_transform_hierarchy_depth_inter); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) { -+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n", -+ sps->max_transform_hierarchy_depth_intra); -+ return AVERROR_INVALIDDATA; -+ } -+ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "max transform block size out of range: %d\n", -+ sps->log2_max_trafo_size); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread SPS by %d bits\n", -get_bits_left(gb)); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ return 0; -+} -+ -+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps, int apply_defdispwin) -+{ -+ HEVCRpiSPS *sps; -+ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps)); -+ unsigned int sps_id; -+ int ret; -+ ptrdiff_t nal_size; -+ -+ if (!sps_buf) -+ return AVERROR(ENOMEM); -+ sps = (HEVCRpiSPS*)sps_buf->data; -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(sps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(sps->data)); -+ sps->data_size = sizeof(sps->data); -+ } else { -+ sps->data_size = nal_size; -+ } -+ memcpy(sps->data, gb->buffer, sps->data_size); -+ -+ ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id, -+ apply_defdispwin, -+ ps->vps_list, avctx); -+ if (ret < 0) { -+ av_buffer_unref(&sps_buf); -+ return ret; -+ } -+ -+ if (avctx->debug & FF_DEBUG_BITSTREAM) { -+ av_log(avctx, AV_LOG_DEBUG, -+ "Parsed SPS: id %d; coded wxh: %dx%d; " -+ "cropped wxh: %dx%d; pix_fmt: %s.\n", -+ sps_id, sps->width, sps->height, -+ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset), -+ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset), -+ av_get_pix_fmt_name(sps->pix_fmt)); -+ } -+ -+ /* check if this is a repeat of an already parsed SPS, then keep the -+ * original one. 
-+ * otherwise drop all PPSes that depend on it */ -+ if (ps->sps_list[sps_id] && -+ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) { -+ av_buffer_unref(&sps_buf); -+ } else { -+ remove_sps(ps, sps_id); -+ ps->sps_list[sps_id] = sps_buf; -+ } -+ -+ return 0; -+} -+ -+static void hevc_pps_free(void *opaque, uint8_t *data) -+{ -+ HEVCRpiPPS *pps = (HEVCRpiPPS*)data; -+ -+ av_freep(&pps->column_width); -+ av_freep(&pps->row_height); -+ av_freep(&pps->col_bd); -+ av_freep(&pps->row_bd); -+ av_freep(&pps->col_idxX); -+ av_freep(&pps->ctb_addr_rs_to_ts); -+ av_freep(&pps->ctb_addr_ts_to_rs); -+ av_freep(&pps->tile_pos_ts); -+ av_freep(&pps->tile_size); -+ av_freep(&pps->tile_id); -+ av_freep(&pps->ctb_ts_flags); -+ -+ av_freep(&pps); -+} -+ -+static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets) -+{ -+ do -+ { -+ const int offset = get_se_golomb_long(gb); -+ if (offset < -12 || offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset); -+ return AVERROR_INVALIDDATA; -+ } -+ *offsets++ = offset; -+ } while (n_minus_1-- != 0); -+ return 0; -+} -+ -+static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) -+{ -+ if (pps->transform_skip_enabled_flag) { -+ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2; -+ } -+ pps->cross_component_prediction_enabled_flag = get_bits1(gb); -+ if (pps->cross_component_prediction_enabled_flag && -+ (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag)) -+ { -+ av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb); -+ if (pps->chroma_qp_offset_list_enabled_flag) { -+ int err; -+ -+ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb); -+ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb); -+ if (pps->chroma_qp_offset_list_len_minus1 > 5) { -+ av_log(avctx, AV_LOG_ERROR, -+ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n"); -+ -+ if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 || -+ (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0) -+ return err; -+ } -+ -+ { -+ const unsigned int max_offset = sps->bit_depth > 10 ? 
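-+ /* log2_sao_offset_scale_luma/chroma may not exceed bit_depth - 10,
-+ * i.e. zero for bit depths of 10 or less */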
sps->bit_depth - 10 : 0; -+ -+ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb); -+ if (pps->log2_sao_offset_scale_luma > max_offset) { -+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid"); -+ return AVERROR_INVALIDDATA; -+ } -+ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb); -+ if (pps->log2_sao_offset_scale_chroma > max_offset) { -+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ return(0); -+} -+ -+static inline int setup_pps(AVCodecContext * const avctx, -+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) -+{ -+ int pic_area_in_ctbs; -+ int i, j, x, y, ctb_addr_rs, tile_id; -+ -+ // Inferred parameters -+ -+ // qp_y -> qp_u/qp_v tables -+ // The tables have at least -24,+24 overrun after adding offset here -+ // which should allow for clipless offseting -+ -+ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code -+ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0; -+ -+ if (sps->chroma_format_idc == 1) { -+ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; -+ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; -+ } -+ else -+ { -+ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; -+ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; -+ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; -+ } -+ -+ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd)); -+ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd)); -+ pps->col_idxX = av_malloc_array(sps->ctb_width, sizeof(*pps->col_idxX)); -+ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX) -+ return AVERROR(ENOMEM); -+ -+ if (pps->uniform_spacing_flag) { -+ if (!pps->column_width) { -+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); -+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); -+ } -+ if (!pps->column_width || !pps->row_height) -+ return AVERROR(ENOMEM); -+ -+ for (i = 0; i < pps->num_tile_columns; i++) { -+ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns - -+ (i * sps->ctb_width) / pps->num_tile_columns; -+ } -+ -+ for (i = 0; i < pps->num_tile_rows; i++) { -+ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows - -+ (i * sps->ctb_height) / pps->num_tile_rows; -+ } -+ } -+ -+ { -+ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift); -+ pps->col_bd[0] = 0; -+ pps->tile_wpp_inter_disable = 0; -+ for (i = 0; i < pps->num_tile_columns; i++) -+ { -+ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i]; -+ -+ // Avoid trying tile parallel if the columns don't fall on cache boundries -+ // (this causes too much pain syncing flushes with the QPU) -+ // Ignore the final (RHS of pic) tile boundry -+ if ((pps->col_bd[i] & td_mask) != 0) { -+ pps->tile_wpp_inter_disable = 1; -+ } -+ } -+ -+ // If we can start the next row before finishing the first line of -+ // this one then we must wait at the end of the tile -+ // * if this happens a lot then there are better but more complicated -+ // conditions that we could 
apply -+ if (pps->tile_wpp_inter_disable) { -+ for (i = 0; i < pps->num_tile_rows; i++) -+ { -+ if (pps->row_height[i] <= RPI_MAX_JOBS) { -+ pps->tile_wpp_inter_disable = 2; -+ break; -+ } -+ } -+ } -+ } -+ -+ pps->row_bd[0] = 0; -+ for (i = 0; i < pps->num_tile_rows; i++) -+ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; -+ -+ for (i = 0, j = 0; i < sps->ctb_width; i++) { -+ if (i >= pps->col_bd[j + 1]) -+ j++; -+ pps->col_idxX[i] = j; -+ } -+ -+ /** -+ * 6.5 -+ */ -+ pic_area_in_ctbs = sps->ctb_size; -+ -+ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); -+ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); -+ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); -+ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); -+ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); -+ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); -+ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || -+ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { -+ return AVERROR(ENOMEM); -+ } -+ -+ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags)); -+ -+ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { -+ int tb_x = ctb_addr_rs % sps->ctb_width; -+ int tb_y = ctb_addr_rs / sps->ctb_width; -+ int tile_x = 0; -+ int tile_y = 0; -+ int val = 0; -+ -+ for (i = 0; i < pps->num_tile_columns; i++) { -+ if (tb_x < pps->col_bd[i + 1]) { -+ tile_x = i; -+ break; -+ } -+ } -+ -+ for (i = 0; i < pps->num_tile_rows; i++) { -+ if (tb_y < pps->row_bd[i + 1]) { -+ tile_y = i; -+ break; -+ } -+ } -+ -+ for (i = 0; i < tile_x; i++) -+ val += pps->row_height[tile_y] * pps->column_width[i]; -+ for (i = 0; i < tile_y; i++) -+ val += sps->ctb_width * pps->row_height[i]; -+ -+ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] + -+ tb_x - pps->col_bd[tile_x]; -+ -+ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val; -+ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; -+ } -+ -+ { -+ uint8_t * pflags = pps->ctb_ts_flags; -+ uint16_t * ptid = pps->tile_id; -+ -+ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) -+ { -+ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) -+ { -+ const unsigned int tile_w = pps->column_width[i]; -+ -+ pflags[0] |= CTB_TS_FLAGS_CIREQ; -+ -+ for (x = 0; x != tile_w; ++x) { -+ pflags[x] |= CTB_TS_FLAGS_TOT; -+ } -+ -+ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++) -+ { -+ pflags[0] |= CTB_TS_FLAGS_SOTL; -+ -+ if (pps->entropy_coding_sync_enabled_flag) -+ { -+ if (pps->column_width[i] != 1) -+ pflags[1] |= CTB_TS_FLAGS_CSAVE; -+ else -+ pflags[0] |= CTB_TS_FLAGS_CIREQ; -+ -+ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0) -+ pflags[0] |= CTB_TS_FLAGS_CLOAD; -+ } -+ -+ for (x = 0; x != tile_w; ++x) -+ *ptid++ = tile_id; -+ -+ pflags += tile_w; -+ pflags[-1] |= CTB_TS_FLAGS_EOTL; -+ if (i + 1 == pps->num_tile_columns) -+ pflags[-1] |= CTB_TS_FLAGS_EOL; -+ } -+ -+ pflags[-1] |= CTB_TS_FLAGS_EOT; -+ } -+ } -+ } -+ -+ { -+ unsigned int ts = 0; -+ for (j = 0; j < pps->num_tile_rows; j++) -+ for (i = 0; i < pps->num_tile_columns; i++) -+ { -+ const unsigned int size = pps->column_width[i] * pps->row_height[j]; -+ pps->tile_size[j * pps->num_tile_columns + i] = size; -+ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts; -+ ts += size; -+ } -+ } -+ -+ return 0; -+} -+ -+int 
ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx, -+ HEVCRpiParamSets * const ps) -+{ -+ const HEVCRpiSPS *sps = NULL; -+ int i, ret = 0; -+ unsigned int pps_id = 0; -+ ptrdiff_t nal_size; -+ unsigned log2_parallel_merge_level_minus2; -+ -+ AVBufferRef *pps_buf; -+ HEVCRpiPPS *pps = av_mallocz(sizeof(*pps)); -+ -+ if (!pps) -+ return AVERROR(ENOMEM); -+ -+ pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), -+ hevc_pps_free, NULL, 0); -+ if (!pps_buf) { -+ av_freep(&pps); -+ return AVERROR(ENOMEM); -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n"); -+ -+ nal_size = gb->buffer_end - gb->buffer; -+ if (nal_size > sizeof(pps->data)) { -+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS " -+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", -+ nal_size, sizeof(pps->data)); -+ pps->data_size = sizeof(pps->data); -+ } else { -+ pps->data_size = nal_size; -+ } -+ memcpy(pps->data, gb->buffer, pps->data_size); -+ -+ // Default values -+ pps->loop_filter_across_tiles_enabled_flag = 1; -+ pps->num_tile_columns = 1; -+ pps->num_tile_rows = 1; -+ pps->uniform_spacing_flag = 1; -+ pps->disable_dbf = 0; -+ pps->beta_offset = 0; -+ pps->tc_offset = 0; -+ pps->log2_max_transform_skip_block_size = 2; -+ -+ // Coded parameters -+ pps_id = get_ue_golomb_long(gb); -+ if (pps_id >= HEVC_MAX_PPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->sps_id = get_ue_golomb_long(gb); -+ if (pps->sps_id >= HEVC_MAX_SPS_COUNT) { -+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (!ps->sps_list[pps->sps_id]) { -+ av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data; -+ -+ pps->dependent_slice_segments_enabled_flag = get_bits1(gb); -+ pps->output_flag_present_flag = get_bits1(gb); -+ pps->num_extra_slice_header_bits = get_bits(gb, 3); -+ -+ pps->sign_data_hiding_flag = get_bits1(gb); -+ -+ pps->cabac_init_present_flag = get_bits1(gb); -+ -+ pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1; -+ if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) { -+ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; -+ if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) { -+ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->pic_init_qp_minus26 = get_se_golomb(gb); -+ if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) { -+ av_log(avctx, AV_LOG_ERROR, -+ "init_qp_minus26 %d is outside the valid range " -+ "[%d, %d].\n", -+ pps->pic_init_qp_minus26, -+ -(26 + sps->qp_bd_offset), 25); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->constrained_intra_pred_flag = get_bits1(gb); -+ pps->transform_skip_enabled_flag = get_bits1(gb); -+ -+ pps->cu_qp_delta_enabled_flag = get_bits1(gb); -+ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size; -+ if (pps->cu_qp_delta_enabled_flag) -+ { -+ const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb); -+ -+ if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) { -+ av_log(avctx, AV_LOG_ERROR, 
"diff_cu_qp_delta_depth %d is invalid\n", -+ diff_cu_qp_delta_depth); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth; -+ } -+ -+ pps->cb_qp_offset = get_se_golomb(gb); -+ if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n", -+ pps->cb_qp_offset); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->cr_qp_offset = get_se_golomb(gb); -+ if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) { -+ av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n", -+ pps->cr_qp_offset); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb); -+ -+ pps->weighted_pred_flag = get_bits1(gb); -+ pps->weighted_bipred_flag = get_bits1(gb); -+ -+ pps->transquant_bypass_enable_flag = get_bits1(gb); -+ pps->tiles_enabled_flag = get_bits1(gb); -+ pps->entropy_coding_sync_enabled_flag = get_bits1(gb); -+ -+ if (pps->tiles_enabled_flag) { -+ pps->num_tile_columns = get_ue_golomb_long(gb) + 1; -+ pps->num_tile_rows = get_ue_golomb_long(gb) + 1; -+ if (pps->num_tile_columns <= 0 || -+ pps->num_tile_columns >= sps->width) { -+ av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n", -+ pps->num_tile_columns - 1); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (pps->num_tile_rows <= 0 || -+ pps->num_tile_rows >= sps->height) { -+ av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n", -+ pps->num_tile_rows - 1); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); -+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); -+ if (!pps->column_width || !pps->row_height) { -+ ret = AVERROR(ENOMEM); -+ goto err; -+ } -+ -+ pps->uniform_spacing_flag = get_bits1(gb); -+ if (!pps->uniform_spacing_flag) { -+ uint64_t sum = 0; -+ for (i = 0; i < pps->num_tile_columns - 1; i++) { -+ pps->column_width[i] = get_ue_golomb_long(gb) + 1; -+ sum += pps->column_width[i]; -+ } -+ if (sum >= sps->ctb_width) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum; -+ -+ sum = 0; -+ for (i = 0; i < pps->num_tile_rows - 1; i++) { -+ pps->row_height[i] = get_ue_golomb_long(gb) + 1; -+ sum += pps->row_height[i]; -+ } -+ if (sum >= sps->ctb_height) { -+ av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n"); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum; -+ } -+ pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb); -+ } -+ -+ pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb); -+ -+ pps->deblocking_filter_control_present_flag = get_bits1(gb); -+ if (pps->deblocking_filter_control_present_flag) { -+ pps->deblocking_filter_override_enabled_flag = get_bits1(gb); -+ pps->disable_dbf = get_bits1(gb); -+ if (!pps->disable_dbf) { -+ int beta_offset_div2 = get_se_golomb(gb); -+ int tc_offset_div2 = get_se_golomb(gb) ; -+ if (beta_offset_div2 < -6 || beta_offset_div2 > 6) { -+ av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n", -+ beta_offset_div2); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ if (tc_offset_div2 < -6 || tc_offset_div2 > 6) { -+ av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n", -+ tc_offset_div2); -+ ret = 
AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->beta_offset = 2 * beta_offset_div2; -+ pps->tc_offset = 2 * tc_offset_div2; -+ } -+ } -+ -+ pps->scaling_list_data_present_flag = get_bits1(gb); -+ if (pps->scaling_list_data_present_flag) { -+ set_default_scaling_list_data(&pps->scaling_list); -+ ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps); -+ if (ret < 0) -+ goto err; -+ } -+ pps->lists_modification_present_flag = get_bits1(gb); -+ log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb); -+ if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) { -+ av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n", -+ log2_parallel_merge_level_minus2); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2; -+ -+ pps->slice_header_extension_present_flag = get_bits1(gb); -+ -+ if (get_bits1(gb)) { // pps_extension_present_flag -+ int pps_range_extensions_flag = get_bits1(gb); -+ skip_bits(gb, 7); // pps_extension_7bits -+ if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) { -+ if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0) -+ goto err; -+ } -+ } -+ -+ ret = setup_pps(avctx, pps, sps); -+ if (ret < 0) -+ goto err; -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(avctx, AV_LOG_ERROR, -+ "Overread PPS by %d bits\n", -get_bits_left(gb)); -+ ret = AVERROR_INVALIDDATA; -+ goto err; -+ } -+ -+ remove_pps(ps, pps_id); -+ ps->pps_list[pps_id] = pps_buf; -+ -+ return 0; -+ -+err: -+ av_buffer_unref(&pps_buf); -+ return ret; -+} -+ -+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type) -+{ -+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; -+ int prev_poc_lsb = pocTid0 % max_poc_lsb; -+ int prev_poc_msb = pocTid0 - prev_poc_lsb; -+ int poc_msb; -+ -+ if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2) -+ poc_msb = prev_poc_msb + max_poc_lsb; -+ else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2) -+ poc_msb = prev_poc_msb - max_poc_lsb; -+ else -+ poc_msb = prev_poc_msb; -+ -+ // For BLA picture types, POCmsb is set to 0. -+ if (nal_unit_type == HEVC_NAL_BLA_W_LP || -+ nal_unit_type == HEVC_NAL_BLA_W_RADL || -+ nal_unit_type == HEVC_NAL_BLA_N_LP) -+ poc_msb = 0; -+ -+ return poc_msb + poc_lsb; -+} -diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h -new file mode 100644 -index 0000000000..c725ebb9ca ---- /dev/null -+++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,449 @@ -+/* -+ * HEVC parameter set parsing -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
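The ff_hevc_rpi_compute_poc() just shown is the standard HEVC POC reconstruction: the slice header carries only the POC LSB, and the MSB is inferred from the previously decoded anchor picture by detecting wraparound. A minimal standalone restatement of that arithmetic follows (hypothetical helper name and sample values, not taken from the patch; the BLA override to zero is left out):

#include <stdio.h>

/* Hypothetical restatement of the MSB inference in ff_hevc_rpi_compute_poc().
 * prev_poc plays the role of pocTid0; max_poc_lsb is 1 << log2_max_poc_lsb. */
static int compute_poc(int prev_poc, int poc_lsb, int max_poc_lsb)
{
    int prev_poc_lsb = prev_poc % max_poc_lsb;
    int prev_poc_msb = prev_poc - prev_poc_lsb;
    int poc_msb;

    if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2)
        poc_msb = prev_poc_msb + max_poc_lsb;   /* LSB wrapped forwards */
    else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2)
        poc_msb = prev_poc_msb - max_poc_lsb;   /* LSB wrapped backwards */
    else
        poc_msb = prev_poc_msb;

    return poc_msb + poc_lsb;
}

int main(void)
{
    /* log2_max_poc_lsb = 4: a picture at POC 14 followed by poc_lsb = 1
     * means the 4-bit LSB wrapped, so the decoder reconstructs POC 17. */
    printf("POC = %d\n", compute_poc(14, 1, 1 << 4));
    return 0;
}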
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_PS_H -+#define AVCODEC_RPI_HEVC_PS_H -+ -+#include -+ -+#include "libavutil/buffer.h" -+#include "libavutil/pixfmt.h" -+#include "libavutil/rational.h" -+ -+#include "avcodec.h" -+#include "get_bits.h" -+#include "hevc.h" -+ -+typedef struct ShortTermRPS { -+ unsigned int num_negative_pics; -+ int num_delta_pocs; -+ int rps_idx_num_delta_pocs; -+ int32_t delta_poc[32]; -+ uint8_t used[32]; -+} ShortTermRPS; -+ -+typedef struct LongTermRPS { -+ int poc[32]; -+ uint8_t used[32]; -+ uint8_t nb_refs; -+} LongTermRPS; -+ -+typedef struct RpiSliceHeader { -+ unsigned int pps_id; -+ -+ ///< address (in raster order) of the first block in the current slice segment -+ unsigned int slice_segment_addr; -+ ///< address (in raster order) of the first block in the current slice -+ unsigned int slice_addr; -+ -+ enum HEVCSliceType slice_type; -+ -+ int pic_order_cnt_lsb; -+ -+ uint8_t first_slice_in_pic_flag; -+ uint8_t dependent_slice_segment_flag; -+ uint8_t pic_output_flag; -+ uint8_t colour_plane_id; -+ -+ ///< RPS coded in the slice header itself is stored here -+ int short_term_ref_pic_set_sps_flag; -+ int short_term_ref_pic_set_size; -+ ShortTermRPS slice_rps; -+ const ShortTermRPS *short_term_rps; -+ int long_term_ref_pic_set_size; -+ LongTermRPS long_term_rps; -+ unsigned int list_entry_lx[2][32]; -+ -+ uint8_t rpl_modification_flag[2]; -+ uint8_t no_output_of_prior_pics_flag; -+ uint8_t slice_temporal_mvp_enabled_flag; -+ -+ unsigned int nb_refs[2]; -+ -+ uint8_t slice_sample_adaptive_offset_flag[3]; -+ uint8_t mvd_l1_zero_flag; -+ -+ uint8_t cabac_init_flag; -+ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag -+ uint8_t slice_loop_filter_across_slices_enabled_flag; -+ uint8_t collocated_list; -+ -+ uint8_t no_dblk_boundary_flags; -+ -+ unsigned int collocated_ref_idx; -+ -+ int slice_qp_delta; -+ int slice_cb_qp_offset; // -12, +12 -+ int slice_cr_qp_offset; // -12, +12 -+ -+ uint8_t cu_chroma_qp_offset_enabled_flag; -+ -+ int beta_offset; ///< beta_offset_div2 * 2 -+ int tc_offset; ///< tc_offset_div2 * 2 -+ -+ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand -+ -+ unsigned *entry_point_offset; -+ int * offset; -+ int * size; -+ int num_entry_point_offsets; -+ int offsets_allocated; -+ -+ uint8_t offload_wpp; -+ uint8_t offload_tiles; -+ -+ int8_t slice_qp; -+ -+ uint8_t luma_log2_weight_denom; -+ uint8_t chroma_log2_weight_denom; -+ -+ int16_t luma_weight_l0[16]; // -128, +255 -+ int16_t luma_offset_l0[16]; -+ int16_t chroma_weight_l0[16][2]; -+ int16_t chroma_offset_l0[16][2]; -+ -+ int16_t luma_weight_l1[16]; -+ int16_t luma_offset_l1[16]; -+ int16_t chroma_weight_l1[16][2]; -+ int16_t chroma_offset_l1[16][2]; -+ -+} RpiSliceHeader; -+ -+typedef struct HEVCRpiWindow { -+ uint16_t left_offset; -+ uint16_t right_offset; -+ uint16_t top_offset; -+ uint16_t bottom_offset; -+} HEVCRpiWindow; -+ -+typedef struct VUI { -+ AVRational sar; -+ -+ int overscan_info_present_flag; -+ int overscan_appropriate_flag; -+ -+ int video_signal_type_present_flag; -+ int video_format; -+ int video_full_range_flag; -+ int colour_description_present_flag; -+ uint8_t colour_primaries; -+ uint8_t transfer_characteristic; -+ uint8_t matrix_coeffs; -+ -+ int chroma_loc_info_present_flag; -+ int 
chroma_sample_loc_type_top_field; -+ int chroma_sample_loc_type_bottom_field; -+ int neutra_chroma_indication_flag; -+ -+ int field_seq_flag; -+ int frame_field_info_present_flag; -+ -+ int default_display_window_flag; -+ HEVCRpiWindow def_disp_win; -+ -+ int vui_timing_info_present_flag; -+ uint32_t vui_num_units_in_tick; -+ uint32_t vui_time_scale; -+ int vui_poc_proportional_to_timing_flag; -+ int vui_num_ticks_poc_diff_one_minus1; -+ int vui_hrd_parameters_present_flag; -+ -+ int bitstream_restriction_flag; -+ int tiles_fixed_structure_flag; -+ int motion_vectors_over_pic_boundaries_flag; -+ int restricted_ref_pic_lists_flag; -+ int min_spatial_segmentation_idc; -+ int max_bytes_per_pic_denom; -+ int max_bits_per_min_cu_denom; -+ int log2_max_mv_length_horizontal; -+ int log2_max_mv_length_vertical; -+} VUI; -+ -+typedef struct PTLCommon { -+ uint8_t profile_space; -+ uint8_t tier_flag; -+ uint8_t profile_idc; -+ uint8_t profile_compatibility_flag[32]; -+ uint8_t level_idc; -+ uint8_t progressive_source_flag; -+ uint8_t interlaced_source_flag; -+ uint8_t non_packed_constraint_flag; -+ uint8_t frame_only_constraint_flag; -+} PTLCommon; -+ -+typedef struct PTL { -+ PTLCommon general_ptl; -+ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS]; -+ -+ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS]; -+ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS]; -+} PTL; -+ -+typedef struct HEVCRpiVPS { -+ uint8_t vps_temporal_id_nesting_flag; -+ int vps_max_layers; -+ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1 -+ -+ PTL ptl; -+ int vps_sub_layer_ordering_info_present_flag; -+ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS]; -+ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS]; -+ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS]; -+ int vps_max_layer_id; -+ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1 -+ uint8_t vps_timing_info_present_flag; -+ uint32_t vps_num_units_in_tick; -+ uint32_t vps_time_scale; -+ uint8_t vps_poc_proportional_to_timing_flag; -+ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1 -+ int vps_num_hrd_parameters; -+ -+ uint8_t data[4096]; -+ int data_size; -+} HEVCRpiVPS; -+ -+typedef struct ScalingList { -+ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs, -+ * and size ID 3 only has 2 arrays, not 6. 
*/ -+ uint8_t sl[4][6][64]; -+ uint8_t sl_dc[2][6]; -+} ScalingList; -+ -+typedef struct HEVCRpiSPS { -+ unsigned vps_id; -+ uint8_t chroma_format_idc; -+ uint8_t separate_colour_plane_flag; -+ -+ HEVCRpiWindow output_window; -+ -+ HEVCRpiWindow pic_conf_win; -+ -+ uint16_t wp_offset_half_range; // WpOffsetHalfRange -+ -+ uint8_t bit_depth; -+ -+// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth -+ uint8_t pixel_shift; -+ enum AVPixelFormat pix_fmt; -+ -+ unsigned int log2_max_poc_lsb; -+ -+ int max_sub_layers; -+ struct { -+ int max_dec_pic_buffering; -+ int num_reorder_pics; -+ int max_latency_increase; -+ } temporal_layer[HEVC_MAX_SUB_LAYERS]; -+ uint8_t temporal_id_nesting_flag; -+ -+ uint8_t scaling_list_enable_flag; -+ ScalingList scaling_list; -+ -+ unsigned int nb_st_rps; -+ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS]; -+ -+ uint8_t amp_enabled_flag; -+ uint8_t sao_enabled; -+ -+ uint8_t long_term_ref_pics_present_flag; -+ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS]; -+ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS]; -+ uint8_t num_long_term_ref_pics_sps; -+ -+ struct { -+ uint8_t bit_depth; -+ uint8_t bit_depth_chroma; -+ uint8_t log2_min_pcm_cb_size; -+ uint8_t log2_max_pcm_cb_size; -+ uint8_t loop_filter_disable_flag; -+ } pcm; -+ char sps_temporal_mvp_enabled_flag; -+// char sps_strong_intra_smoothing_enable_flag; -> intra_filtes_disable -+ -+ uint8_t log2_min_cb_size; // 3..6 -+ uint8_t log2_diff_max_min_coding_block_size; -+ uint8_t log2_min_tb_size; // 2..5 -+ uint8_t log2_max_trafo_size; -+ uint8_t log2_ctb_size; // 4..6 -+// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1) -+#define LOG2_MIN_PU_SIZE 2 -+#define LOG2_MIN_CU_SIZE 3 -+ -+ uint8_t max_transform_hierarchy_depth_inter; -+ uint8_t max_transform_hierarchy_depth_intra; -+ -+ char transform_skip_rotation_enabled_flag; -+ char transform_skip_context_enabled_flag; -+ char implicit_rdpcm_enabled_flag; -+ char explicit_rdpcm_enabled_flag; -+// char intra_smoothing_disabled_flag; -> intra_filtes_disable -+ char high_precision_offsets_enabled_flag; -+ char persistent_rice_adaptation_enabled_flag; -+ -+ uint8_t intra_filters_disable; -+ -+ ///< coded frame dimension in various units -+ int width; -+ int height; -+ int ctb_width; -+ int ctb_height; -+ int ctb_size; // Pic size in CTBs not size of a CTB -+ int min_cb_width; -+ int min_cb_height; -+ int min_tb_width; -+ int min_tb_height; -+ int min_pu_width; -+ int min_pu_height; -+ int pcm_width; -+ int pcm_height; -+ int tb_mask; -+ -+ int hshift[3]; -+ int vshift[3]; -+ -+ int qp_bd_offset; -+ -+ uint8_t data[4096]; -+ int data_size; -+ -+ VUI vui; -+ PTL ptl; -+} HEVCRpiSPS; -+ -+#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line -+#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line -+#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line -+#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile -+#define CTB_TS_FLAGS_CSAVE (1U << 4) -+#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request -+#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile -+#define CTB_TS_FLAGS_CLOAD (1U << 7) -+ -+typedef struct HEVCRpiPPS { -+ unsigned int sps_id; ///< seq_parameter_set_id -+ -+ uint8_t sign_data_hiding_flag; -+ -+ uint8_t cabac_init_present_flag; -+ -+ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1 -+ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1 -+ int pic_init_qp_minus26; -+ -+ uint8_t 
constrained_intra_pred_flag; -+ uint8_t transform_skip_enabled_flag; -+ -+ uint8_t cu_qp_delta_enabled_flag; -+ uint8_t log2_min_cu_qp_delta_size; -+ int cb_qp_offset; // -12..12 -+ int cr_qp_offset; // -12..12 -+ const uint8_t * qp_dblk_x[3]; -+ const int8_t * qp_bd_x[3]; -+ -+ uint8_t pic_slice_level_chroma_qp_offsets_present_flag; -+ uint8_t weighted_pred_flag; -+ uint8_t weighted_bipred_flag; -+ uint8_t output_flag_present_flag; -+ uint8_t transquant_bypass_enable_flag; -+ -+ uint8_t dependent_slice_segments_enabled_flag; -+ uint8_t tiles_enabled_flag; -+ uint8_t entropy_coding_sync_enabled_flag; -+ -+ uint8_t tile_wpp_inter_disable; -+ int num_tile_columns; ///< num_tile_columns_minus1 + 1 -+ int num_tile_rows; ///< num_tile_rows_minus1 + 1 -+ uint8_t uniform_spacing_flag; -+ uint8_t loop_filter_across_tiles_enabled_flag; -+ -+ uint8_t seq_loop_filter_across_slices_enabled_flag; -+ -+ uint8_t deblocking_filter_control_present_flag; -+ uint8_t deblocking_filter_override_enabled_flag; -+ uint8_t disable_dbf; -+ int beta_offset; ///< beta_offset_div2 * 2 -+ int tc_offset; ///< tc_offset_div2 * 2 -+ -+ uint8_t scaling_list_data_present_flag; -+ ScalingList scaling_list; -+ -+ uint8_t lists_modification_present_flag; -+ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2 -+ int num_extra_slice_header_bits; -+ uint8_t slice_header_extension_present_flag; -+ uint8_t log2_max_transform_skip_block_size; -+ uint8_t cross_component_prediction_enabled_flag; -+ uint8_t chroma_qp_offset_list_enabled_flag; -+ uint8_t diff_cu_chroma_qp_offset_depth; -+ uint8_t chroma_qp_offset_list_len_minus1; -+ int8_t cb_qp_offset_list[6]; -+ int8_t cr_qp_offset_list[6]; -+ uint8_t log2_sao_offset_scale_luma; -+ uint8_t log2_sao_offset_scale_chroma; -+ -+ // Inferred parameters -+ uint16_t *column_width; ///< ColumnWidth -+ uint16_t *row_height; ///< RowHeight -+ uint16_t *col_bd; ///< ColBd -+ uint16_t *row_bd; ///< RowBd -+ uint16_t *col_idxX; -+ -+ // We can limit these to uint16_t given our other size limits -+ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS -+ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS -+ uint16_t *tile_id; ///< TileId -+ uint16_t *tile_pos_ts; ///< TilePosRS -+ uint16_t *tile_size; ///< TileSize -+ uint8_t * ctb_ts_flags; -+ -+ uint8_t data[4096]; -+ int data_size; -+} HEVCRpiPPS; -+ -+typedef struct HEVCRpiParamSets { -+ /* currently active parameter sets */ -+ const HEVCRpiVPS *vps; -+ const HEVCRpiSPS *sps; -+ const HEVCRpiPPS *pps; -+ -+ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; -+ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; -+ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; -+} HEVCRpiParamSets; -+ -+int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps); -+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps, int apply_defdispwin); -+int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx, -+ HEVCRpiParamSets *ps); -+ -+int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, -+ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header); -+ -+int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id, -+ uint8_t *buf, int buf_size); -+ -+/** -+ * Compute POC of the current frame and return it. 
-+ */ -+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type); -+ -+#endif /* AVCODEC_RPI_HEVC_PS_H */ -diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c -new file mode 100644 -index 0000000000..8cc5796cf0 ---- /dev/null -+++ b/libavcodec/rpi_hevc_refs.c -@@ -0,0 +1,485 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/avassert.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "internal.h" -+#include "thread.h" -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+ -+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags) -+{ -+ /* frame->frame can be NULL if context init failed */ -+ if (!frame->frame || !frame->frame->buf[0]) -+ return; -+ -+ frame->flags &= ~flags; -+ if (!frame->flags) { -+ ff_thread_release_buffer(s->avctx, &frame->tf); -+ -+ av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL -+ frame->col_mvf = NULL; -+ -+ frame->collocated_ref = NULL; -+ } -+} -+ -+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s) -+{ -+ int i; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], -+ HEVC_FRAME_FLAG_SHORT_REF | -+ HEVC_FRAME_FLAG_LONG_REF); -+} -+ -+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s) -+{ -+ int i; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); -+} -+ -+static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s) -+{ -+ int i, ret; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame * const frame = &s->DPB[i]; -+ if (frame->frame->buf[0]) -+ continue; -+ -+ ret = ff_thread_get_buffer(s->avctx, &frame->tf, -+ AV_GET_BUFFER_FLAG_REF); -+ if (ret < 0) -+ return NULL; -+ -+ frame->col_mvf = NULL; -+ frame->col_mvf_buf = NULL; -+ if (s->used_for_ref && !s->is_irap) -+ { -+ frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool); -+ if (!frame->col_mvf_buf) -+ goto fail; -+ frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data; -+ } -+ -+ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; -+ frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); -+ -+ return frame; -+ -+fail: -+ ff_hevc_rpi_unref_frame(s, frame, ~0); -+ return NULL; -+ } -+ av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n"); -+ return NULL; -+} -+ -+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc) -+{ -+ HEVCRpiFrame *ref; -+ int i; -+ -+ /* check that this POC doesn't already exist */ 
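ff_hevc_rpi_unref_frame() above releases a DPB slot only when its last role flag is cleared: OUTPUT, SHORT_REF, LONG_REF (and later BUMPING) each keep the frame alive independently. A minimal model of that flag-driven lifetime, with hypothetical names standing in for the real HEVCRpiFrame machinery:

#include <stdio.h>

/* Hypothetical model of the flag-driven DPB release used above: a slot
 * stays resident while any role flag is set and is freed only when the
 * last one is cleared. */
enum {
    FLAG_OUTPUT    = 1 << 0,
    FLAG_SHORT_REF = 1 << 1,
    FLAG_LONG_REF  = 1 << 2,
};

struct dpb_slot { int in_use; unsigned flags; };

static void unref_slot(struct dpb_slot *s, unsigned flags)
{
    s->flags &= ~flags;
    if (s->in_use && !s->flags) {
        s->in_use = 0;               /* the real code releases buffers here */
        printf("slot released\n");
    }
}

int main(void)
{
    struct dpb_slot s = { 1, FLAG_OUTPUT | FLAG_SHORT_REF };
    unref_slot(&s, FLAG_SHORT_REF);  /* still owed to the output queue */
    unref_slot(&s, FLAG_OUTPUT);     /* last role dropped -> released */
    return 0;
}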
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ -+ if (frame->frame->buf[0] && frame->sequence == s->seq_decode && -+ frame->poc == poc) { -+ av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n", -+ poc); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ ref = alloc_frame(s); -+ if (!ref) -+ return AVERROR(ENOMEM); -+ -+ *frame = ref->frame; -+ s->ref = ref; -+ -+ if (s->sh.pic_output_flag) -+ ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF; -+ else -+ ref->flags = HEVC_FRAME_FLAG_SHORT_REF; -+ -+ ref->poc = poc; -+ ref->sequence = s->seq_decode; -+ ref->frame->crop_left = s->ps.sps->output_window.left_offset; -+ ref->frame->crop_right = s->ps.sps->output_window.right_offset; -+ ref->frame->crop_top = s->ps.sps->output_window.top_offset; -+ ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset; -+ -+ return 0; -+} -+ -+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush) -+{ -+ do { -+ int nb_output = 0; -+ int min_poc = INT_MAX; -+ int i, min_idx, ret; -+ -+ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) { -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc && -+ frame->sequence == s->seq_output) { -+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); -+ } -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) && -+ frame->sequence == s->seq_output) { -+ nb_output++; -+ if (frame->poc < min_poc || nb_output == 1) { -+ min_poc = frame->poc; -+ min_idx = i; -+ } -+ } -+ } -+ -+ /* wait for more frames before output */ -+ if (!flush && s->seq_output == s->seq_decode && s->ps.sps && -+ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics) -+ return 0; -+ -+ if (nb_output) { -+ HEVCRpiFrame *frame = &s->DPB[min_idx]; -+ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) -+ return 0; -+ -+ ret = av_frame_ref(out, frame->frame); -+ if (frame->flags & HEVC_FRAME_FLAG_BUMPING) -+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING); -+ else -+ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); -+ if (ret < 0) -+ return ret; -+ av_log(s->avctx, AV_LOG_DEBUG, -+ "Output frame with POC %d.\n", frame->poc); -+ return 1; -+ } -+ -+ if (s->seq_output != s->seq_decode) -+ s->seq_output = (s->seq_output + 1) & 0xff; -+ else -+ break; -+ } while (1); -+ -+ return 0; -+} -+ -+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s) -+{ -+ int dpb = 0; -+ int min_poc = INT_MAX; -+ int i; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if ((frame->flags) && -+ frame->sequence == s->seq_output && -+ frame->poc != s->poc) { -+ dpb++; -+ } -+ } -+ -+ if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if ((frame->flags) && -+ frame->sequence == s->seq_output && -+ frame->poc != s->poc) { -+ if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { -+ min_poc = frame->poc; -+ } -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ if (frame->flags & HEVC_FRAME_FLAG_OUTPUT && -+ frame->sequence == s->seq_output && -+ frame->poc <= min_poc) 
{ -+ frame->flags |= HEVC_FRAME_FLAG_BUMPING; -+ } -+ } -+ -+ dpb--; -+ } -+} -+ -+static int init_slice_rpl(HEVCRpiContext *s) -+{ -+ if (s->slice_idx >= s->rpl_tab_size) -+ return AVERROR_INVALIDDATA; -+ -+ s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0; -+ return 0; -+} -+ -+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s) -+{ -+ RpiSliceHeader *sh = &s->sh; -+ -+ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; -+ uint8_t list_idx; -+ int i, j, ret; -+ -+ ret = init_slice_rpl(s); -+ if (ret < 0) -+ return ret; -+ -+ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs + -+ s->rps[LT_CURR].nb_refs)) { -+ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ for (list_idx = 0; list_idx < nb_list; list_idx++) { -+ RefPicList rpl_tmp = { { 0 } }; -+ RefPicList *rpl = &s->refPicList[list_idx]; -+ -+ /* The order of the elements is -+ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and -+ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */ -+ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF, -+ list_idx ? ST_CURR_BEF : ST_CURR_AFT, -+ LT_CURR }; -+ -+ /* concatenate the candidate lists for the current frame */ -+ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) { -+ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) { -+ RefPicList *rps = &s->rps[cand_lists[i]]; -+ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) { -+ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j]; -+ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j]; -+ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2; -+ rpl_tmp.nb_refs++; -+ } -+ } -+ } -+ -+ /* reorder the references if necessary */ -+ if (sh->rpl_modification_flag[list_idx]) { -+ for (i = 0; i < sh->nb_refs[list_idx]; i++) { -+ int idx = sh->list_entry_lx[list_idx][i]; -+ -+ if (idx >= rpl_tmp.nb_refs) { -+ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ rpl->list[i] = rpl_tmp.list[idx]; -+ rpl->ref[i] = rpl_tmp.ref[idx]; -+ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx]; -+ rpl->nb_refs++; -+ } -+ } else { -+ memcpy(rpl, &rpl_tmp, sizeof(*rpl)); -+ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]); -+ } -+ -+ if (sh->collocated_list == list_idx && -+ sh->collocated_ref_idx < rpl->nb_refs) -+ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx]; -+ } -+ -+ return 0; -+} -+ -+static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc) -+{ -+ int i; -+ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *ref = &s->DPB[i]; -+ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { -+ if ((ref->poc & LtMask) == poc) -+ return ref; -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *ref = &s->DPB[i]; -+ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) { -+ if (ref->poc == poc || (ref->poc & LtMask) == poc) -+ return ref; -+ } -+ } -+ -+ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s)) -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Could not find ref with POC %d\n", poc); -+ return NULL; -+} -+ -+static void mark_ref(HEVCRpiFrame *frame, int flag) -+{ -+ frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF); -+ frame->flags |= flag; -+} -+ -+static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc) -+{ -+ HEVCRpiFrame *frame; -+ int i, x, y; -+ -+ frame = alloc_frame(s); -+ if (!frame) -+ return NULL; -+ -+ if (!s->ps.sps->pixel_shift) { -+ for (i = 0; frame->frame->buf[i]; 
i++) -+ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1), -+ frame->frame->buf[i]->size); -+ } else { -+ for (i = 0; frame->frame->data[i]; i++) -+ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++) -+ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) { -+ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x, -+ 1 << (s->ps.sps->bit_depth - 1)); -+ } -+ } -+ -+ frame->poc = poc; -+ frame->sequence = s->seq_decode; -+ frame->flags = 0; -+ -+ ff_hevc_rpi_progress_set_all_done(frame); -+ -+ return frame; -+} -+ -+/* add a reference with the given poc to the list and mark it as used in DPB */ -+static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list, -+ int poc, int ref_flag) -+{ -+ HEVCRpiFrame *ref = find_ref_idx(s, poc); -+ -+ if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS) -+ return AVERROR_INVALIDDATA; -+ -+ if (!ref) { -+ ref = generate_missing_ref(s, poc); -+ if (!ref) -+ return AVERROR(ENOMEM); -+ } -+ -+ list->list[list->nb_refs] = ref->poc; -+ list->ref[list->nb_refs] = ref; -+ list->nb_refs++; -+ -+ mark_ref(ref, ref_flag); -+ return 0; -+} -+ -+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s) -+{ -+ const ShortTermRPS *short_rps = s->sh.short_term_rps; -+ const LongTermRPS *long_rps = &s->sh.long_term_rps; -+ RefPicList *rps = s->rps; -+ int i, ret = 0; -+ -+ if (!short_rps) { -+ rps[0].nb_refs = rps[1].nb_refs = 0; -+ return 0; -+ } -+ -+ /* clear the reference flags on all frames except the current one */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCRpiFrame *frame = &s->DPB[i]; -+ -+ if (frame == s->ref) -+ continue; -+ -+ mark_ref(frame, 0); -+ } -+ -+ for (i = 0; i < NB_RPS_TYPE; i++) -+ rps[i].nb_refs = 0; -+ -+ /* add the short refs */ -+ for (i = 0; i < short_rps->num_delta_pocs; i++) { -+ int poc = s->poc + short_rps->delta_poc[i]; -+ int list; -+ -+ if (!short_rps->used[i]) -+ list = ST_FOLL; -+ else if (i < short_rps->num_negative_pics) -+ list = ST_CURR_BEF; -+ else -+ list = ST_CURR_AFT; -+ -+ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF); -+ if (ret < 0) -+ goto fail; -+ } -+ -+ /* add the long refs */ -+ for (i = 0; i < long_rps->nb_refs; i++) { -+ int poc = long_rps->poc[i]; -+ int list = long_rps->used[i] ? LT_CURR : LT_FOLL; -+ -+ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF); -+ if (ret < 0) -+ goto fail; -+ } -+ -+fail: -+ /* release any frames that are now unused */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0); -+ -+ return ret; -+} -+ -+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s) -+{ -+ int ret = 0; -+ int i; -+ const ShortTermRPS *rps = s->sh.short_term_rps; -+ LongTermRPS *long_rps = &s->sh.long_term_rps; -+ -+ if (rps) { -+ for (i = 0; i < rps->num_negative_pics; i++) -+ ret += !!rps->used[i]; -+ for (; i < rps->num_delta_pocs; i++) -+ ret += !!rps->used[i]; -+ } -+ -+ if (long_rps) { -+ for (i = 0; i < long_rps->nb_refs; i++) -+ ret += !!long_rps->used[i]; -+ } -+ return ret; -+} -diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c -new file mode 100644 -index 0000000000..cd8149d58e ---- /dev/null -+++ b/libavcodec/rpi_hevc_sei.c -@@ -0,0 +1,368 @@ -+/* -+ * HEVC Supplementary Enhancement Information messages -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2013 Vittorio Giovara -+ * -+ * This file is part of FFmpeg. 
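The short-term RPS handling in ff_hevc_rpi_frame_rps() above buckets each delta POC by usage and position: entries not used by the current picture go to ST_FOLL, and used entries split into ST_CURR_BEF or ST_CURR_AFT because the first num_negative_pics deltas are the negative (past) ones. A small self-contained sketch of that classification (the sample RPS values are made up):

#include <stdio.h>

enum { ST_CURR_BEF, ST_CURR_AFT, ST_FOLL };

/* Hypothetical restatement of the bucketing in ff_hevc_rpi_frame_rps(). */
static int classify(int i, int num_negative_pics, int used)
{
    if (!used)
        return ST_FOLL;
    return i < num_negative_pics ? ST_CURR_BEF : ST_CURR_AFT;
}

int main(void)
{
    const int delta_poc[] = { -2, -4, 1 };   /* example RPS, 2 negative pics */
    const int used[]      = {  1,  0, 1 };
    static const char *name[] = { "ST_CURR_BEF", "ST_CURR_AFT", "ST_FOLL" };

    for (int i = 0; i < 3; i++)
        printf("delta %+d -> %s\n", delta_poc[i],
               name[classify(i, 2, used[i])]);
    return 0;
}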
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "golomb.h" -+#include "rpi_hevc_ps.h" -+#include "rpi_hevc_sei.h" -+ -+static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb) -+{ -+ int cIdx, i; -+ uint8_t hash_type; -+ //uint16_t picture_crc; -+ //uint32_t picture_checksum; -+ hash_type = get_bits(gb, 8); -+ -+ for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) { -+ if (hash_type == 0) { -+ s->is_md5 = 1; -+ for (i = 0; i < 16; i++) -+ s->md5[cIdx][i] = get_bits(gb, 8); -+ } else if (hash_type == 1) { -+ // picture_crc = get_bits(gb, 16); -+ skip_bits(gb, 16); -+ } else if (hash_type == 2) { -+ // picture_checksum = get_bits_long(gb, 32); -+ skip_bits(gb, 32); -+ } -+ } -+ return 0; -+} -+ -+static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb) -+{ -+ int i; -+ // Mastering primaries -+ for (i = 0; i < 3; i++) { -+ s->display_primaries[i][0] = get_bits(gb, 16); -+ s->display_primaries[i][1] = get_bits(gb, 16); -+ } -+ // White point (x, y) -+ s->white_point[0] = get_bits(gb, 16); -+ s->white_point[1] = get_bits(gb, 16); -+ -+ // Max and min luminance of mastering display -+ s->max_luminance = get_bits_long(gb, 32); -+ s->min_luminance = get_bits_long(gb, 32); -+ -+ // As this SEI message comes before the first frame that references it, -+ // initialize the flag to 2 and decrement on IRAP access unit so it -+ // persists for the coded video sequence (e.g., between two IRAPs) -+ s->present = 2; -+ return 0; -+} -+ -+static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb) -+{ -+ // Max and average light levels -+ s->max_content_light_level = get_bits_long(gb, 16); -+ s->max_pic_average_light_level = get_bits_long(gb, 16); -+ // As this SEI message comes before the first frame that references it, -+ // initialize the flag to 2 and decrement on IRAP access unit so it -+ // persists for the coded video sequence (e.g., between two IRAPs) -+ s->present = 2; -+ return 0; -+} -+ -+static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb) -+{ -+ get_ue_golomb_long(gb); // frame_packing_arrangement_id -+ s->present = !get_bits1(gb); -+ -+ if (s->present) { -+ s->arrangement_type = get_bits(gb, 7); -+ s->quincunx_subsampling = get_bits1(gb); -+ s->content_interpretation_type = get_bits(gb, 6); -+ -+ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag -+ skip_bits(gb, 3); -+ s->current_frame_is_frame0_flag = get_bits1(gb); -+ // frame0_self_contained_flag, frame1_self_contained_flag -+ skip_bits(gb, 2); -+ -+ if (!s->quincunx_subsampling && s->arrangement_type != 5) -+ skip_bits(gb, 16); // frame[01]_grid_position_[xy] -+ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte -+ skip_bits1(gb); // 
frame_packing_arrangement_persistence_flag -+ } -+ skip_bits1(gb); // upsampled_aspect_ratio_flag -+ return 0; -+} -+ -+static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb) -+{ -+ s->present = !get_bits1(gb); -+ -+ if (s->present) { -+ s->hflip = get_bits1(gb); // hor_flip -+ s->vflip = get_bits1(gb); // ver_flip -+ -+ s->anticlockwise_rotation = get_bits(gb, 16); -+ skip_bits1(gb); // display_orientation_persistence_flag -+ } -+ -+ return 0; -+} -+ -+static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps, -+ void *logctx, int size) -+{ -+ HEVCSEIPictureTiming *h = &s->picture_timing; -+ HEVCRpiSPS *sps; -+ -+ if (!ps->sps_list[s->active_seq_parameter_set_id]) -+ return(AVERROR(ENOMEM)); -+ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data; -+ -+ if (sps->vui.frame_field_info_present_flag) { -+ int pic_struct = get_bits(gb, 4); -+ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; -+ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) { -+ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n"); -+ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD; -+ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) { -+ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n"); -+ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD; -+ } -+ get_bits(gb, 2); // source_scan_type -+ get_bits(gb, 1); // duplicate_flag -+ skip_bits1(gb); -+ size--; -+ } -+ skip_bits_long(gb, 8 * size); -+ -+ return 0; -+} -+ -+static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb, -+ int size) -+{ -+ int flag; -+ int user_data_type_code; -+ int cc_count; -+ -+ if (size < 3) -+ return AVERROR(EINVAL); -+ -+ user_data_type_code = get_bits(gb, 8); -+ if (user_data_type_code == 0x3) { -+ skip_bits(gb, 1); // reserved -+ -+ flag = get_bits(gb, 1); // process_cc_data_flag -+ if (flag) { -+ skip_bits(gb, 1); -+ cc_count = get_bits(gb, 5); -+ skip_bits(gb, 8); // reserved -+ size -= 2; -+ -+ if (cc_count && size >= cc_count * 3) { -+ const uint64_t new_size = (s->a53_caption_size + cc_count -+ * UINT64_C(3)); -+ int i, ret; -+ -+ if (new_size > INT_MAX) -+ return AVERROR(EINVAL); -+ -+ /* Allow merging of the cc data from two fields. 
*/ -+ ret = av_reallocp(&s->a53_caption, new_size); -+ if (ret < 0) -+ return ret; -+ -+ for (i = 0; i < cc_count; i++) { -+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); -+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); -+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); -+ } -+ skip_bits(gb, 8); // marker_bits -+ } -+ } -+ } else { -+ int i; -+ for (i = 0; i < size - 1; i++) -+ skip_bits(gb, 8); -+ } -+ -+ return 0; -+} -+ -+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb, -+ int size) -+{ -+ uint32_t country_code; -+ uint32_t user_identifier; -+ -+ if (size < 7) -+ return AVERROR(EINVAL); -+ size -= 7; -+ -+ country_code = get_bits(gb, 8); -+ if (country_code == 0xFF) { -+ skip_bits(gb, 8); -+ size--; -+ } -+ -+ skip_bits(gb, 8); -+ skip_bits(gb, 8); -+ -+ user_identifier = get_bits_long(gb, 32); -+ -+ switch (user_identifier) { -+ case MKBETAG('G', 'A', '9', '4'): -+ return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size); -+ default: -+ skip_bits_long(gb, size * 8); -+ break; -+ } -+ return 0; -+} -+ -+static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx) -+{ -+ int num_sps_ids_minus1; -+ int i; -+ unsigned active_seq_parameter_set_id; -+ -+ get_bits(gb, 4); // active_video_parameter_set_id -+ get_bits(gb, 1); // self_contained_cvs_flag -+ get_bits(gb, 1); // num_sps_ids_minus1 -+ num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1 -+ -+ if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) { -+ av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ active_seq_parameter_set_id = get_ue_golomb_long(gb); -+ if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) { -+ av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id); -+ return AVERROR_INVALIDDATA; -+ } -+ s->active_seq_parameter_set_id = active_seq_parameter_set_id; -+ -+ for (i = 1; i <= num_sps_ids_minus1; i++) -+ get_ue_golomb_long(gb); // active_seq_parameter_set_id[i] -+ -+ return 0; -+} -+ -+static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb) -+{ -+ s->present = 1; -+ s->preferred_transfer_characteristics = get_bits(gb, 8); -+ return 0; -+} -+ -+static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, -+ int type, int size) -+{ -+ switch (type) { -+ case 256: // Mismatched value from HM 8.1 -+ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); -+ case HEVC_SEI_TYPE_FRAME_PACKING: -+ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb); -+ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION: -+ return decode_nal_sei_display_orientation(&s->display_orientation, gb); -+ case HEVC_SEI_TYPE_PICTURE_TIMING: -+ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size); -+ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO: -+ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb); -+ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO: -+ return decode_nal_sei_content_light_info(&s->content_light, gb); -+ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS: -+ return decode_nal_sei_active_parameter_sets(s, gb, logctx); -+ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35: -+ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size); -+ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS: -+ return 
decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb); -+ default: -+ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type); -+ skip_bits_long(gb, 8 * size); -+ return 0; -+ } -+} -+ -+static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, -+ int type, int size) -+{ -+ switch (type) { -+ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: -+ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); -+ default: -+ av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type); -+ skip_bits_long(gb, 8 * size); -+ return 0; -+ } -+} -+ -+static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, -+ const HEVCRpiParamSets * const ps, const int nal_unit_type) -+{ -+ int payload_type = 0; -+ int payload_size = 0; -+ int byte = 0xFF; -+ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n"); -+ -+ while (byte == 0xFF) { -+ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255) -+ return AVERROR_INVALIDDATA; -+ byte = get_bits(gb, 8); -+ payload_type += byte; -+ } -+ byte = 0xFF; -+ while (byte == 0xFF) { -+ if (get_bits_left(gb) < 8 + 8LL*payload_size) -+ return AVERROR_INVALIDDATA; -+ byte = get_bits(gb, 8); -+ payload_size += byte; -+ } -+ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { -+ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size); -+ } else { /* nal_unit_type == NAL_SEI_SUFFIX */ -+ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size); -+ } -+} -+ -+static int more_rbsp_data(GetBitContext *gb) -+{ -+ return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80; -+} -+ -+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, -+ const HEVCRpiParamSets *ps, int type) -+{ -+ int ret; -+ -+ do { -+ ret = decode_nal_sei_message(gb, logctx, s, ps, type); -+ if (ret < 0) -+ return ret; -+ } while (more_rbsp_data(gb)); -+ return 1; -+} -+ -+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s) -+{ -+ s->a53_caption.a53_caption_size = 0; -+ av_freep(&s->a53_caption.a53_caption); -+} -diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h -new file mode 100644 -index 0000000000..d4ac348df9 ---- /dev/null -+++ b/libavcodec/rpi_hevc_sei.h -@@ -0,0 +1,135 @@ -+/* -+ * HEVC Supplementary Enhancement Information messages -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
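decode_nal_sei_message() above reads the SEI payload type and size with the usual ff(8)-byte coding: consecutive bytes are summed until the first byte that is not 0xFF. A standalone reader for the same coding (hypothetical function working on a plain buffer rather than a GetBitContext):

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical buffer-based reader for the SEI length coding parsed above:
 * the value is the sum of consecutive bytes, terminated by the first byte
 * that is not 0xFF. */
static int read_sei_value(const uint8_t **p, const uint8_t *end, int *out)
{
    int v = 0, byte = 0xFF;

    while (byte == 0xFF) {
        if (*p >= end || v > INT_MAX - 255)
            return -1;               /* truncated input or absurd value */
        byte = *(*p)++;
        v += byte;
    }
    *out = v;
    return 0;
}

int main(void)
{
    /* 0xFF 0xFF 0x04 encodes 255 + 255 + 4 = 514 */
    const uint8_t buf[] = { 0xFF, 0xFF, 0x04 };
    const uint8_t *p = buf;
    int v;

    if (read_sei_value(&p, buf + sizeof(buf), &v) == 0)
        printf("payload value = %d\n", v);   /* prints 514 */
    return 0;
}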
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVC_SEI_H -+#define AVCODEC_RPI_HEVC_SEI_H -+ -+#include -+ -+#include "libavutil/md5.h" -+ -+#include "get_bits.h" -+ -+/** -+ * SEI message types -+ */ -+typedef enum { -+ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0, -+ HEVC_SEI_TYPE_PICTURE_TIMING = 1, -+ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2, -+ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3, -+ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4, -+ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5, -+ HEVC_SEI_TYPE_RECOVERY_POINT = 6, -+ HEVC_SEI_TYPE_SCENE_INFO = 9, -+ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15, -+ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16, -+ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17, -+ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19, -+ HEVC_SEI_TYPE_POST_FILTER_HINT = 22, -+ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23, -+ HEVC_SEI_TYPE_FRAME_PACKING = 45, -+ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47, -+ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128, -+ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129, -+ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130, -+ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131, -+ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132, -+ HEVC_SEI_TYPE_SCALABLE_NESTING = 133, -+ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134, -+ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137, -+ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144, -+ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147, -+} HEVC_SEI_Type; -+ -+typedef struct HEVCSEIPictureHash { -+ uint8_t md5[3][16]; -+ uint8_t is_md5; -+} HEVCSEIPictureHash; -+ -+typedef struct HEVCSEIFramePacking { -+ int present; -+ int arrangement_type; -+ int content_interpretation_type; -+ int quincunx_subsampling; -+ int current_frame_is_frame0_flag; -+} HEVCSEIFramePacking; -+ -+typedef struct HEVCSEIDisplayOrientation { -+ int present; -+ int anticlockwise_rotation; -+ int hflip, vflip; -+} HEVCSEIDisplayOrientation; -+ -+typedef struct HEVCSEIPictureTiming { -+ int picture_struct; -+} HEVCSEIPictureTiming; -+ -+typedef struct HEVCSEIA53Caption { -+ int a53_caption_size; -+ uint8_t *a53_caption; -+} HEVCSEIA53Caption; -+ -+typedef struct HEVCSEIMasteringDisplay { -+ int present; -+ uint16_t display_primaries[3][2]; -+ uint16_t white_point[2]; -+ uint32_t max_luminance; -+ uint32_t min_luminance; -+} HEVCSEIMasteringDisplay; -+ -+typedef struct HEVCSEIContentLight { -+ int present; -+ uint16_t max_content_light_level; -+ uint16_t max_pic_average_light_level; -+} HEVCSEIContentLight; -+ -+typedef struct HEVCSEIAlternativeTransfer { -+ int present; -+ int preferred_transfer_characteristics; -+} HEVCSEIAlternativeTransfer; -+ -+typedef struct HEVCSEIContext { -+ HEVCSEIPictureHash picture_hash; -+ HEVCSEIFramePacking frame_packing; -+ HEVCSEIDisplayOrientation display_orientation; -+ HEVCSEIPictureTiming picture_timing; -+ HEVCSEIA53Caption a53_caption; -+ HEVCSEIMasteringDisplay mastering_display; -+ HEVCSEIContentLight content_light; -+ int active_seq_parameter_set_id; -+ HEVCSEIAlternativeTransfer alternative_transfer; -+} HEVCSEIContext; -+ -+struct HEVCRpiParamSets; -+ -+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, -+ const struct HEVCRpiParamSets *ps, int type); -+ -+/** -+ * Reset SEI values that are stored on the Context. -+ * e.g. Caption data that was extracted during NAL -+ * parsing. -+ * -+ * @param s HEVCRpiContext. 
-+ */ -+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s); -+ -+#endif /* AVCODEC_RPI_HEVC_SEI_H */ -diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c -new file mode 100644 -index 0000000000..23b49a99ae ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.c -@@ -0,0 +1,1537 @@ -+#include "rpi_hevc_shader.h" -+ -+#ifdef _MSC_VER -+ #include -+ /* cast through uintptr_t to avoid warnings */ -+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) -+#else -+ #define POINTER_TO_UINT(X) ((unsigned int)(X)) -+#endif -+ -+#ifdef __cplusplus -+extern "C" { /* the types are probably wrong... */ -+#endif -+#ifdef __cplusplus -+} -+#endif -+ -+#ifdef _MSC_VER -+__declspec(align(8)) -+#elif defined(__GNUC__) -+__attribute__((aligned(8))) -+#endif -+unsigned int ff_hevc_rpi_shader[] = { -+// ::mc_setup_c_q0 -+// ::mc_start -+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_c_qn -+/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif -+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift -+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch -+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 -+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num -+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif -+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000118] */ 
0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif -+/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD -+/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y -+// :1 -+/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+// ::mc_filter_c_p -+/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 -+/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* 
[0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 -+/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // 
brr.anyn -, r:1b -+/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c_p_l1 -+/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 -+/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 -+/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x000004e8] */ 
0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next -+/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax -+/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c_b -+/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 -+/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00000628] */ 
0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif -+/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif -+/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif -+/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height -+/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 -+/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif -+/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif -+/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a -+/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b -+/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif -+/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif -+/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif -+/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y -+/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add -+/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif -+/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 -+/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val -+/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 -+/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 -+/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d -+/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif -+/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d -+// :1 -+/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 -+/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next -+/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next -+/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y -+/* [0x00000758] */ 0x93531789, 0xd80248e0, // 
max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 -+/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 -+/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 -+/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 -+/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 -+/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 -+/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax -+/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 -+/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 -+/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a -+/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b -+/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b -+/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 -+/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 -+/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 -+/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 -+/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 -+/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 -+/* 
[0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add -+/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height -+/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 -+/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b -+/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_sync_q0 -+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q1 -+/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q2 -+/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q3 -+/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_sync_q4 -+/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000a18] */ 0x0000001d, 0xe80009e7, 
// mov dst, sacq(i) -+/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q5 -+/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q6 -+/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q7 -+/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_sync_q8 -+/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q9 -+/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q10 -+/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync_q11 -+/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_exit_c_qn -+// ::mc_exit_y_qn -+/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 -+// :1 -+/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* 
[0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_exit_c_q0 -+// ::mc_exit_y_q0 -+/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 -+// :1 -+/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 -+/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_setup_y_q0 -+/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_y_qn -+/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif -+/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 -+/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 -+/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 -+/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1 -+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif -+/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch -+/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000d38] */ 0x129da1c0, 
0x10020827, // min r0, r0, rb_max_x -+/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 -+/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a -+/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a -+// :1 -+/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+// :per_block_setup_8 -+/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif -+/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif -+/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 -+/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif -+/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 
-+/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif -+/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init -+/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul -+/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 -+/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) -+/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add -+/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val -+/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif -+/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif -+/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 -+/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 -+/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d -+/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c -+/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif -+/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 -+/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 -+/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif -+/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 -+/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d -+/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c -+/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d -+/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c -+/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 -+/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif -+/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 -+// ::mc_filter_y_pxx -+/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 -+/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, 
// add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add 
r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height -+/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next -+/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next -+/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b -+/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y_bxx -+/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 -+/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 -+/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, 
rb_base2, r2 ; mov ra8, ra9 -+/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 -+/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off -+/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 -+/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add -+/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next -+/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 -+/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height -+/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch -+/* [0x00001388] */ 0x925f23bf, 0x12020867, // min 
r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 -+/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b -+/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y_p00 -+/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num -+/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 -+/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif -+/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a -+/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif -+/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif -+/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init -+/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift -+/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 -+/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif -+/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base -+// :1 -+/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x000014d0] */ 0xffffff90, 
0xf06809e7, // brr.anyn -, r:1b -+/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 -+/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b -+/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y_b00 -+/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 -+/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1 -+/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 -+/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 -+/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 -+// :1 -+/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 -+/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next -+/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax -+/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 -+/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 -+/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf 
ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b -+/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_setup_c10_q0 -+/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_c10_qn -+/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif -+/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 -+/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift -+/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch -+/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num -+/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 -+/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 -+/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x -+/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 -+/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif -+/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 -+/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* 
[0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift -+/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a -+/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif -+/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch -+/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD -+/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 -+/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y -+// :1 -+/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+// ::mc_filter_c10_p -+/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov 
ra_dest, unif -+/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 -+/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001a10] */ 0x00000000, 
0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c10_p_l1 -+/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 -+/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif -+/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif -+/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif -+/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height -+/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif -+/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif -+/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val -+/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c -+/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 -+/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 -+/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif -+// :1 -+/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 -+/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next -+/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax 
-+/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 -+/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b -+/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a -+/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height -+/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b -+/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_c10_b -+/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif -+/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif -+/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 -+/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif -+/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif -+/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif -+/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul -+/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* 
[0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height -+/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif -+/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 -+/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif -+/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif -+/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a -+/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b -+/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif -+/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif -+/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif -+/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y -+/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add -+/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif -+/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 -+/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val -+/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 -+/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 -+/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d -+/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif -+/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d -+// :1 -+/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 -+/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next -+/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next -+/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y -+/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 -+/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 -+/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask -+/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 -+/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 -+/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn 
r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 -+/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 -+/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 -+/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 -+/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 -+/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch -+/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax -+/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 -+/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 -+/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a -+/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b -+/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b -+/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 -+/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 -+/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 -+/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 -+/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 -+/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 -+/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add -+/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height -+/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 -+/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 
-+/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b -+/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_sync10_q0 -+/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) -+/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q1 -+/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q2 -+/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q3 -+/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) -+/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_sync10_q4 -+/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q5 -+/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q6 -+/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002088] */ 
0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q7 -+/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) -+/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_sync10_q8 -+/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q9 -+/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q10 -+/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) -+/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) -+// ::mc_sync10_q11 -+/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) -+/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_exit_c10_q0 -+// ::mc_exit_y10_q0 -+/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 -+// :1 -+/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) -+/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 -+/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_exit_c10_qn -+// ::mc_exit_y10_qn -+/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 -+// :1 -+/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 -+/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 -+/* [0x00002228] 
*/ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_setup_y10_q0 -+/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) -+// ::mc_setup_y10_qn -+/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif -+/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif -+/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 -+/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask -+/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 -+/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 -+/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 -+/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif -+/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 -+/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift -+/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif -+/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch -+/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 -+/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add 
rb_base2, ra11, r0 -+/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a -+/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a -+// :1 -+/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 -+/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 -+/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 -+/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 -+/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b -+/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 -+/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 -+/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+// :per_block_setup_10 -+/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif -+/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a -+/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif -+/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 -+/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 -+/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif -+/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif -+/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init -+/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002518] */ 
0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul -+/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 -+/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) -+/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add -+/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val -+/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif -+/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif -+/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 -+/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 -+/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d -+/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c -+/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif -+/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 -+/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 -+/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif -+/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 -+/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d -+/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c -+/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d -+/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c -+/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 -+/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif -+/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 -+// ::mc_filter_y10_pxx -+/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 -+/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x00002660] */ 
0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height -+/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; 
mov.ifz rb_base2, rb_base2_next -+/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next -+/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 -+/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 -+/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b -+/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_p00 -+/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num -+/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 -+/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif -+/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift -+/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a -+/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif -+/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 -+/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif -+/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif -+/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init -+/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift -+/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 -+/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif -+/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base -+// :1 -+/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, 
ra_y_next ; ldtmu0 -+/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 -+/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b -+/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_bxx -+/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 -+/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 -+/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+// :1 -+/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 -+/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 -+/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 -+/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 -+/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 -+/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 -+/* [0x00002a10] */ 
0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 -+/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 -+/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 -+/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off -+/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 -+/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 -+/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add -+/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next -+/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next -+/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 -+/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height -+/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch -+/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* 
[0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 -+/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 -+/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b -+/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_filter_y10_b00 -+/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 -+/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num -+/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 -+/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1 -+/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 -+/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 -+/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 -+// :1 -+/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 -+/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch -+/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next -+/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax -+/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 -+/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height -+/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b -+/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 -+/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait -+/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 -+/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link -+/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 
-+/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest -+/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 -+/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b -+/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 -+/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 -+/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init -+// ::mc_end -+}; -+#ifdef __HIGHC__ -+#pragma Align_to(8, ff_hevc_rpi_shader) -+#endif -diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h -new file mode 100644 -index 0000000000..79651c9b6c ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.h -@@ -0,0 +1,63 @@ -+#ifndef rpi_hevc_shader_H -+#define rpi_hevc_shader_H -+ -+extern unsigned int ff_hevc_rpi_shader[]; -+ -+#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) -+#define mc_start (ff_hevc_rpi_shader + 0) -+#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) -+#define mc_filter_c_p (ff_hevc_rpi_shader + 134) -+#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260) -+#define mc_filter_c_b (ff_hevc_rpi_shader + 386) -+#define mc_sync_q0 (ff_hevc_rpi_shader + 580) -+#define mc_sync_q1 (ff_hevc_rpi_shader + 598) -+#define mc_sync_q2 (ff_hevc_rpi_shader + 610) -+#define mc_sync_q3 (ff_hevc_rpi_shader + 622) -+#define mc_sync_q4 (ff_hevc_rpi_shader + 634) -+#define mc_sync_q5 (ff_hevc_rpi_shader + 652) -+#define mc_sync_q6 (ff_hevc_rpi_shader + 664) -+#define mc_sync_q7 (ff_hevc_rpi_shader + 676) -+#define mc_sync_q8 (ff_hevc_rpi_shader + 688) -+#define mc_sync_q9 (ff_hevc_rpi_shader + 706) -+#define mc_sync_q10 (ff_hevc_rpi_shader + 718) -+#define mc_sync_q11 (ff_hevc_rpi_shader + 730) -+#define mc_exit_c_qn (ff_hevc_rpi_shader + 742) -+#define mc_exit_y_qn (ff_hevc_rpi_shader + 742) -+#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760) -+#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760) -+#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780) -+#define mc_setup_y_qn (ff_hevc_rpi_shader + 782) -+#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014) -+#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140) -+#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272) -+#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358) -+#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432) -+#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434) -+#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562) -+#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684) -+#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806) -+#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996) -+#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014) -+#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026) -+#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038) -+#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050) -+#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068) -+#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080) -+#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092) -+#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104) -+#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122) -+#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134) -+#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146) -+#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158) -+#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158) -+#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178) -+#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178) -+#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196) -+#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198) -+#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440) -+#define mc_filter_y10_p00 
(ff_hevc_rpi_shader + 2566) -+#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654) -+#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786) -+#define mc_end (ff_hevc_rpi_shader + 2860) -+ -+#endif -diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm -new file mode 100644 -index 0000000000..af5b59e181 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader.qasm -@@ -0,0 +1,1850 @@ -+# Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+# All rights reserved. -+# -+# Redistribution and use in source and binary forms, with or without -+# modification, are permitted provided that the following conditions are met: -+# * Redistributions of source code must retain the above copyright -+# notice, this list of conditions and the following disclaimer. -+# * Redistributions in binary form must reproduce the above copyright -+# notice, this list of conditions and the following disclaimer in the -+# documentation and/or other materials provided with the distribution. -+# * Neither the name of the copyright holder nor the -+# names of its contributors may be used to endorse or promote products -+# derived from this software without specific prior written permission. -+# -+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+# -+# Written by Peter de Rivaz, John Cox -+ -+ -+ -+# Inter pred asm -+# -+# Logic here should be good to 14 bits without modification -+# but only 8 & 10 are currently instantiated & tested -+# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow -+# in _p00 & _b00 -+ -+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress -+# the warning that we are using rotation & ra/rb registers. r0..3 can be -+# rotated through all 16 elems ra regs can only be rotated through their -+# local 4. As it happens this is what is wanted here as we do not want the -+# constants from the other half of the calc. -+ -+# Number limits in P/B calculation -+# -+# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier -+# we offset our intermediates s.t. they always end up +ve before the next -+# multiply (may be -ve whilst summing but that doesn't matter). 
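The offsetting scheme above is easier to see in scalar form. Below is a minimal C model of the trick, assuming only that mul24 behaves as the unsigned 24-bit multiply described; the helper names are illustrative and not from the patch:

    #include <stdint.h>

    /* QPU mul24 model: unsigned multiply of the low 24 bits of each operand */
    static uint32_t mul24(uint32_t a, uint32_t b)
    {
        return (a & 0xffffff) * (b & 0xffffff);
    }

    /* Apply a possibly negative weight wt in [-128, 255] to a non-negative
     * intermediate v using only unsigned multiplies: bias the weight so it
     * is +ve, multiply, then subtract the bias product.  This is the same
     * idea the loops below use via ra_kmul_add, with k = MUL_ADD = 0x4000. */
    static int32_t apply_weight(uint32_t v, int32_t wt, uint32_t k)
    {
        /* v*(wt + k) - v*k == v*wt; both multiplicands stay +ve as long as
         * wt >= -k, which holds for the wt_mul range given below */
        return (int32_t)(mul24(v, (uint32_t)(wt + (int32_t)k)) - mul24(v, k));
    }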
-+#
-+# Range calc for up to 14 bits (Y-B pred):
-+#
-+# denom: [0, 7]
-+# bmax = (1 << bits) - 1
-+# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
-+#
-+# wt_mul: [-128, 255]
-+# wt_off = off * 2 + 1: [-bmax, bmax]
-+#
-+# pel: [0, bmax]
-+# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
-+# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
-+# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
-+# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4]
-+# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
-+# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
-+#
-+# This all looks good and is mostly bit depth independent - and as we manage
-+# to do unsigned multiplies everywhere (now) this should be good for any bit
-+# depth up to 14 (we could probably do 16 - but that requires a few tweaks
-+# to the shifts we don't currently have logic for)
-+
-+# PREREAD is the number of requests that we have sitting in the TMU request
-+# queue.
-+#
-+# There are 8 slots available in the TMU request Q for tm0s requests, but
-+# only 4 output FIFO entries and overflow is bad (corruption or crash)
-+# (If threaded then only 2 out FIFO entries, but we aren't.)
-+# In s/w we are effectively limited to the min vertical read which is >= 4
-+# so output FIFO is the limit.
-+#
-+# As the test for read-next is in the main part of the Luma loop (rather than
-+# the preload FIFO part) we are limited to min_luma_height - 1
-+# Min_luma_height is 4 so we can only have a preload of 3
-+# Beware that min_chroma_height (and width) is 2 so we can't do the same trick
-+# in chroma without abandoning preload pretty much entirely (which would be bad)
-+#
-+# Timing tests vs preload of 4 suggest this doesn't hurt us much
-+# Could have preread 4 for Chroma but when tested it didn't help
-+
-+.set PREREAD, 3
-+
-+# Offset added (effectively) at the exit of the H FIR filter
-+# This is enough to force the result +ve
-+# It is good if it is a power of 2 as that allows for >> without loss
-+#
-+# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
-+# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
-+# Round up to next power of 2
-+
-+.set FIR_OFFSET, 0x4000
-+
-+# Block heights - 8 & 16 are the only numbers we currently support
-+
-+.set C_BLK_HEIGHT_8, 16
-+.set C_BLK_HEIGHT_16, 8
-+.set Y_BLK_HEIGHT_8, 16
-+.set Y_BLK_HEIGHT_16, 8
-+
-+# QPU counts - depend on block size
-+# If we have a 2-byte format & block_size > 8 then we can only afford
-+# 8 QPUs
-+# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
-+
-+.set N_QPU_8, 12
-+.set N_QPU_16, 12
-+
-+# Value to add to the weight multiplier to convert it into an unsigned value
-+# Should be a power of two for convenience
-+
-+.set LOG2_MUL_ADD, 14
-+.set MUL_ADD, (1 << LOG2_MUL_ADD)
-+
-+# Fixed denom (max that it can be set to)
-+.set DENOM, 7
-+
-+# register allocation
-+#
-+
-+# ra0-3
-+# Used as temp and may be loop filter coeffs (split into .8s)
-+# or temp in loop. Check usage on an individual basis.
-+
-+# ra4-11
-+# V FIFO / temp / free
-+
-+# -- free -- ra12
-+
-+# -- free -- ra13
-+
-+# -- free -- ra14
-+
-+# -- free -- ra15
-+
-+# uniform: width:height
-+.set ra_width_height, ra16
-+.set ra_width, ra16.16b
-+.set ra_height, ra16.16a
-+
-+# y:y2 same layout as y_y2_next so we can update both together
-+.set ra_y_y2, ra17
-+.set ra_y2, ra17.16a
-+.set ra_y, ra17.16b
-+
-+# uniform: L1 weight (U on left, V on right)
-+# Only used in Y B
-+.set ra_wt_off_mul_l1, ra18
-+.set ra_wt_off_l1, ra18.16b
-+.set ra_wt_mul_l1, ra18.16a
-+
-+# y_next:y2_next same layout as y_y2 so we can update both together
-+.set ra_y_y2_next, ra19
-+.set ra_y_next, ra19.16b
-+.set ra_y2_next, ra19.16a
-+
-+# Setup: consts - subdivide a single register
-+.set ra_kff800100, ra20
-+.set ra_k256, ra20.16a
-+.set ra_k0, ra20.8a
-+.set ra_k1, ra20.8b
-+.set ra_k128, ra20.8c
-+.set ra_k255, ra20.8d
-+
-+# Loop: xshifts
-+.set ra_xshift, ra21.16a
-+.set ra_xshift_next, ra21.16b
-+
-+# Loop var: L0 weight (U on left, V on right)
-+# _off_ is not used in loop as we want to modify it before use
-+.set ra_wt_off_mul_l0, ra22
-+.set ra_wt_mul_l0, ra22.16a
-+.set ra_wt_off_l0, ra22.16b
-+
-+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
-+# * Could merge with rb_pmask. For 10 bit, logically pmask needs 0xff in the
-+# 2nd byte, but as the source should never be > 3 there, 0x3ff should do
-+.set ra_blk_height_pmax, ra23
-+.set ra_pmax, ra23.16a
-+.set ra_blk_height, ra23.8c
-+# -- free -- ra23.8d
-+
-+# Loop: src frame base (L0)
-+.set ra_base, ra24
-+
-+# Misc offsets
-+.set ra_fir_off_val_wt_den_p7, ra25
-+.set ra_wt_den_p7, ra25.8a
-+# -- free -- ra25.8b
-+.set ra_fir_off_val, ra25.16b
-+
-+# As it happens these constants are the same
-+.if FIR_OFFSET == MUL_ADD
-+# Weight multiplier unsigned add
-+.set ra_kmul_add, ra_fir_off_val
-+.else
-+.error "FIR_OFFSET != MUL_ADD: Need new register & init"
-+.endif
-+
-+# Loop: next src frame base (L0)
-+.set ra_base_next, ra26
-+
-+# Loop: height<<23 + width<<16 + vdw_setup_0
-+.set ra_dma0, ra27
-+
-+# Loop: destination address
-+.set ra_dest, ra28
-+
-+# Setup: Dup of rb_ef
-+# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul
-+# (top bits are ignored by mul24)
-+.set ra_ef, ra29
-+
-+# Use an even numbered register as a link register to avoid corrupting flags
-+.set ra_link, ra30
-+
-+# -- free -- ra31
-+
-+.set rb_xshift2, rb0
-+.set rb_xshift2_next, rb1
-+
-+# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
-+.set rb_elem_x, rb2
-+
-+# El Flags
-+# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
-+# Duped into ra_ef as sometimes that is easier to use
-+.set rb_ef, rb3
-+
-+# rb4-11
-+# Loop: V filter FIFO or V filter coeff
-+
-+# Loop var: offset to add before shift (round + weighting offsets)
-+# Exact value varies by loop
-+.set rb_wt_off, rb12
-+
-+# -- free -- rb13
-+
-+# -- free -- rb14
-+
-+# Loop: src frame base (L1)
-+.set rb_base2, rb15
-+
-+# Line pitch (128 for sand128)
-+.set rb_pitch, rb16
-+
-+# Loop count - 2 (set up TMU for next xfer)
-+.set rb_i_tmu, rb17
-+
-+# Loop count for min(height, 16)
-+# Y will reset & loop again if height > 16
-+.set rb_lcount, rb18
-+
-+# frame_base2_next
-+.set rb_base2_next, rb19
-+
-+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
-+# offset to the slice
-+.set rb_xpitch, rb20
-+
-+# These 3 consts each save 1 instruction in Y loop setup
-+# so whilst they are worthwhile they should be the 1st to die if we need
-+# another b reg
-+.set rb_y_coeffs_2, rb21    # 0x050b0a00
-+.set rb_y_coeffs_3, rb22    # 0x11283a40
-+.set rb_y_coeffs_5, rb23    # 0x0a0b0500
-+
-+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
-+.set rb_pmask, rb24
-+
-+# vdw_setup_1(dst_pitch)
-+.set rb_dma1_base, rb25
-+
-+# Setup: pic width - 1
-+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width - 1)*4 etc.
-+.set rb_max_x, rb26
-+
-+# vdw_setup_0 (depends on QPU number)
-+.set rb_dma0_base, rb27
-+
-+# Setup: vw_setup value to reset VPM write pointer
-+.set rb_vpm_init, rb28
-+
-+# Loop: vdw_setup_1(dst_pitch-width) = stride
-+.set rb_dma1, rb29
-+
-+# Setup: pic_height - 1
-+.set rb_max_y, rb30
-+
-+# Setup: FIR H offset
-+.set rb_fir_off_h, rb31
-+
-+
-+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
-+.set i_shift16, -16 -+.set i_shift21, -11 -+.set i_shift23, -9 -+.set i_shift30, -2 -+ -+# Much of the setup code is common between Y & C -+# Macros that express this - obviously these can't be overlapped -+# so are probably unsuitable for loop code -+ -+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma -+ mov r2, qpu_num -+.if v_bit_depth <= 8 -+ # 8 bit version -+ asr r1, r2, 2 -+ shl r1, r1, 6 -+ and r0, r2, 3 -+ or r0, r0, r1 -+ -+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add r_vpm, r0, r1 # VPM 8bit storage -+ -+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+ shl r0, r0, 5 -+ -+.else -+ # 16 bit version -+ # Limited to 8 QPUs if blk height > 8 -+ asr r1, r2, 1 -+.if v_blk_height <= 8 -+ shl r1, r1, 4 -+.else -+ shl r1, r1, 5 -+.endif -+ and r0, r2, 1 -+ or r0, r0, r1 -+ -+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR -+ add r_vpm, r0, r1 -+ -+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into -+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) -+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later -+ shl r0, r0, 6 -+.endif -+ add r_dma, r0, r1 # DMA out -+.endm -+ -+ -+.macro m_setup_q0 -+ srel -, 12 -+.endm -+ -+# Code start label -+::mc_start -+ -+################################################################################ -+# mc_setup_c -+# -+# typedef struct qpu_mc_pred_c_s_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint32_t pic_cw; // C Width (== Y width / 2) -+# uint32_t pic_ch; // C Height (== Y Height / 2) -+# uint32_t stride2; -+# uint32_t stride1; -+# uint32_t wdenom; -+# int16_t y2; -+# int16_t x2; -+# uint32_t base2; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_s_t; -+ -+.macro m_setup_c, v_bit_depth -+ -+# Cannot use mul24 on x as x might be -ve, so must use shift -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_pmask, 0xff -+.set v_blk_height, C_BLK_HEIGHT_8 -+.else -+.set v_x_shift, 2 -+.set v_pmask, 0xffff -+.set v_blk_height, C_BLK_HEIGHT_16 -+.endif -+ -+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y -+ -+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base -+ -+# Read image dimensions -+ sub r0, unif, 1 # pic c width -+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes -+ sub rb_max_y, unif, 1 # pic c height -+ -+# load constants -+ mov ra_kff800100, 0xff800100 -+ mov rb_pmask, v_pmask -+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+ -+# get source pitch -+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2 -+ mov rb_pitch, unif # stride1 -+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly -+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 -+ -+ and r0, 1, elem_num -+ nop ; mul24 r0, r0, 5 -+.if v_bit_depth <= 8 -+ add rb_elem_x, r0, elem_num -+.else -+ add r0, r0, elem_num -+ add rb_elem_x, r0, r0 -+.endif -+ -+# Compute base address for first and second access -+# ra_base ends up with t0s base -+# ra_base2 ends up with t1s base -+ -+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] -+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice -+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y -+ min r0, r0, rb_max_x -+ -+# Get shift -+# Shift will always calculate as 0 for 9+ bit -+# Ideally we can optimize the shift out 
of the code in these cases but for now -+# it is tidier to leave it in -+.if v_bit_depth <= 8 -+ shl ra_xshift_next, r0, 3 -+.else -+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 -+.endif -+ -+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to -+ -+.if v_bit_depth <= 8 -+ and r0, r0, -4 -+.endif -+ sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2 -+ add ra_base, ra_base, r0 -+ -+# Compute part of VPM to use for DMA output -+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? -+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base -+ -+# And again for L1, but only worrying about frame2 stuff -+ -+# Compute base address for first and second access -+# ra_base ends up with t0s base -+# rb_base2 ends up with t1s base -+ -+ shl r0, ra0.16b, v_x_shift -+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset -+ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2 -+ min r0, r0, rb_max_x -+ -+# Get shift (already zero if 9+ bit so ignore) -+.if v_bit_depth <= 8 -+ shl rb_xshift2_next, r0, 3 -+.endif -+ -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs -+ -+.if v_bit_depth <= 8 -+ and r0, r0, -4 -+.endif -+ sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 ; mov r3, PREREAD -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r2, ra_y2 -+ add rb_base2, rb_base2, r0 ; mov r0, ra_y -+ -+# Do preloads -+# r0 = ra_y, r2 = ra_y2, r3 = PREREAD -+ -+:1 -+ sub.setf r3, r3, 1 -+ max r1, r0, 0 -+ min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 -+ -+ max r1, r2, 0 -+ brr.anynz -, r:1b -+ min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, rb_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz 1b -+ -+ mov ra_link, unif # link -+# touch registers to keep simulator happy (and fills in delay slots) -+ mov ra4, 0 ; mov rb4, 0 -+ bra -, ra_link -+ mov ra5, 0 ; mov rb5, 0 -+ mov ra6, 0 ; mov rb6, 0 -+ mov ra7, 0 ; mov rb7, 0 -+# >>> ra_link -+.endm -+ -+::mc_setup_c_q0 -+ m_setup_q0 -+::mc_setup_c_qn -+ m_setup_c 8 -+ -+################################################################################ -+# -+# mc_filter_c_p -+# -+# typedef struct qpu_mc_pred_c_p_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint16_t h; -+# uint16_t w; -+# uint32_t coeffs_x; -+# uint32_t coeffs_y; -+# uint32_t wo_u; -+# uint32_t wo_v; -+# uint32_t dst_addr_c; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_p_t; -+ -+.macro m_filter_c_p, v_tmu, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+.set v_v_shift, 8 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 2 -+.set v_x_mul, 4 -+.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+.if v_tmu == 0 -+.set vrx_xshift, rb_xshift2 # b side more convienient -+.set vrx_xshift_next, ra_xshift_next -+.set vra_y_next, ra_y_next -+.set vrx_base_next, ra_base_next -+.set vra_y, ra_y -+.set vra_base, ra_base -+.set vr_txs, t0s -+.else -+.set vrx_xshift, ra_xshift # a side more convienient -+.set vrx_xshift_next, rb_xshift2_next -+.set vra_y_next, ra_y2_next -+.set vrx_base_next, rb_base2_next -+.set vra_y, ra_y2 -+.set vra_base, rb_base2 -+.set vr_txs, t1s -+.endif -+ -+# denom shift values -+.set 
i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+# per-channel shifts were calculated on the *previous* invocation -+# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y -+ -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base -+ -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 -+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs -+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a -+ -+.if v_bit_depth <= 8 -+ shl vrx_xshift_next, r0, 3 -+ and r0, r0, -4 -+.endif -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs -+ add vrx_base_next, r3, r0 ; mov r1, ra_height -+ -+# set up VPM write -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight -+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight -+ -+# Misc final setup... -+ -+ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr -+ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2) -+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register -+ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight -+ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 -+ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add -+ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4) -+ mov rb11, ra3.8d ; mov ra_link, unif # ; Link -+ -+# r5 = -4 (loop counter) -+# ra_wt_mul_l0 = weight L0 + 128 (now unsigned) -+# rb_wt_off = (offset * 2 + 1) << (wt_den + 5) -+# rb31 = FIR value offset -+ -+# FIFO: rb4, ra5, rb6, ra7 -+# Coeffs in ra3.8a, ra3.8b, rb10, rb11 -+ -+# We want (r0r1) -+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... -+# We fetch (after shift) -+# C0 : C3 : C1 : C4 : C2 : C5 : ... 
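The loop that follows evaluates the 4-tap chroma FIR with the coefficient magnitudes held in ra0 (absolute values, so mul24 works) and the -,+,+,- sign pattern baked into the add/sub sequence, using rb_fir_off_h as the positive bias. A scalar C sketch of the same computation (illustrative, not patch code):

    #include <stdint.h>

    /* 4-tap chroma H filter as evaluated in the loop below: coeff[0..3]
     * are absolute values, the signs are fixed at -, +, +, -, and fir_off
     * (rb_fir_off_h) keeps the result non-negative for later mul24 use. */
    static uint32_t chroma_hfilter(const uint8_t coeff[4], const uint8_t px[4],
                                   uint32_t fir_off)
    {
        return fir_off - (uint32_t)coeff[0] * px[0]
                       + (uint32_t)coeff[1] * px[1]
                       + (uint32_t)coeff[2] * px[2]
                       - (uint32_t)coeff[3] * px[3];
    }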
-+ -+:1 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+.if v_tmu == 0 -+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 -+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+.else -+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 -+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay] -+.endif -+ -+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+ min r3, r3, rb_max_y ; mov.ifnc r0, r2 -+ -+ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch -+.if v_tmu == 0 -+ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes -+.else -+ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes -+.endif -+ -+# apply horizontal filter -+# The filter coeffs for the two halves of this are the same (unlike in the -+# Y case) so it doesn't matter which ra0 we get them from -+# Also as the two halves are locked together we don't need to separate the 1st -+# r0 mul or the last r1 mul as they are valid for all QPUs -+ -+ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 -+ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 -+ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+ -+# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift) -+# We would like to save the r5->r4 shift but we need a delay slot -+# for both r7 & r6 which we can't find anything to put in if we have -+# already multiplied r4 & r5! 
-+ brr.anyn -, r:1b -+ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post -+ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post -+ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 -+# >>> .anyn 1b -+ -+ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay] -+ sub r1, r1, r0 ; mul24 r0, ra7, rb11 -+ sub r1, r1, r0 -+ -+ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop -+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop -+ brr.anyn -, r:1b -+ asr r1, r1, i_wt_den_p6 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> .anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_c_p -+ m_filter_c_p 0, 8 -+ -+::mc_filter_c_p_l1 -+ m_filter_c_p 1, 8 -+ -+################################################################################ -+# -+# mc_filter_c_b -+# -+# typedef struct qpu_mc_pred_c_b_s { -+# int16_t y; -+# int16_t x; -+# uint32_t base; -+# uint16_t h; -+# uint16_t w; -+# uint32_t coeffs_x1; -+# uint32_t coeffs_y1; -+# int16_t weight_u1; -+# int16_t weight_v1; -+# int16_t y2; -+# int16_t x2; -+# uint32_t base2; -+# uint32_t coeffs_x2; -+# uint32_t coeffs_y2; -+# uint32_t wo_u2; -+# uint32_t wo_v2; -+# uint32_t dst_addr_c; -+# uint32_t next_fn; -+# } qpu_mc_pred_c_b_t; -+ -+.macro m_filter_c_b, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_v_shift, 8 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 2 -+.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+.set v_x_mul, (1 << v_x_shift) -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+# per-channel shifts were calculated on the *previous* invocation -+ -+# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y -+ -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base -+ -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 -+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs -+ -+.if v_bit_depth <= 8 -+ shl ra_xshift_next, r0, 3 -+.endif -+ -+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) -+ xor r0, r0, r1 ; mul24 r1, 
r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B -+ -+# set up VPM write -+ -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight -+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight -+ -+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 -+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base -+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register -+ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x -+ -+# L1 - uniform layout could possibly be optimized -+ -+ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<>> .anyn 1b -+ -+ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0 -+ sub.setf -, r5, rb_lcount ; mov r0, ra4 -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ add r1, r1, r0 ; mul24 r0, ra7, rb7 -+ -+ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1 -+ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1 -+ sub r2, r2, r0 -+ -+ shr r1, r1, 6 -+ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 -+ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 -+ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add -+ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop -+ -+ brr.anyn -, r:1b -+ asr r1, r1, ra_wt_den_p7 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> .anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_c_b -+ m_filter_c_b 8 -+ -+################################################################################ -+# Exit code used by both Luma & Chroma so place between them to avoid I-cache -+# conflicts -+ -+.macro m_exit_drain -+.if PREREAD == 2 -+# Special case 2 as loop is wasteful -+ nop ; nop ; ldtmu0 -+ nop ; nop ; ldtmu1 -+ nop ; nop ; ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 -+.else -+ mov.setf r3, PREREAD - 1 -+:1 -+ brr.anynz -, r:1b -+ nop ; nop ; ldtmu0 -+ nop ; nop ; ldtmu1 -+ sub.setf r3, r3, 1 -+ # >>> -+ mov -, vw_wait -+.endif -+.endm -+ -+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) -+# All qpus start at the beginning and after that (group - 1) must have finished -+# before (group) can start -+# -+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain -+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - -+# lockup otherwise) -+# -+# There is some, currently ill defined, potential lockup if we have the VDM active -+# whilst doing sem stuff so we wait first. 
?? QPU stall from sem stalls VDM pipe too ?? -+# -+# The code stalled when I had many waiters on a single sem so we have a -+# "ripple" of srels to restart. Unsure why, may have been bug, but this works -+# and we currently have both the memory & sems to support it. -+.macro m_sync_q, n_qpu, n_quads -+# Do not generate code for qpu >= quads * 4 - fns should never be called -+.if n_qpu < n_quads * 4 -+ mov ra_link, unif # Can only branch to an a reg (not r0) -+ mov -, vw_wait # [ra_link delay] -+ -+.set n_sem_sync, n_qpu - (n_qpu % 4) -+.set n_sem_in, n_qpu -+.set n_sem_out, n_qpu + 1 -+ -+.if n_qpu % 4 == 0 -+ -+.set n_sem_quad_in, 12 + n_qpu / 4 -+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) -+ -+ sacq -, n_sem_sync -+ sacq -, n_sem_sync -+ sacq -, n_sem_sync -+ bra -, ra_link -+ sacq -, n_sem_quad_in -+ srel -, n_sem_out -+ srel -, n_sem_quad_out -+ -+.else -+ bra -, ra_link -+ srel -, n_sem_sync -+ sacq -, n_sem_in -+.if n_sem_out % 4 != 0 -+ srel -, n_sem_out -+.else -+ nop -+.endif -+.endif -+.endif -+.endm -+ -+.set v_quads8, N_QPU_8 / 4 -+ -+::mc_sync_q0 -+ m_sync_q 0, v_quads8 -+::mc_sync_q1 -+ m_sync_q 1, v_quads8 -+::mc_sync_q2 -+ m_sync_q 2, v_quads8 -+::mc_sync_q3 -+ m_sync_q 3, v_quads8 -+::mc_sync_q4 -+ m_sync_q 4, v_quads8 -+::mc_sync_q5 -+ m_sync_q 5, v_quads8 -+::mc_sync_q6 -+ m_sync_q 6, v_quads8 -+::mc_sync_q7 -+ m_sync_q 7, v_quads8 -+::mc_sync_q8 -+ m_sync_q 8, v_quads8 -+::mc_sync_q9 -+ m_sync_q 9, v_quads8 -+::mc_sync_q10 -+ m_sync_q 10, v_quads8 -+::mc_sync_q11 -+ m_sync_q 11, v_quads8 -+ -+# mc_exit() -+# Chroma & Luma the same now -+ -+.macro m_exit_qn -+ m_exit_drain -+ nop ; nop ; thrend -+ nop -+ nop -+# >>> thrend <<< -+.endm -+ -+::mc_exit_c_qn -+::mc_exit_y_qn -+ m_exit_qn -+ -+ -+ -+# mc_interrupt_exit12() -+ -+.macro m_exit_q0 -+ m_exit_drain -+ sacq -, 12 -+ nop ; nop ; thrend -+ mov interrupt, 1 -+ nop -+# >>> thrend <<< -+.endm -+ -+::mc_exit_c_q0 -+::mc_exit_y_q0 -+ m_exit_q0 -+ -+# LUMA CODE -+ -+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
-+# For P frames we make the second x,y coordinates offset by +8 -+ -+ -+################################################################################ -+# mc_setup -+# -+# typedef struct qpu_mc_pred_y_s_s { -+# qpu_mc_src_t next_src1; -+# qpu_mc_src_t next_src2; -+# uint16_t pic_h; -+# uint16_t pic_w; -+# uint32_t stride2; -+# uint32_t stride1; -+# uint32_t wdenom; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_s_t; -+ -+.macro m_setup_y, v_bit_depth -+ -+# Cannot use mul24 on x as x might be -ve, so must use shift -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_pmask, 0xff -+.set v_blk_height, Y_BLK_HEIGHT_8 -+.else -+.set v_x_shift, 1 -+.set v_pmask, 0xffff -+.set v_blk_height, Y_BLK_HEIGHT_16 -+.endif -+ -+ -+ # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y -+ mov ra9, unif # ref_y_base -+ mov ra1, unif # x2_y2 -+ -+ -+# load constants -+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base -+ -+ mov ra_kff800100, 0xff800100 -+ mov rb_pmask, v_pmask -+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) -+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) -+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) -+ mov rb_y_coeffs_2, 0x050b0a00 -+ mov rb_y_coeffs_3, 0x11283a40 -+ mov rb_y_coeffs_5, 0x0a0b0500 -+ -+# Compute part of VPM to use -+ -+# Read image dimensions -+ mov ra3, unif # width_height -+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2 -+.if v_x_shift == 0 -+ sub rb_max_x, ra3.16b, 1 -+.else -+ sub r0, ra3.16b, 1 -+ shl rb_max_x, r0, v_x_shift -+.endif -+ sub rb_max_y, ra3.16a, 1 -+ mov r3, elem_num ; mov rb_pitch, unif # stride1 -+ -+# get destination pitch -+ mov r1, vdw_setup_1(0) # [rb_pitch delay] -+ or rb_dma1_base, r1, rb_pitch -+ -+# Compute base address for first and second access -+ add r0, ra0.16b, r3 # Load x + elem_num -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ -+# X is byte offset - we can only load words - mask -+ -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base, ra9, r0 -+ -+ # r3 still contains elem_num -+ add r0, ra1.16b, r3 # Load x -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts -+ -+ # r2 still contains mask -+ and r0, r0, -4 -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add rb_base2, ra11, r0 -+ -+# Do preloads -+ nop ; mov r0, ra0.16a # ; r0 = y -+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 -+ -+:1 -+ sub.setf r3, r3, 1 -+ max r1, r0, 0 -+ min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 -+ -+ max r1, r2, 0 -+ brr.anynz -, r:1b -+ min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, rb_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz 1b -+ -+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base -+ -+ mov ra_link, unif # Next fn -+ -+# touch vertical context to keep simulator happy -+ mov ra8, 0 ; mov rb8, 0 # [ra_link delay] -+ bra -, ra_link -+ mov ra9, 0 ; mov rb9, 0 -+ mov ra10, 0 ; mov rb10, 0 -+ mov ra11, 0 ; mov rb11, 0 -+# >>> ra_link -+.endm -+ 
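As with m_setup_c, the per-QPU VPM and VDW DMA values above come from m_calc_dma_regs. For the 8-bit case the address arithmetic reduces to the C sketch below (illustrative only; the vpm_setup()/vdw_setup_0() base values are opaque firmware-header constants and are left symbolic):

    /* Per-QPU VPM slot offset for the 8-bit case, as computed by
     * m_calc_dma_regs: QPU n maps to (n / 4) in bits 7:6 and (n % 4)
     * in bits 1:0. */
    static unsigned int qpu_vpm_offset_8(unsigned int qpu_num)
    {
        return ((qpu_num >> 2) << 6) | (qpu_num & 3);
    }
    /* rb_vpm_init  = vpm_setup(0, 4, h8p(0, 0))          + qpu_vpm_offset_8(n)
     * rb_dma0_base = vdw_setup_0(0, 0, dma_h8p(0, 0, 0)) + (qpu_vpm_offset_8(n) << 5) */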
-+::mc_setup_y_q0 -+ m_setup_q0 -+::mc_setup_y_qn -+ m_setup_y 8 -+ -+################################################################################ -+# -+# Start of per-block setup code -+# P and B blocks share the same setup code to save on Icache space -+ -+# get base addresses and per-channel shifts for *next* invocation -+# per-channel shifts were calculated on the *previous* invocation -+ -+# 1st 3 instructions of per_block-setup in branch delay -+# -+# typedef struct qpu_mc_pred_y_p_s { -+# qpu_mc_src_t next_src1; -+# qpu_mc_src_t next_src2; -+# uint16_t h; -+# uint16_t w; -+# uint32_t mymx21; -+# uint32_t wo1; -+# uint32_t wo2; -+# uint32_t dst_addr; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_p_t; -+# -+ -+.macro m_luma_setup, v_bit_depth -+# Hack - QASM may well have have label pasting but I have no idea how... -+.if v_bit_depth == 8 -+ brr ra_link, r:per_block_setup_8 -+.elif v_bit_depth == 10 -+ brr ra_link, r:per_block_setup_10 -+.endif -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 -+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+.endm -+ -+.macro m_per_block_setup, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_x_mul, 1 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x -+ -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 -+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base -+ and r1, r0, r2 ; mov ra_y_next, ra0.16a -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y -+ add ra_base_next, ra_base_next, r0 # [ra1 delay] -+ -+ add r0, ra1.16b, r3 # Load x2 -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base -+ shl rb_xshift2_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height -+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes -+ add rb_base2_next, rb_base2_next, r0 -+ -+# get width,height of block (unif load above), r1 = width * pel_size -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height -+ add rb_lcount, r0, (7-8) -+ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val -+ add r0, r0, r1 # Combine width and height of destination area -+ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val -+ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets -+ -+# get filter coefficients and discard unused B frame values -+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight -+ shl ra8, r0, 3 ; mov rb5, ra_k255 -+ -+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) -+ -+# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val -+# but 
I can't see a way of doing that that is cheap enough to be worth it -+ -+# Picked out in a slightly random order to space out uniform loads -+ -+ # 1 -+ mov r1, 0x01040400 # [ra8 delay] -+ ror ra2.8b, r1, ra8.8d -+ ror ra0.8b, r1, ra8.8c -+ # 2 -+ ror ra2.8c, rb_y_coeffs_2, ra8.8d -+ ror ra0.8c, rb_y_coeffs_2, ra8.8c -+ # 0 -+ mov r1,0x00010100 # -ve [ra8 delay] -+ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset -+ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 -+ # 7 -+ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000 -+ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address -+ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 -+ # 3 -+ ror ra2.8d, rb_y_coeffs_3, ra8.8d -+ ror ra0.8d, rb_y_coeffs_3, ra8.8c -+ # 5 -+ ror ra3.8b, rb_y_coeffs_5, ra8.8d -+ ror ra1.8b, rb_y_coeffs_5, ra8.8c -+ # 6 -+ mov r1,0x04040100 -+ ror ra3.8c, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val -+ -+ bra -, ra_link -+ # 4 -+ mov r1,0x3a281100 -+ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val -+ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 -+# >>> branch ra_link -+ -+# r5 = -8 -+# r2 = fir_off_val -+# r3 = 128 -+.endm -+ -+:per_block_setup_8 -+ m_per_block_setup 8 -+ -+ -+ -+################################################################################ -+# -+# mc_filter_y_pxx -+# -+# Setup (& therefore uniform struct) shared with _bxx -+# Struct in m_luma_setup -+# -+# We can have 2 separate P reqs here as long as they mate to generate a -+# rectangular output block (i.e. h0 = h1, w0 = 8) -+# -+# At this point we have already issued PREREAD pairs of texture requests for the current block -+ -+.macro m_filter_y_pxx, v_bit_depth -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+ m_luma_setup v_bit_depth -+ -+ shl r1, ra_wt_off_l0, i_wt_den_p5 -+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul -+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+ -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# This loop is identical to the B loop from here ---> -+:1 -+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+ -+ max r2, ra_y, 0 ; mov r1, 0 -+ min r2, r2, rb_max_y ; mov r3, ra_k1 -+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+ add t0s, ra_base, r2 ; mov rb5, rb6 -+ shr r0, r4, ra_xshift ; mov rb6, rb7 -+ -+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes -+ shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+ add t1s, rb_base2, r2 ; mov ra8, ra9 -+ -+# apply horizontal filter -+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 
13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ -+ brr.anyn -, r:1b -+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+ # >>> .anyn 1b (r5 + r5) -+ -+ # apply vertical filter and write to VPM -+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 -+ -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb8 -+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+ add r1, r1, r0 ; mul24 r0, ra11, rb11 -+# <--- to here -+ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height -+ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next -+ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next -+ -+ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next -+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 -+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) -+ -+ brr.anyn -, r:1b -+ asr r1, r1, i_wt_den_p6 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+# >>> branch.anyn 1b (r5 - rb_lcount) -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_pxx -+ m_filter_y_pxx 8 -+ -+ -+################################################################################ -+ -+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+# -+# Setup (& therefore uniform struct) shared with _pxx -+# Struct in m_luma_setup -+# -+# l0 calc in els 0-7, L1 in 8-15 -+# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh) -+# -+# At this point we have already issued PREREAD pairs of texture requests for the current block -+ -+.macro m_filter_y_bxx, v_bit_depth -+ -+# denom shift values -+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) -+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) -+ -+ m_luma_setup v_bit_depth -+ -+ shl r1, ra_wt_off_l0, i_wt_den_p6 -+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 -+ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 -+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 -+ -+# This loop is identical to the P loop from here ---> -+:1 -+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef -+ -+ max r2, ra_y, 0 ; mov r1, 0 -+ min r2, r2, rb_max_y ; mov r3, ra_k1 -+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 -+ add t0s, ra_base, r2 ; mov rb5, rb6 -+ shr r0, r4, 
ra_xshift ; mov rb6, rb7 -+ -+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes -+ shr r1, r4, rb_xshift2 ; mov rb7, ra8 -+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax -+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch -+ add t1s, rb_base2, r2 ; mov ra8, ra9 -+ -+# apply horizontal filter -+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 -+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ -+ brr.anyn -, r:1b -+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b -+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 -+ # >>> .anyn 1b (r5 + r5) -+ -+ # apply vertical filter and write to VPM -+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 -+ -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb8 -+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c -+ add r1, r1, r0 ; mul24 r0, ra11, rb11 -+# <--- to here -+ sub r1, r1, ra4 -+ sub r1, r1, r0 ; mov r2, rb_wt_off -+ -+ asr r1, r1, 6 -+ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 -+ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add -+ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next -+ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next -+ add r1, r1, r2 ; mov r0, r1 << 8 -+ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height -+ -+ brr.anyn -, r:1b -+ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) -+# >>> branch.anyn 1b (r5 - rb_lcount) -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed block_height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link (ra_height - remaining height) -+ -+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_bxx -+ m_filter_y_bxx 8 -+ 
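Per pixel, the arithmetic in the _bxx loops lands on standard HEVC explicit bi-directional weighted prediction; the FIR_OFFSET bias terms are pre-folded into rb_wt_off so they cancel out of the final sum. A plain C statement of the target computation (a sketch with illustrative names, not patch code):

    /* HEVC explicit bi-pred weighting as produced by the _bxx loops:
     * p0/p1 are interpolated reference samples, w0/w1/o0/o1 the weights
     * and offsets, log2_wd the weight denominator (DENOM plus the
     * bit-depth dependent shift), bd the bit depth. */
    static int weighted_bipred(int p0, int p1, int w0, int w1,
                               int o0, int o1, int log2_wd, int bd)
    {
        const int pmax = (1 << bd) - 1;
        const int v = (p0 * w0 + p1 * w1 + ((o0 + o1 + 1) << log2_wd))
                      >> (log2_wd + 1);
        return v < 0 ? 0 : v > pmax ? pmax : v;  /* clip to pel range */
    }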
-+################################################################################ -+# -+# typedef struct qpu_mc_pred_y_p00_s { -+# qpu_mc_src_t next_src1; -+# uint16_t h; -+# uint16_t w; -+# uint32_t wo1; -+# uint32_t dst_addr; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_p00_t; -+ -+.macro m_filter_y_p00, v_bit_depth -+ -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_x_mul, 1 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+# Shifts to get width & height in the right place in ra_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+ -+ mov ra0, unif ; mov r0, elem_num # y_x -+ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0 -+ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ -+ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height -+ min r0, r0, rb_max_x ; mov ra_width_height, unif -+ -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 -+ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr -+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write -+ -+# get width,height of block (unif load above) -+# Compute vdw_setup1(dst_pitch-width) -+ shl r1, ra_width, v_x_shift -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+ add r0, r0, r1 # Combine width and height of destination area -+ shl rb_wt_off, ra_wt_off_l0, DENOM + 7 -+ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link -+ add ra_dma0, r0, rb_dma0_base -+ -+:1 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 -+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch -+ -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask -+ -+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 -+ shl r1, r1, 8 ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+ -+ brr.anyn -, r:1b -+ asr r1, r1, DENOM + 8 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_p00 -+ m_filter_y_p00 8 -+ 
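The _p00 path above serves integer-pel motion, so there is no FIR at all: the (off * 2 + 1) offset encoding together with the <<8 / >>(DENOM + 8) pair in the loop reduces to standard HEVC uni-directional weighted prediction. As a C sketch (illustrative names; log2_wd >= 1 assumed):

    /* HEVC uni-pred weighting as effected by the _p00 loop: pel is the
     * source sample, w0/o0 the weight and offset, log2_wd the effective
     * denominator, bd the bit depth. */
    static int weighted_pred(int pel, int w0, int o0, int log2_wd, int bd)
    {
        const int pmax = (1 << bd) - 1;
        const int v = ((pel * w0 + (1 << (log2_wd - 1))) >> log2_wd) + o0;
        return v < 0 ? 0 : v > pmax ? pmax : v;
    }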
-+################################################################################ -+ -+.macro m_filter_y_b00, v_bit_depth -+# luma setup does a fair bit more than we need calculating filter coeffs -+# that we will never use but it saves I-cache to use it (also simple!) -+ m_luma_setup v_bit_depth -+ -+# Fix up vals that were expecting a filter (somewhat icky) -+ mov r2, 1 -+ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want -+ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero -+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 -+ -+:1 -+ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch -+ -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next -+ -+ max r2, ra_y2, 0 -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8subs masks out all but bottom byte -+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 -+ -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 -+ -+ shl r1, r1, 8 ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+ -+ brr.anyn -, r:1b -+ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b -+ -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height -+ -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc ra_dma0, rb_lcount based on new segment height -+ -+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 -+ -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride -+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link -+ -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ brr -, r:1b -+ add rb_lcount, rb_lcount, r0 -+ add ra_dma0, ra_dma0, r1 -+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer -+# >>> 1b -+.endm -+ -+::mc_filter_y_b00 -+ m_filter_y_b00 8 -+ -+################################################################################ -+################################################################################ -+# 10 BIT -+ -+::mc_setup_c10_q0 -+ m_setup_q0 -+::mc_setup_c10_qn -+ m_setup_c 10 -+ -+::mc_filter_c10_p -+ m_filter_c_p 0, 10 -+ -+::mc_filter_c10_p_l1 -+ m_filter_c_p 1, 10 -+ -+ -+::mc_filter_c10_b -+ m_filter_c_b 10 -+ -+# Even if these fns are the same as for other bit depths we want our own copy -+# to keep the code we are using in a single lump to avoid (direct map) cache -+# thrashing -+.set v_quads10, N_QPU_16 / 4 -+ -+::mc_sync10_q0 -+ m_sync_q 0, v_quads10 -+::mc_sync10_q1 -+ m_sync_q 1, v_quads10 -+::mc_sync10_q2 -+ m_sync_q 2, v_quads10 -+::mc_sync10_q3 -+ m_sync_q 3, v_quads10 -+::mc_sync10_q4 -+ m_sync_q 4, v_quads10 -+::mc_sync10_q5 -+ m_sync_q 5, v_quads10 -+::mc_sync10_q6 -+ m_sync_q 6, v_quads10 -+::mc_sync10_q7 -+ m_sync_q 7, v_quads10 -+::mc_sync10_q8 -+ m_sync_q 8, v_quads10 -+::mc_sync10_q9 -+ m_sync_q 9, v_quads10 
-+::mc_sync10_q10 -+ m_sync_q 10, v_quads10 -+::mc_sync10_q11 -+ m_sync_q 11, v_quads10 -+ -+::mc_exit_y10_q0 -+::mc_exit_c10_q0 -+ m_exit_q0 -+ -+::mc_exit_y10_qn -+::mc_exit_c10_qn -+ m_exit_qn -+ -+::mc_setup_y10_q0 -+ m_setup_q0 -+::mc_setup_y10_qn -+ m_setup_y 10 -+ -+:per_block_setup_10 -+ m_per_block_setup 10 -+ -+::mc_filter_y10_pxx -+ m_filter_y_pxx 10 -+ -+::mc_filter_y10_p00 -+ m_filter_y_p00 10 -+ -+::mc_filter_y10_bxx -+ m_filter_y_bxx 10 -+ -+::mc_filter_y10_b00 -+ m_filter_y_b00 10 -+ -+ -+ -+::mc_end -+# Do not add code here because mc_end must appear after all other code. -diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h -new file mode 100644 -index 0000000000..89711d776b ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_cmd.h -@@ -0,0 +1,165 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*/ -+ -+#ifndef RPI_SHADER_CMD_H -+#define RPI_SHADER_CMD_H -+ -+#pragma pack(push, 4) -+ -+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y -+// If mixed then we are just confused and get a lot of warnings.... 
-+typedef const uint8_t * qpu_mc_src_addr_t; -+typedef uint8_t * qpu_mc_dst_addr_t; -+#else -+typedef uint32_t qpu_mc_src_addr_t; -+typedef uint32_t qpu_mc_dst_addr_t; -+#endif -+ -+typedef struct qpu_mc_src_s -+{ -+ int16_t y; -+ int16_t x; -+ qpu_mc_src_addr_t base; -+} qpu_mc_src_t; -+ -+ -+typedef struct qpu_mc_pred_c_p_s { -+ qpu_mc_src_t next_src; -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ qpu_mc_dst_addr_t dst_addr_c; -+ uint32_t next_fn; -+} qpu_mc_pred_c_p_t; -+ -+typedef struct qpu_mc_pred_c_b_s { -+ qpu_mc_src_t next_src1; -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x1; -+ uint32_t coeffs_y1; -+ int16_t weight_u1; -+ int16_t weight_v1; -+ qpu_mc_src_t next_src2; -+ uint32_t coeffs_x2; -+ uint32_t coeffs_y2; -+ uint32_t wo_u2; -+ uint32_t wo_v2; -+ qpu_mc_dst_addr_t dst_addr_c; -+ uint32_t next_fn; -+} qpu_mc_pred_c_b_t; -+ -+typedef struct qpu_mc_pred_c_s_s { -+ qpu_mc_src_t next_src1; -+ uint32_t pic_cw; // C Width (== Y width / 2) -+ uint32_t pic_ch; // C Height (== Y Height / 2) -+ uint32_t stride2; -+ uint32_t stride1; -+ qpu_mc_src_t next_src2; -+ uint32_t next_fn; -+} qpu_mc_pred_c_s_t; -+ -+typedef struct qpu_mc_pred_c_s { -+ union { -+ qpu_mc_pred_c_p_t p; -+ qpu_mc_pred_c_b_t b; -+ qpu_mc_pred_c_s_t s; -+ }; -+} qpu_mc_pred_c_t; -+ -+ -+typedef struct qpu_mc_pred_y_p_s { -+ qpu_mc_src_t next_src1; -+ qpu_mc_src_t next_src2; -+ uint16_t h; -+ uint16_t w; -+ uint32_t mymx21; -+ uint32_t wo1; -+ uint32_t wo2; -+ qpu_mc_dst_addr_t dst_addr; -+ uint32_t next_fn; -+} qpu_mc_pred_y_p_t; -+ -+typedef struct qpu_mc_pred_y_p00_s { -+ qpu_mc_src_t next_src1; -+ uint16_t h; -+ uint16_t w; -+ uint32_t wo1; -+ qpu_mc_dst_addr_t dst_addr; -+ uint32_t next_fn; -+} qpu_mc_pred_y_p00_t; -+ -+typedef struct qpu_mc_pred_y_s_s { -+ qpu_mc_src_t next_src1; -+ qpu_mc_src_t next_src2; -+ uint16_t pic_h; -+ uint16_t pic_w; -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t next_fn; -+} qpu_mc_pred_y_s_t; -+ -+typedef struct qpu_mc_pred_sync_s { -+ uint32_t next_fn; -+} qpu_mc_pred_sync_t; -+ -+// Only a useful structure in that it allows us to return something other than a void * -+typedef struct qpu_mc_pred_y_s { -+ union { -+ qpu_mc_pred_y_p_t p; -+ qpu_mc_pred_y_p00_t p00; -+ qpu_mc_pred_y_s_t s; -+ }; -+} qpu_mc_pred_y_t; -+ -+typedef union qpu_mc_pred_cmd_u { -+ qpu_mc_pred_y_t y; -+ qpu_mc_pred_c_t c; -+ qpu_mc_pred_sync_t sync; -+} qpu_mc_pred_cmd_t; -+ -+static void inline qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn) -+{ -+ // Link is last el of previous cmd -+ ((uint32_t *)cmd)[-1] = fn; -+} -+ -+#define QPU_MC_PRED_N_Y8 12 -+#define QPU_MC_PRED_N_C8 12 -+ -+#define QPU_MC_PRED_N_Y10 12 -+#define QPU_MC_PRED_N_C10 12 -+ -+#define QPU_MC_DENOM 7 -+ -+#pragma pack(pop) -+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c -new file mode 100644 -index 0000000000..77d8366eb8 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template.c -@@ -0,0 +1,88 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. 
-+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*/ -+ -+#include "hevc.h" -+#include "rpi_hevcdec.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "rpi_hevc_shader_cmd.h" -+#include "rpi_hevc_shader_template.h" -+ -+typedef struct shader_track_s -+{ -+ const union qpu_mc_pred_cmd_u *qpu_mc_curr; -+ const struct qpu_mc_src_s *last_l0; -+ const struct qpu_mc_src_s *last_l1; -+ uint32_t width; // pic_width * PW -+ uint32_t height; -+ uint32_t stride2; -+ uint32_t stride1; -+} shader_track_t; -+ -+static int wtoidx(const unsigned int w) -+{ -+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; -+ return pel_weight[w]; -+} -+ -+static const int fctom(uint32_t x) -+{ -+ int rv; -+ // As it happens we can take the 2nd filter term & divide it by 8 -+ // (dropping fractions) to get the fractional move -+ rv = 8 - ((x >> 11) & 0xf); -+ av_assert2(rv >= 0 && rv <= 7); -+ return rv; -+} -+ -+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) -+{ -+ return (x << shl) >> shr; -+} -+ -+static inline int woff_p(HEVCRpiContext *const s, int32_t x) -+{ -+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); -+} -+ -+static inline int woff_b(HEVCRpiContext *const s, int32_t x) -+{ -+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); -+} -+ -+static inline int wweight(int32_t x) -+{ -+ return ext(x, 16, 16); -+} -+ -+ -+#define PW 1 -+#include "rpi_hevc_shader_template_fn.h" -+ -+#undef PW -+#define PW 2 -+#include "rpi_hevc_shader_template_fn.h" -+ -diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h -new file mode 100644 -index 0000000000..0fc5a45e9f ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template.h -@@ -0,0 +1,49 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. 
-+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*/ -+ -+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H -+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H -+ -+struct HEVCRpiContext; -+struct HEVCRpiInterPredEnv; -+ -+void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s, -+ const struct HEVCRpiInterPredEnv *const ipe_y, -+ const struct HEVCRpiInterPredEnv *const ipe_c); -+ -+void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s, -+ const struct HEVCRpiInterPredEnv *const ipe_y, -+ const struct HEVCRpiInterPredEnv *const ipe_c); -+ -+void rpi_sand_dump8(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); -+ -+void rpi_sand_dump16(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); -+ -+#endif -+ -diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h -new file mode 100644 -index 0000000000..10c163a4b9 ---- /dev/null -+++ b/libavcodec/rpi_hevc_shader_template_fn.h -@@ -0,0 +1,502 @@ -+/* -+Copyright (c) 2017 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*/ -+ -+#define STRCAT(x,y) x##y -+ -+#if PW == 1 -+#define pixel uint8_t -+#define FUNC(f) STRCAT(f, 8) -+#elif PW == 2 -+#define pixel uint16_t -+#define FUNC(f) STRCAT(f, 16) -+#else -+#error Unexpected PW -+#endif -+ -+#define PATCH_STRIDE (16 * PW) -+ -+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) -+{ -+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { -+ const pixel s = *(const pixel *)src; -+ pixel * d = (pixel *)dst; -+ for (unsigned int j = 0; j < w; j += PW) { -+ *d++ = s; -+ } -+ } -+} -+ -+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) -+{ -+ for (unsigned int i = 0; i != h; ++i, dst += stride) { -+ memcpy(dst, src, w); -+ } -+} -+ -+static void FUNC(get_patch_y)(const shader_track_t * const st, -+ uint8_t * dst, const unsigned int dst_stride, -+ const qpu_mc_src_t *src, -+ unsigned int _w, unsigned int _h) -+{ -+ int x = src->x * PW; -+ int y = src->y; -+ int w = _w * PW; -+ int h = _h; -+ int dl = 0; -+ int dr = 0; -+ int dt = 0; -+ int db = 0; -+ -+ if (x < 0) { -+ if (-x >= w) -+ x = PW - w; -+ dl = -x; -+ w += x; -+ x = 0; -+ } -+ if (x + w > st->width) { -+ if (x >= st->width) -+ x = st->width - PW; -+ dr = (x + w) - st->width; -+ w = st->width - x; -+ } -+ -+ // Y -+ if (y < 0) { -+ if (-y >= h) -+ y = 1 - h; -+ dt = -y; -+ h += y; -+ y = 0; -+ } -+ if (y + h > st->height) { -+ if (y >= st->height) -+ y = st->height - 1; -+ db = (y + h) - st->height; -+ h = st->height - y; -+ } -+ -+ dst += dl + dt * dst_stride; -+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); -+ -+ // Edge dup -+ if (dl != 0) -+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); -+ if (dr != 0) -+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); -+ w += dl + dr; -+ dst -= dl; -+ -+ if (dt != 0) -+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); -+ if (db != 0) -+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); -+} -+ -+ -+ -+static void FUNC(get_patch_c)(const shader_track_t * const st, -+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, -+ const qpu_mc_src_t *src, -+ unsigned int _w, unsigned int _h) -+{ -+ int x = src->x * PW; -+ int y = src->y; -+ int w = _w * PW; -+ int h = _h; -+ int dl = 0; -+ int dr = 0; -+ int dt = 0; -+ int db = 0; -+ const int width = st->width; -+ const int height = st->height; -+ -+ if (x < 0) { -+ if (-x >= w) -+ x = PW - w; -+ dl = -x; -+ w += x; -+ x = 0; -+ } -+ if (x + w > width) { -+ if (x >= width) -+ x = width - PW; -+ dr = (x + w) - width; -+ w = width - x; -+ } -+ -+ // Y -+ if (y < 0) { -+ if (-y >= h) -+ y = 1 - h; -+ dt = -y; -+ h += y; -+ y = 0; -+ } -+ if (y + h > height) { -+ if (y >= height) -+ y = height - 1; -+ db = (y + h) - height; -+ h = height - y; -+ } -+ -+ dst_u += dl + dt * dst_stride; -+ dst_v += dl + dt * dst_stride; -+ 
FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); -+ -+ // Edge dup -+ if (dl != 0) -+ { -+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); -+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); -+ } -+ if (dr != 0) -+ { -+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); -+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); -+ } -+ w += dl + dr; -+ dst_u -= dl; -+ dst_v -= dl; -+ -+ if (dt != 0) -+ { -+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); -+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); -+ } -+ if (db != 0) -+ { -+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); -+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); -+ } -+} -+ -+// w, y, w, h in pixels -+// stride1, stride2 in bytes -+void FUNC(rpi_sand_dump)(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) -+{ -+ const int mask = stride2 == 0 ? ~0 : stride1 - 1; -+ -+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); -+ -+ if (is_c) { -+ x *= 2; -+ w *= 2; -+ } -+ -+ for (int i = y; i != y + h; ++i) { -+ for (int j = x; j != x + w; ++j) { -+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; -+ char sep = is_c && (j & 1) == 0 ? ':' : ' '; -+#if PW == 1 -+ if (j < 0 || i < 0) -+ printf("..%c", sep); -+ else -+ printf("%02x%c", *(const pixel*)p, sep); -+#else -+ if (j < 0 || i < 0) -+ printf("...%c", sep); -+ else -+ printf("%03x%c", *(const pixel*)p, sep); -+#endif -+ } -+ printf("\n"); -+ } -+} -+ -+ -+void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s, -+ const HEVCRpiInterPredEnv *const ipe_y, -+ const HEVCRpiInterPredEnv *const ipe_c) -+{ -+ for (int c_idx = 0; c_idx < 2; ++c_idx) -+ { -+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; -+ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; -+ unsigned int exit_n = 0; -+ -+ if (ipe == NULL || !ipe->used) { -+ continue; -+ } -+ -+ do { -+ for (unsigned int i = 0; i != ipe->n; ++i) { -+ const HEVCRpiInterPredQ * const q = ipe->q + i; -+ shader_track_t * const st = tracka + i; -+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; -+ -+ for (;;) { -+ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; -+ -+ if (link == q->code_setup) { -+ if (c_idx == 0) { -+ // Luma -+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; -+ -+ st->height = c->pic_h; -+ st->width = c->pic_w * PW; -+ st->stride1 = c->stride1; -+ st->stride2 = c->stride2; -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else { -+ // Chroma -+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; -+ -+ st->height = c->pic_ch; -+ st->width = c->pic_cw * PW; -+ st->stride1 = c->stride1; -+ st->stride2 = c->stride2; -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ } -+ else if (link == s->qpu.y_pxx) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ const int w1 = FFMIN(c->w, 8); -+ const int w2 = c->w - w1; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ if (w2 > 0) { -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h + 7); -+ } -+ -+ // wo[offset] = offset*2+1 -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); -+ if (w2 > 0) { -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( -+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); -+ } -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_bxx) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h + 7); -+ -+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( -+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); -+ -+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), -+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_p00) { -+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ -+ // wo[offset] = offset*2+1 -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, 
PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); -+ -+ st->last_l0 = &c->next_src1; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_b00) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ av_assert0(c->w <= 16 && c->h <= 64); -+ -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h); -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h); -+ -+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( -+ patch_y3, patch_y1, PATCH_STRIDE, -+ c->h, 0, 0, c->w); -+ -+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, -+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), -+ 0, woff_b(s, c->wo2), 0, 0, c->w); -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_pxx) { -+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; -+ const int mx = fctom(c->coeffs_x); -+ const int my = fctom(c->coeffs_y); -+ -+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ -+ st->last_l0 = &c->next_src; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_pxx_l1) { -+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; -+ const int mx = fctom(c->coeffs_x); -+ const int my = fctom(c->coeffs_y); -+ -+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ -+ st->last_l1 = &c->next_src; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_bxx) { -+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; -+ const int mx1 = fctom(c->coeffs_x1); -+ const int my1 = fctom(c->coeffs_y1); 
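-+ // Bi-pred chroma flow (per the calls below): (mx1,my1) are the L0
-+ // fractional phases and (mx2,my2) the L1 ones; the plain epel filter runs
-+ // on the L0 patch into 16-bit intermediates, which epel_bi_w then combines
-+ // with the filtered L1 patch and both weights before the sand conversion.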
-+ const int mx2 = fctom(c->coeffs_x2); -+ const int my2 = fctom(c->coeffs_y2); -+ -+ uint8_t patch_u1[PATCH_STRIDE * 72]; -+ uint8_t patch_v1[PATCH_STRIDE * 72]; -+ uint8_t patch_u2[PATCH_STRIDE * 72]; -+ uint8_t patch_v2[PATCH_STRIDE * 72]; -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; -+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; -+ -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); -+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); -+ -+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( -+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, mx1, my1, c->w); -+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( -+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, mx1, my1, c->w); -+ -+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( -+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, -+ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2), -+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); -+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( -+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, -+ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2), -+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); -+ -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); -+ -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == q->code_sync) { -+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); -+ break; -+ } -+ else if (link == q->code_exit) { -+ // We expect exit to occur without other sync -+ av_assert0(i == exit_n); -+ ++exit_n; -+ break; -+ } -+ else { -+ av_assert0(0); -+ } -+ } -+ -+ st->qpu_mc_curr = cmd; -+ } -+ } while (exit_n == 0); -+ } -+} -+ -+#undef FUNC -+#undef pixel -+ -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000000..3caef20137 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s -@@ -0,0 +1,444 @@ -+# ****************************************************************************** -+# Argon Design Ltd. -+# (c) Copyright 2015 Argon Design Ltd. All rights reserved. -+# -+# Module : HEVC -+# Author : Peter de Rivaz -+# ****************************************************************************** -+ -+# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack) -+# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions) -+.set USE_STACK, 0 -+ -+# Lines that fail to assemble start with #: -+# The script insert_magic_opcodes.sh inserts the machine code directly for these. 
-+# HEVC VPU Transform -+# -+# Transform matrix can be thought of as -+# output row vector = input row vector * transMatrix2 -+# -+# The even rows of the matrix are symmetric -+# The odd rows of the matrix are antisymmetric -+# -+# So only need to compute the first half of the results, then can compute the remainder with a butterfly -+# -+# EXAMPLE -+# (a b c d) (1 2 2 1) -+# (3 4 -4 -3) -+# (5 6 6 5) -+# (7 8 -8 -7) -+# -+# x=(a c)(1 2) = 1a+5c 2a+6c -+# (5 6) -+# -+# y=(b d)(3 4) = 3b+7d 4b+8d -+# (7 8) -+# -+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d -+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d -+# -+# Final results are (u , v[::-1]) -+# -+# -+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) -+# Apply the even matrix first and stop before rounding -+# Then apply the odd matrix in a full manner: -+# -+# First step is to compute partial products with the first input (16 cycles) -+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output -+# 2a 4b 6c 8d -+# 2a -4b 6c -8d -+# 1a -3b 5c -7d -+# -+# Second step is to sum partial products into final position (8 cycles) -+# 1a+3b+5c+7d -+# 2a+4b+6c+8d -+# 2a-4b+6c-8d -+# 1a-3b+5c-7d -+# -+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) -+# -+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) -+# -+# For 8x8 we could compute two in parallel. -+# -+# -+ -+# Columns are transformed first -+# -+# Store top left half of transMatrix2 in -+# Store bottom left half of transMatrix2 in HX(32,32) -+# -+# For 16x16 -+# HX(0:15,0) contains input data before transform -+# HY(0:15,0) contains 32bit output data after transform -+# HX(32,0) contains even rows of left half of transMatrix2 -+# HX(32,32) contains odd rows of left half of transMatrix2 -+# HY(48,0) contains partial products ready for summing -+# -+ -+ -+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# coeffs32 -+# num32: number of 32x32 transforms -+# command 0 for transform, 1 for memclear16(int16_t *dst,num16) -+# -+ -+.equ TRANS_SHIFT, 20 - BIT_DEPTH -+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) -+.equ TRANS_ASL2, 16 - TRANS_SHIFT -+ -+ -+hevc_trans_16x16: -+ push r6-r15, lr # TODO cut down number of used registers -+ mov r14,r3 # coeffs32 -+ mov r15,r4 # num32 -+ mov r3, 16*2 # Stride of transMatrix2 in bytes -+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix -+ -+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix -+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ # Now use r0 to describe which matrix we are working on. -+ # Allows us to prefetch the next block of coefficients for efficiency. 
-+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,TRANS_RND2 # Constant used for rounding second pass -+ -+ sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack -+ -+ add r11,sp,64 # Space for 32 bytes before, and rounding -+ lsr r11,5 -+ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32 -+ -+ lsr r10, r2, 16 # Number of compressed blocks stored in top short -+ extu r2,16 -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+ # r0 VRF location of current block -+ # r1 address of current block -+ # r2 number of 16*16 transforms to do -+ # r3 Stride of coefficients (==32) -+ # r4 TRANS_RND1 (64) -+ # r5 TRANS_RND2 -+ # r6 temporary used inside col_trans16 -+ # r7 16*16*2 total bytes in block -+ # r8 64*16 VRF switch locations -+ # r9 temporary in unpack_coeff for index -+ # r10 number of 16x16 transforms using compression -+ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer) -+ # r12 temporary counter in unpack_coeff -+ # r13 -+ # r14 Save information for 32 bit transform (coeffs location) -+ # r15 Save information for 32 bit transform (number of transforms) -+ cmp r2,0 -+ beq done16x16s -+block_loop: -+ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests -+ cmp r10,0 -+ mov r6, r1 -+ beq not_compressed -+ sub r10, 1 -+ bl unpack16x16 -+not_compressed: -+ #mov r6,r1 # DEBUG without compress -+ vldh HX(0++,0)+r0,(r6 += r3) REP 16 -+ #eor r0,r8 -+ #add r1,r7 -+ # Prefetch the next block -+ #bl unpack16x16 -+ #vldh HX(0++,0)+r0,(r6 += r3) REP 16 -+ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG -+ #eor r0,r8 -+ #sub r1,r7 -+ -+ # Transform the current block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? -+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+done16x16s: -+ -+ add sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+# This returns a value in r6 that says where to load the data from. -+# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it. 
-+unpack16x16: -+# Clear out destination -+ vmov HX(0,0)+r0,0 -+ mov r6, r11 -+ vsth HX(0,0)+r0,(r6 += r3) REP 16 -+ mov r5, r1 # Moving pointer to input coefficients -+unpack_outer_loop: -+ # Loop until we find the end -+ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous? -+ sub r6,r11,32 -+ #add r6,pc,packed_data-$ # Packed data -+ vsth HX(0,0)+r0,(r6) # Store into packed data -+ mov r12,0 -+unpack_loop: -+ ld r4,(r6) -+ add r6,r6,4 -+ lsr r9,r4,16 # r9 is destination value -+ cmp r4,0 # {value,index} -+ extu r4,8 -+ beq done_unpack -+ sth r9,(r11, r4) -+ addcmpblt r12,1,8,unpack_loop -+# # Read next 16 -+ add r5,32 -+ b unpack_outer_loop -+done_unpack: -+# # Set new load location -+ mov r6, r11 -+ #add r6,pc,unpacked_data-$ -+# # Restore constants -+ mov r4,64 -+ mov r5,TRANS_RND2 -+# pop r6-r15, pc -+ b lr -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# r1/r10 input pointer -+# r0,r4,r5,r6 free -+# r8/r9 output storage -+# -+# Store packed coefficients at r9-32 -+# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows) -+unpack32x32: -+# Clear out destination -+ vmov HX(0,0),0 -+ add r0, r9, 32*32*2 # Unpacked buffer -+ mov r4, 32 -+ vsth HX(0,0),(r0 += r4) REP 64 -+unpack_outer_loop32: -+ # Loop until we find the end -+ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous? -+ sub r6,r9,32 -+ #add r6,pc,packed_data-$ # Packed data -+ vsth HX(0,0),(r6) # Store into packed data -+ mov r8,0 -+unpack_loop32: -+ ld r4,(r6) -+ add r6,r6,4 -+ lsr r5,r4,16 # r5 is destination value -+ cmp r4,0 # {value,index} -+ extu r4,10 -+ beq done_unpack -+ sth r5,(r0, r4) -+ addcmpblt r8,1,8,unpack_loop32 -+# # Read next 16 -+ add r1,32 -+ b unpack_outer_loop32 -+done_unpack32: -+ b lr -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done in low 16, number of packed in high 16 -+# -+# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first! 
-+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ lsr r15,r15,16 # Number that are packed -+ extu r2,16 # Total number -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ -+.if USE_STACK -+ # Stack base allocation -+ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking -+ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it -+ add r8,sp,63 -+ lsr r8,5 -+ lsl r8,5 -+.else -+#:version r8 -+ .half 0x00e8 #AUTOINSERTED -+ btst r8,16 -+#:add r8,pc,intermediate_results-$ -+ .half 0xbfe8 -+ .half intermediate_results-($-2) -+ beq on_vpu1 -+ add r8,r8,32*32*2*2+16*2 # Move to secondary storage -+on_vpu1: -+.endif -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+ -+ cmp r2,0 -+ beq done32x32s -+block_loop32: -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed) -+ cmp r2,r15 -+ bgt not_compressed_32 -+ bl unpack32x32 -+ add r1,r9,32*32*2 # Uncompressed into temporary storage -+ mov r8,r9 # Transform into here -+not_compressed_32: -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, TRANS_RND2 # Constant used for rounding second pass -+ mov r5, TRANS_ASL2 # left shift used for rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+done32x32s: -+ -+.if USE_STACK -+ add sp,sp,32*32*4+64# Restore stack -+.endif -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 
,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+.if USE_STACK == 0 -+ .balign 32 -+ -+# .space directives generate 0's in the bin so avoid unnecessary padding by -+# just setting to appropriate value -+.equ intermediate_results, $+16*2 -+ -+# Layout goes: -+# -+#packed_buffer: -+# .space 16*2 -+#intermediate_results: -+# .space 32*32*2 -+#unpacked_buffer: -+# .space 32*32*2 -+# -+#packed_buffer2: -+# .space 16*2 -+#intermediate_results2: -+# .space 32*32*2 -+#unpacked_buffer2: -+# .space 32*32*2 -+.endif -+ -+ -diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h -new file mode 100644 -index 0000000000..1c364492d0 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform10.h -@@ -0,0 +1,94 @@ -+static const unsigned char rpi_hevc_transform10 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 -+0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 
0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 -+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 -+}; -diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h -new file mode 100644 -index 0000000000..1128a2c054 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform8.h -@@ -0,0 +1,94 @@ -+static const unsigned char rpi_hevc_transform8 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 -+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 -+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 -+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 -+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 
0x00, // 0020 -+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 -+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 -+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 -+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 -+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 -+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 -+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 -+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 -+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 -+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 -+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 -+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 -+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 -+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 -+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 -+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 -+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 -+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 -+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 -+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 -+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 -+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 -+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 -+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 -+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 -+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 -+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 -+0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 -+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 -+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 -+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 -+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 -+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 -+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 -+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 -+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 -+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 -+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 -+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 -+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 -+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 -+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 -+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 -+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 -+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 -+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 -+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 -+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 -+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 -+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 -+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 -+0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 -+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 -+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 -+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 -+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 -+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 -+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 
-+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 -+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 -+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 -+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 -+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 -+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 -+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 -+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 -+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 -+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 -+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 -+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 -+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 -+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 -+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 -+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 -+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 -+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 -+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 -+}; -diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c -new file mode 100644 -index 0000000000..6d92c1dceb ---- /dev/null -+++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,6146 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2012 - 2013 Mickael Raulet -+ * Copyright (C) 2012 - 2013 Gildas Cocherel -+ * Copyright (C) 2012 - 2013 Wassim Hamidouche -+ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "libavutil/attributes.h" -+#include "libavutil/common.h" -+#include "libavutil/display.h" -+#include "libavutil/internal.h" -+#include "libavutil/mastering_display_metadata.h" -+#include "libavutil/md5.h" -+#include "libavutil/opt.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/stereo3d.h" -+ -+#include "decode.h" -+#include "bswapdsp.h" -+#include "bytestream.h" -+#include "golomb.h" -+#include "hevc.h" -+#include "rpi_hevc_data.h" -+#include "rpi_hevc_parse.h" -+#include "rpi_hevcdec.h" -+#include "rpi_hevc_cabac_fns.h" -+#include "profiles.h" -+#include "hwaccel.h" -+ -+#include "rpi_zc_frames.h" -+#include "rpi_qpu.h" -+#include "rpi_hevc_shader.h" -+#include "rpi_hevc_shader_cmd.h" -+#include "rpi_hevc_shader_template.h" -+#include "rpi_zc.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#include "pthread.h" -+#include -+ -+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards -+ -+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) -+ -+#ifndef av_mod_uintp2 -+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) -+{ -+ return a & ((1 << p) - 1); -+} -+# define av_mod_uintp2 av_mod_uintp2_c -+#endif -+ -+const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; -+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); -+ -+#define MC_DUMMY_X (-32) -+#define MC_DUMMY_Y (-32) -+ -+// UV & Y both have min 4x4 pred (no 2x2 chroma) -+// Allow for even spread +1 for setup, +1 for rounding -+// As we have load sharing this can (in theory) be exceeded so we have to -+// check after each CTU, but it is a good base size -+ -+// Worst case (all 4x4) commands per CTU -+#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) -+#define QPU_C_CMD_PER_CTU_MAX (8 * 8) -+ -+#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) -+ -+#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP) -+#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) -+ -+#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) -+#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) -+ -+// Total cmds to allocate - allow for slack & setup -+#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX) -+#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX) -+ -+#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2)) -+#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2)) -+ -+// The QPU code for UV blocks only works up to a block width of 8 -+#define RPI_CHROMA_BLOCK_WIDTH 8 -+ -+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) -+ -+ -+// Actual filter goes -ve, +ve, +ve, -ve using these values -+static const uint32_t rpi_filter_coefs[8] = { -+ ENCODE_COEFFS( 0, 64, 0, 0), -+ ENCODE_COEFFS( 2, 58, 10, 2), -+ ENCODE_COEFFS( 4, 54, 16, 2), -+ ENCODE_COEFFS( 6, 46, 28, 4), -+ ENCODE_COEFFS( 4, 36, 36, 4), -+ ENCODE_COEFFS( 4, 28, 46, 6), -+ ENCODE_COEFFS( 2, 16, 54, 4), -+ ENCODE_COEFFS( 2, 10, 58, 2) -+}; -+ -+// Function arrays by QPU -+ -+static const int * const inter_pred_setup_c_qpu[12] = { -+ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, 
mc_setup_c_qn, -+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, -+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn -+}; -+ -+static const int * const inter_pred_setup_c10_qpu[12] = { -+ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, -+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, -+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn -+}; -+ -+static const int * const inter_pred_setup_y_qpu[12] = { -+ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, -+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, -+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn -+}; -+ -+static const int * const inter_pred_setup_y10_qpu[12] = { -+ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, -+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, -+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn -+}; -+ -+static const int * const inter_pred_sync_qpu[12] = { -+ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, -+ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, -+ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 -+}; -+ -+static const int * const inter_pred_sync10_qpu[12] = { -+ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, -+ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, -+ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 -+}; -+ -+static const int * const inter_pred_exit_c_qpu[12] = { -+ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, -+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, -+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn -+}; -+ -+static const int * const inter_pred_exit_c10_qpu[12] = { -+ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, -+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, -+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn -+}; -+ -+static const int * const inter_pred_exit_y_qpu[12] = { -+ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, -+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, -+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn -+}; -+ -+static const int * const inter_pred_exit_y10_qpu[12] = { -+ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, -+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, -+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn -+}; -+ -+typedef struct ipe_chan_info_s -+{ -+ const uint8_t bit_depth; -+ const uint8_t n; -+ const int * const * setup_fns; -+ const int * const * sync_fns; -+ const int * const * exit_fns; -+} ipe_chan_info_t; -+ -+typedef struct ipe_init_info_s -+{ -+ ipe_chan_info_t luma; -+ ipe_chan_info_t chroma; -+} ipe_init_info_t; -+ -+static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a) -+{ -+ switch (ln) -+ { -+ default: // normally 0 -+ *b = a; -+ break; -+ case 1: -+ a |= a << 8; -+ *(uint16_t *)b = a; -+ b += stride; -+ *(uint16_t *)b = a; -+ break; -+ case 2: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ b += stride; -+ *(uint32_t *)b = a; -+ break; -+ case 3: -+ { -+ unsigned int i; -+ uint64_t d; -+ a |= a << 8; -+ a |= a << 16; -+ d = ((uint64_t)a << 32) | a; -+ for (i = 0; i != 8; ++i, b += stride) -+ *(uint64_t *)b = d; -+ break; -+ } -+ case 4: -+ { -+ unsigned int i; -+ uint64_t d; -+ a |= a << 8; -+ a |= a << 16; -+ d = ((uint64_t)a << 32) | a; -+ for (i = 0; i != 16; 
++i, b += stride) -+ { -+ *(uint64_t *)b = d; -+ *(uint64_t *)(b + 8) = d; -+ } -+ break; -+ } -+ } -+} -+ -+// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3 -+// (4 not required) -+static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a) -+{ -+ switch (ln) -+ { -+ default: // 0 or -1 -+ *b_u = a; -+ *b_l = a; -+ break; -+ case 1: -+ a |= a << 8; -+ *(uint16_t *)b_u = a; -+ *(uint16_t *)b_l = a; -+ break; -+ case 2: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b_u = a; -+ *(uint32_t *)b_l = a; -+ break; -+ case 3: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b_u = a; -+ *(uint32_t *)(b_u + 4) = a; -+ *(uint32_t *)b_l = a; -+ *(uint32_t *)(b_l + 4) = a; -+ break; -+ case 4: -+ a |= a << 8; -+ a |= a << 16; -+ *(uint32_t *)b_u = a; -+ *(uint32_t *)(b_u + 4) = a; -+ *(uint32_t *)(b_u + 8) = a; -+ *(uint32_t *)(b_u + 12) = a; -+ *(uint32_t *)b_l = a; -+ *(uint32_t *)(b_l + 4) = a; -+ *(uint32_t *)(b_l + 8) = a; -+ *(uint32_t *)(b_l + 12) = a; -+ break; -+ } -+} -+ -+static void zap_cabac_stash(uint8_t * b, const int ln) -+{ -+ switch (ln) -+ { -+ default: // 0 -+ *b = 0; -+ break; -+ case 1: -+ *(uint16_t *)b = 0; -+ break; -+ case 2: -+ *(uint32_t *)b = 0; -+ break; -+ case 3: -+ *(uint32_t *)b = 0; -+ *(uint32_t *)(b + 4) = 0; -+ break; -+ } -+} -+ -+ -+ -+// Set a small square block of bits in a bitmap -+// Bits must be aligned on their size boundry (which will be true of all split CBs) -+static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln) -+{ -+ unsigned int n; -+ const unsigned int sh = (x & 7); -+ -+ f += (x >> 3); -+ -+ av_assert2(ln <= 3); -+ av_assert2((x & ((1 << ln) - 1)) == 0); -+ -+ switch (ln) -+ { -+ default: // 1 -+ f[0] |= 1 << sh; -+ break; -+ case 1: // 3 * 2 -+ n = 3 << sh; -+ f[0] |= n; -+ f[stride] |= n; -+ break; -+ case 2: // 0xf * 4 -+ n = 0xf << sh; -+ f[0] |= n; -+ f[stride] |= n; -+ f[stride * 2] |= n; -+ f[stride * 3] |= n; -+ break; -+ case 3: // 0xff * 8 -+ for (n = 0; n != 8; ++n, f += stride) -+ *f = 0xff; -+ break; -+ } -+} -+ -+static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 -+ { // 8 -+ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, -+ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} -+ }, -+ { // 9 -+ .luma = {0}, -+ .chroma = {0} -+ }, -+ { // 10 -+ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, -+ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} -+ } -+ -+}; -+ -+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) -+{ -+ const unsigned int n = ici->n; -+ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word -+ -+ ipe->n = n; -+ ipe->max_fill = q1_size - ipe->min_gap; -+ for(unsigned int i = 0; i < n; i++) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base = -+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); -+ q->code_setup = qpu_fn(ici->setup_fns[i]); -+ q->code_sync = qpu_fn(ici->sync_fns[i]); -+ q->code_exit = qpu_fn(ici->exit_fns[i]); -+ } -+} -+ -+static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth) -+{ -+ av_assert0(bit_depth >= 8 && bit_depth <= 16); -+ -+ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); -+} -+ -+// Unsigned Trivial MOD -+static 
inline unsigned int utmod(const unsigned int x, const unsigned int n) -+{ -+ return x >= n ? x - n : x; -+} -+ -+// returns pq->job_n++ -+static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq) -+{ -+ unsigned int const x2 = pq->job_n; -+ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS); -+ return x2; -+} -+ -+static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n) -+{ -+ pq->terminate = 0; -+ pq->job_n = 0; -+ pq->context = s; -+ pq->worker = worker; -+ pq->psem_out = psem_out; -+ pq->pass_n = n; -+ pq->started = 0; -+ sem_init(&pq->sem_in, 0, 0); -+} -+ -+static void pass_queue_kill(HEVCRpiPassQueue * const pq) -+{ -+ sem_destroy(&pq->sem_in); -+} -+ -+static inline void rpi_sem_wait(sem_t * const sem) -+{ -+ while (sem_wait(sem) != 0) { -+ av_assert0(errno == EINTR); -+ } -+} -+ -+static void pass_queue_submit_job(HEVCRpiPassQueue * const pq) -+{ -+ sem_post(&pq->sem_in); -+} -+ -+static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ // Do the various passes - common with the worker code -+ for (unsigned int i = 0; i != RPI_PASSES; ++i) { -+ s->passq[i].worker(s, jb); -+ } -+} -+ -+ -+#if 0 -+static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func) -+{ -+ int x; -+ sem_getvalue((sem_t *)&jbc->sem_out, &x); -+ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x); -+} -+#endif -+ -+ -+static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJob * jb; -+ HEVCRpiJobGlobal * const jbg = jbc->jbg; -+ -+ pthread_mutex_lock(&jbg->lock); -+ // Check local 1st -+ if ((jb = jbc->jb1) != NULL) -+ { -+ // Only 1 - very easy :-) -+ jbc->jb1 = NULL; -+ } -+ else -+ { -+ // Now look for global free chain -+ if ((jb = jbg->free1) != NULL) -+ { -+ // Found one - unlink it -+ jbg->free1 = jb->next; -+ jb->next = NULL; -+ } -+ else -+ { -+ // Out of places to look - wait for one to become free - add to Qs -+ -+ // Global -+ // If "good" lc then add after the last "good" el in the chain -+ // otherwise add to the tail -+ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good) -+ { -+ // Add to end as we had to wait last time or wait Q empty -+ if ((lc->jw_prev = jbg->wait_tail) == NULL) -+ jbg->wait_head = lc; -+ else -+ lc->jw_prev->jw_next = lc; -+ lc->jw_next = NULL; -+ jbg->wait_tail = lc; -+ } -+ else -+ { -+ // This is a "good" lc that we need to poke into the middle -+ // of the Q -+ // We know that the Q isn't empty and there is at least one -+ // !last_progess_good el in it from the previous test -+ -+ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after -+ -+ if (p == NULL) -+ { -+ // No current good els - add to head -+ lc->jw_next = jbg->wait_head; -+ jbg->wait_head = lc; -+ } -+ else -+ { -+ lc->jw_next = p->jw_next; -+ p->jw_next = lc; -+ } -+ -+ lc->jw_next->jw_prev = lc; -+ lc->jw_prev = p; -+ } -+ -+ // If "good" then we are now the last good waiting el -+ if (lc->last_progress_good) -+ jbg->wait_good = lc; -+ -+ // Local -+ if ((lc->ljw_prev = jbc->lcw_tail) == NULL) -+ jbc->lcw_head = lc; -+ else -+ lc->ljw_prev->ljw_next = lc; -+ lc->ljw_next = NULL; -+ jbc->lcw_tail = lc; -+ } -+ } -+ -+ pthread_mutex_unlock(&jbg->lock); -+ -+ if (jb == NULL) // Need to wait -+ { -+ rpi_sem_wait(&lc->jw_sem); -+ jb = lc->jw_job; // Set by free code -+ } -+ -+ return jb; -+} -+ -+ -+static void 
job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb) -+{ -+ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock -+ HEVCRpiJobCtl * jbc = jb->jbc_local; -+ HEVCRpiLocalContext * lc = NULL; -+ -+ pthread_mutex_lock(&jbg->lock); -+ -+ if (jbc != NULL) -+ { -+ av_assert1(jbc->jb1 == NULL); -+ -+ // Release to Local if nothing waiting there -+ if ((lc = jbc->lcw_head) == NULL) -+ jbc->jb1 = jb; -+ } -+ else -+ { -+ // Release to global if nothing waiting there -+ if ((lc = jbg->wait_head) == NULL) -+ { -+ jb->next = jbg->free1; -+ jbg->free1 = jb; -+ } -+ else -+ { -+ // ? seems somehow mildy ugly... -+ jbc = lc->context->jbc; -+ } -+ } -+ -+ if (lc != NULL) -+ { -+ // Something was waiting -+ -+ // Unlink -+ // Global -+ if (lc->jw_next == NULL) -+ jbg->wait_tail = lc->jw_prev; -+ else -+ lc->jw_next->jw_prev = lc->jw_prev; -+ -+ if (lc->jw_prev == NULL) -+ jbg->wait_head = lc->jw_next; -+ else -+ lc->jw_prev->jw_next = lc->jw_next; -+ -+ // Local -+ if (lc->ljw_next == NULL) -+ jbc->lcw_tail = lc->ljw_prev; -+ else -+ lc->ljw_next->ljw_prev = lc->ljw_prev; -+ -+ if (lc->ljw_prev == NULL) -+ jbc->lcw_head = lc->ljw_next; -+ else -+ lc->ljw_prev->ljw_next = lc->ljw_next; -+ -+ // Update good if required -+ if (jbg->wait_good == lc) -+ jbg->wait_good = lc->jw_prev; -+ -+ // Prod -+ lc->jw_job = jb; -+ sem_post(&lc->jw_sem); -+ } -+ -+ pthread_mutex_unlock(&jbg->lock); -+} -+ -+static void job_lc_kill(HEVCRpiLocalContext * const lc) -+{ -+ sem_destroy(&lc->jw_sem); -+} -+ -+static void job_lc_init(HEVCRpiLocalContext * const lc) -+{ -+ lc->jw_next = NULL; -+ lc->jw_prev = NULL; -+ lc->ljw_next = NULL; -+ lc->ljw_prev = NULL; -+ lc->jw_job = NULL; -+ sem_init(&lc->jw_sem, 0, 0); -+} -+ -+// Returns: -+// 0 if we have waited for MV or expect to wait for recon -+// 1 if we haven't waited for MV & do not need to wait for recon -+static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb) -+{ -+ if (jb->waited) // reset by rpi_begin -+ return 0; -+ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) -+ { -+ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL && -+ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i]) -+ return 0; -+ } -+ return 1; -+} -+ -+// Submit job if it is full (indicated by having ctu_ts_last set >= 0) -+static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl *const jbc = s->jbc; -+ HEVCRpiJob * const jb = lc->jb0; -+ -+ av_assert1(jb != NULL); -+ -+ if (jb->ctu_ts_last < 0) { -+ return; -+ } -+ -+ lc->last_progress_good = progress_good(s, jb); -+ jb->waited = !lc->last_progress_good; -+ lc->jb0 = NULL; -+ -+ if (s->offload_recon) -+ { -+ pthread_mutex_lock(&jbc->in_lock); -+ jbc->offloadq[jbc->offload_in] = jb; -+ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS); -+ pthread_mutex_unlock(&jbc->in_lock); -+ -+ pass_queue_submit_job(s->passq + 0); // Consumes job eventually -+ } -+ else -+ { -+ pass_queue_do_all(s, jb); // Consumes job before return -+ } -+} -+ -+ -+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes -+// available to receive the next job. 
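// Editorial sketch, not part of the deleted patch: worker_submit_job()
// above advances the offload queue with utmod(), the "unsigned trivial
// mod". Because the index only ever advances by one, a compare-and-
// subtract replaces a real modulo and no division is needed. A small
// self-contained illustration of that idiom (ring_next and RING_SLOTS
// are hypothetical names; RING_SLOTS stands in for RPI_MAX_JOBS):

#include <assert.h>

#define RING_SLOTS 8

static unsigned int ring_next(const unsigned int i)
{
    // utmod-style wrap: valid only while i + 1 < 2 * RING_SLOTS,
    // which holds because we only ever add 1 to an in-range index.
    const unsigned int j = i + 1;
    assert(i < RING_SLOTS);
    return j >= RING_SLOTS ? j - RING_SLOTS : j;
}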
-+// -+// Now safe against multiple callers - needed for tiles -+// "normal" and WPP will only call here one at a time -+static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ -+ // It is legit for us to already have a job allocated - do nothing in this case -+ if (lc->jb0 != NULL) -+ return; -+ -+ if (s->offload_recon) -+ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much -+ -+ lc->jb0 = job_alloc(jbc, lc); -+ -+ rpi_begin(s, lc->jb0, lc->ts); -+} -+ -+// Free up a job without submission -+static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ HEVCRpiJob * const jb = lc->jb0; -+ -+ if (jb == NULL) { -+ return; -+ } -+ -+ lc->jb0 = NULL; -+ -+ job_free(jbc, jb); -+ -+ // If offload then poke sem_out too -+ if (s->offload_recon) { -+ sem_post(&jbc->sem_out); -+ } -+} -+ -+ -+// Call this to wait for all jobs to have completed at the end of a frame -+// Slightly icky as there is no clean way to wait for a sem to count up -+// Not reentrant - call on main thread only -+static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ int i = 0; -+ -+ // We shouldn't reach here with an unsubmitted job -+ av_assert1(lc->jb0 == NULL); -+ -+ // If no offload then there can't be anything to wait for -+ if (!s->offload_recon) { -+ return; -+ } -+ -+ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) -+ { -+ for (i = 0; i != RPI_MAX_JOBS; ++i) { -+ rpi_sem_wait(&jbc->sem_out); -+ } -+ for (i = 0; i != RPI_MAX_JOBS; ++i) { -+ sem_post(&jbc->sem_out); -+ } -+ } -+} -+ -+static void * pass_worker(void *arg) -+{ -+ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; -+ HEVCRpiContext *const s = pq->context; -+ -+ for (;;) -+ { -+ rpi_sem_wait(&pq->sem_in); -+ -+ if (pq->terminate) -+ break; -+ -+ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]); -+ // * should really set jb->passes_done here -+ -+ sem_post(pq->psem_out); -+ } -+ return NULL; -+} -+ -+static void pass_queues_start_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); -+ pqs[i].started = 1; -+ } -+} -+ -+static void pass_queues_term_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ pqs[i].terminate = 1; -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ if (pqs[i].started) -+ sem_post(&pqs[i].sem_in); -+ } -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ if (pqs[i].started) { -+ pthread_join(pqs[i].thread, NULL); -+ pqs[i].started = 0; -+ } -+ } -+} -+ -+static void pass_queues_kill_all(HEVCRpiContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; -+ -+ for (i = 0; i != RPI_PASSES; ++i) -+ pass_queue_kill(pqs + i); -+} -+ -+ -+static void worker_pic_free_one(HEVCRpiJob * const jb) -+{ -+ // Free coeff stuff - allocation not the same for all buffers -+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ -+ if (cf->s[0].buf != NULL) -+ av_freep(&cf->mptr); -+ if (cf->s[2].buf != NULL) -+ gpu_free(&cf->gptr); -+ memset(cf, 0, sizeof(*cf)); -+} -+ -+static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) -+{ -+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ -+ if 
(gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) -+ goto fail; -+ cf->s[2].buf = (int16_t *)cf->gptr.arm; -+ cf->s[3].buf = cf->s[2].buf + coeff_count; -+ -+ // Must be 64 byte aligned for our zero zapping code so over-allocate & -+ // round -+ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) -+ goto fail; -+ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); -+ return 0; -+ -+fail: -+ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__); -+ worker_pic_free_one(jb); -+ return -1; -+} -+ -+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) -+{ -+ unsigned int i; -+ for (i = 0; i != 4; ++i) { -+ cf->s[i].n = 0; -+#if RPI_COMPRESS_COEFFS -+ cf->s[i].packed = 1; -+ cf->s[i].packed_n = 0; -+#endif -+ } -+} -+ -+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n) -+{ -+ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no; -+ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); -+ cfe->n += n; -+ return coeffs; -+} -+ -+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int val, const int field) -+{ -+ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { -+ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data; -+ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field; -+ sem_t * sem = NULL; -+ -+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); -+ if (((volatile int *)ref->tf.progress->data)[field] < val) { -+ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait; -+ -+ av_assert1(pwait->req == -1 && pwait->next == NULL); -+ jb->waited = 1; // Remember that we had to wait for later scheduling -+ -+ pwait->req = val; -+ pwait->next = NULL; -+ if (pstate->first == NULL) -+ pstate->first = pwait; -+ else -+ pstate->last->next = pwait; -+ pstate->last = pwait; -+ sem = &pwait->sem; -+ } -+ pthread_mutex_unlock(&pstate->lock); -+ -+ if (sem != NULL) { -+ rpi_sem_wait(sem); -+ } -+ } -+} -+ -+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field) -+{ -+ HEVCRpiFrameProgressState *const pstate = s->progress_states + field; -+ -+ ((int *)s->ref->tf.progress->data)[field] = val; -+ -+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); -+ { -+ HEVCRpiFrameProgressWait ** ppwait = &pstate->first; -+ HEVCRpiFrameProgressWait * pwait; -+ -+ while ((pwait = *ppwait) != NULL) { -+ if (pwait->req > val) -+ { -+ ppwait = &pwait->next; -+ pstate->last = pwait; -+ } -+ else -+ { -+ *ppwait = pwait->next; -+ pwait->req = -1; -+ pwait->next = NULL; -+ sem_post(&pwait->sem); -+ } -+ } -+ } -+ pthread_mutex_unlock(&pstate->lock); -+} -+ -+static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate) -+{ -+ pstate->first = NULL; -+ pstate->last = NULL; -+ pthread_mutex_init(&pstate->lock, NULL); -+} -+ -+static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait) -+{ -+ pwait->req = -1; -+ pwait->next = NULL; -+ sem_init(&pwait->sem, 0, 0); -+} -+ -+static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate) -+{ -+ av_assert1(pstate->first == NULL); -+ pthread_mutex_destroy(&pstate->lock); -+} -+ -+static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait) -+{ -+ sem_destroy(&pwait->sem); -+} -+ -+ -+/** -+ * NOTE: Each function hls_foo correspond to the function foo in the -+ * specification 
(HLS stands for High Level Syntax). -+ */ -+ -+/** -+ * Section 5.7 -+ */ -+ -+// Realloc the entry point arrays -+static int alloc_entry_points(RpiSliceHeader * const sh, const int n) -+{ -+ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0) -+ { -+ // Round up alloc to multiple of 32 -+ int a = (n + 31) & ~31; -+ -+ // We don't care about the previous contents so probably fastest to simply discard -+ av_freep(&sh->entry_point_offset); -+ av_freep(&sh->offset); -+ av_freep(&sh->size); -+ -+ if (a != 0) -+ { -+ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned)); -+ sh->offset = av_malloc_array(a, sizeof(int)); -+ sh->size = av_malloc_array(a, sizeof(int)); -+ -+ if (!sh->entry_point_offset || !sh->offset || !sh->size) { -+ sh->num_entry_point_offsets = 0; -+ sh->offsets_allocated = 0; -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ sh->offsets_allocated = a; -+ } -+ -+ return 0; -+} -+ -+/* free everything allocated by pic_arrays_init() */ -+static void pic_arrays_free(HEVCRpiContext *s) -+{ -+ av_freep(&s->sao); -+ av_freep(&s->deblock); -+ -+ av_freep(&s->cabac_stash_up); -+ s->cabac_stash_left = NULL; // freed with _up -+ -+ av_freep(&s->mvf_up); -+ av_freep(&s->mvf_left); -+ -+ av_freep(&s->is_pcm); -+ av_freep(&s->is_intra_store); -+ s->is_intra = NULL; -+ av_freep(&s->rpl_tab); -+ s->rpl_tab_size = 0; -+ -+ av_freep(&s->qp_y_tab); -+ av_freep(&s->tab_slice_address); -+ av_freep(&s->filter_slice_edges); -+ -+ av_freep(&s->bs_horizontal); -+ s->bs_vertical = NULL; // freed with H -+ av_freep(&s->bsf_stash_left); -+ av_freep(&s->bsf_stash_up); -+ -+ av_freep(&s->rpl_up); -+ av_freep(&s->rpl_left); -+ -+ alloc_entry_points(&s->sh, 0); -+ -+ av_buffer_pool_uninit(&s->col_mvf_pool); -+} -+ -+/* allocate arrays that depend on frame dimensions */ -+static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps) -+{ -+ const unsigned int log2_min_cb_size = sps->log2_min_cb_size; -+ const unsigned int width = sps->width; -+ const unsigned int height = sps->height; -+ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) * -+ ((height >> log2_min_cb_size) + 1); -+ const unsigned int ctb_count = sps->ctb_size; -+ -+ { -+ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); -+ unsigned int h = ((height + 15) & ~15); -+ -+ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size -+ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols -+ } -+ -+ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly -+ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); -+ if (!s->sao || !s->deblock) -+ goto fail; -+ -+ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3)); -+ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3); -+ if (s->cabac_stash_up == NULL) -+ goto fail; -+ -+ // Round width up to max ctb size -+ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); -+ // * Only needed if we have H tiles -+ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); -+ -+ // We can overread by 1 line & one byte in deblock so alloc & zero -+ // We don't need to zero the extra @ start of frame as it will never be -+ // written -+ s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); -+ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); -+ if (s->is_pcm == NULL || 
s->is_intra_store == NULL) -+ goto fail; -+ -+ s->filter_slice_edges = av_mallocz(ctb_count); -+ s->tab_slice_address = av_malloc_array(ctb_count, -+ sizeof(*s->tab_slice_address)); -+ s->qp_y_tab = av_malloc_array(pic_size_in_cb, -+ sizeof(*s->qp_y_tab)); -+ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) -+ goto fail; -+ -+ s->bs_horizontal = av_mallocz(s->bs_size * 2); -+ s->bs_vertical = s->bs_horizontal + s->bs_size; -+ if (s->bs_horizontal == NULL) -+ goto fail; -+ -+ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up)); -+ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left)); -+ if (s->rpl_left == NULL || s->rpl_up == NULL) -+ goto fail; -+ -+ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || -+ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL) -+ goto fail; -+ -+ s->col_mvf_stride = (width + 15) >> 4; -+ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField), -+ av_buffer_allocz); -+ if (s->col_mvf_pool == NULL) -+ goto fail; -+ -+ return 0; -+ -+fail: -+ pic_arrays_free(s); -+ return AVERROR(ENOMEM); -+} -+ -+static void default_pred_weight_table(HEVCRpiContext * const s) -+{ -+ unsigned int i; -+ const unsigned int wt = 1 << QPU_MC_DENOM; -+ s->sh.luma_log2_weight_denom = 0; -+ s->sh.chroma_log2_weight_denom = 0; -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { -+ s->sh.luma_weight_l0[i] = wt; -+ s->sh.luma_offset_l0[i] = 0; -+ s->sh.chroma_weight_l0[i][0] = wt; -+ s->sh.chroma_weight_l0[i][1] = wt; -+ s->sh.chroma_offset_l0[i][0] = 0; -+ s->sh.chroma_offset_l0[i][1] = 0; -+ } -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { -+ s->sh.luma_weight_l1[i] = wt; -+ s->sh.luma_offset_l1[i] = 0; -+ s->sh.chroma_weight_l1[i][0] = wt; -+ s->sh.chroma_weight_l1[i][1] = wt; -+ s->sh.chroma_offset_l1[i][0] = 0; -+ s->sh.chroma_offset_l1[i][1] = 0; -+ } -+} -+ -+static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb, -+ const unsigned int refs, -+ int16_t * luma_weight, int16_t * luma_offset, -+ int16_t * chroma_weight, int16_t * chroma_offset) -+{ -+ unsigned int luma_flags; -+ unsigned int chroma_flags; -+ unsigned int i; -+ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8); -+ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range; -+ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM; -+ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM; -+ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom); -+ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom); -+ -+ if (refs == 0) -+ return 0; -+ -+ luma_flags = get_bits(gb, refs); -+ chroma_flags = ctx_cfmt(s) == 0 ? 
0 : get_bits(gb, refs); -+ i = 1 << (refs - 1); -+ -+ do -+ { -+ if ((luma_flags & i) != 0) -+ { -+ const int delta_weight = get_se_golomb(gb); -+ const int offset = get_se_golomb(gb); -+ if (delta_weight < -128 || delta_weight > 127 || -+ offset < -wp_offset_half_range || offset >= wp_offset_half_range) -+ { -+ return AVERROR_INVALIDDATA; -+ } -+ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift); -+ *luma_offset++ = offset << wp_offset_bd_shift; -+ } -+ else -+ { -+ *luma_weight++ = luma_weight_base; -+ *luma_offset++ = 0; -+ } -+ -+ if ((chroma_flags & i) != 0) -+ { -+ unsigned int j; -+ for (j = 0; j != 2; ++j) -+ { -+ const int delta_weight = get_se_golomb(gb); -+ const int delta_offset = get_se_golomb(gb); -+ -+ if (delta_weight < -128 || delta_weight > 127 || -+ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range) -+ { -+ return AVERROR_INVALIDDATA; -+ } -+ -+ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift); -+ *chroma_offset++ = av_clip( -+ wp_offset_half_range + delta_offset - -+ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom), -+ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift; -+ } -+ } -+ else -+ { -+ *chroma_weight++ = chroma_weight_base; -+ *chroma_weight++ = chroma_weight_base; -+ *chroma_offset++ = 0; -+ *chroma_offset++ = 0; -+ } -+ } while ((i >>= 1) != 0); -+ -+ return 0; -+} -+ -+static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) -+{ -+ int err; -+ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb); -+ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb); -+ -+ if (luma_log2_weight_denom > 7 || -+ chroma_log2_weight_denom > 7) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n", -+ luma_log2_weight_denom, chroma_log2_weight_denom); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; -+ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; -+ -+ if ((err = get_weights(s, gb, s->sh.nb_refs[L0], -+ s->sh.luma_weight_l0, s->sh.luma_offset_l0, -+ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 || -+ (err = get_weights(s, gb, s->sh.nb_refs[L1], -+ s->sh.luma_weight_l1, s->sh.luma_offset_l1, -+ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n"); -+ return err; -+ } -+ -+ return 0; -+} -+ -+static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb) -+{ -+ const HEVCRpiSPS *sps = s->ps.sps; -+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; -+ int prev_delta_msb = 0; -+ unsigned int nb_sps = 0, nb_sh; -+ int i; -+ -+ rps->nb_refs = 0; -+ if (!sps->long_term_ref_pics_present_flag) -+ return 0; -+ -+ if (sps->num_long_term_ref_pics_sps > 0) -+ nb_sps = get_ue_golomb_long(gb); -+ nb_sh = get_ue_golomb_long(gb); -+ -+ if (nb_sps > sps->num_long_term_ref_pics_sps) -+ return AVERROR_INVALIDDATA; -+ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc)) -+ return AVERROR_INVALIDDATA; -+ -+ rps->nb_refs = nb_sh + nb_sps; -+ -+ for (i = 0; i < rps->nb_refs; i++) { -+ uint8_t delta_poc_msb_present; -+ -+ if (i < nb_sps) { -+ uint8_t lt_idx_sps = 0; -+ -+ if (sps->num_long_term_ref_pics_sps > 1) -+ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps)); -+ -+ rps->poc[i] = 
sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps]; -+ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps]; -+ } else { -+ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb); -+ rps->used[i] = get_bits1(gb); -+ } -+ -+ delta_poc_msb_present = get_bits1(gb); -+ if (delta_poc_msb_present) { -+ int64_t delta = get_ue_golomb_long(gb); -+ int64_t poc; -+ -+ if (i && i != nb_sps) -+ delta += prev_delta_msb; -+ -+ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb; -+ if (poc != (int32_t)poc) -+ return AVERROR_INVALIDDATA; -+ rps->poc[i] = poc; -+ prev_delta_msb = delta; -+ } -+ } -+ -+ return 0; -+} -+ -+static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps, -+ const HEVCRpiSPS *sps) -+{ -+ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; -+ const HEVCRpiWindow *ow = &sps->output_window; -+ unsigned int num = 0, den = 0; -+ -+ avctx->pix_fmt = sps->pix_fmt; -+ avctx->coded_width = sps->width; -+ avctx->coded_height = sps->height; -+ avctx->width = sps->width - ow->left_offset - ow->right_offset; -+ avctx->height = sps->height - ow->top_offset - ow->bottom_offset; -+ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics; -+ avctx->profile = sps->ptl.general_ptl.profile_idc; -+ avctx->level = sps->ptl.general_ptl.level_idc; -+ -+ ff_set_sar(avctx, sps->vui.sar); -+ -+ if (sps->vui.video_signal_type_present_flag) -+ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG -+ : AVCOL_RANGE_MPEG; -+ else -+ avctx->color_range = AVCOL_RANGE_MPEG; -+ -+ if (sps->vui.colour_description_present_flag) { -+ avctx->color_primaries = sps->vui.colour_primaries; -+ avctx->color_trc = sps->vui.transfer_characteristic; -+ avctx->colorspace = sps->vui.matrix_coeffs; -+ } else { -+ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; -+ avctx->color_trc = AVCOL_TRC_UNSPECIFIED; -+ avctx->colorspace = AVCOL_SPC_UNSPECIFIED; -+ } -+ -+ if (vps->vps_timing_info_present_flag) { -+ num = vps->vps_num_units_in_tick; -+ den = vps->vps_time_scale; -+ } else if (sps->vui.vui_timing_info_present_flag) { -+ num = sps->vui.vui_num_units_in_tick; -+ den = sps->vui.vui_time_scale; -+ } -+ -+ if (num != 0 && den != 0) -+ av_reduce(&avctx->framerate.den, &avctx->framerate.num, -+ num, den, 1 << 30); -+} -+ -+static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps) -+{ -+ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts; -+ -+ // Admit to no h/w formats -+ -+ *fmt++ = sps->pix_fmt; -+ *fmt = AV_PIX_FMT_NONE; -+ -+ return pix_fmts[0] == AV_PIX_FMT_NONE ? 
AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts); -+} -+ -+static int is_sps_supported(const HEVCRpiSPS * const sps) -+{ -+ return av_rpi_is_sand_format(sps->pix_fmt) && -+ sps->width <= HEVC_RPI_MAX_WIDTH && -+ sps->height <= HEVC_RPI_MAX_HEIGHT; -+} -+ -+static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps, -+ const enum AVPixelFormat pix_fmt) -+{ -+ int ret; -+ -+ pic_arrays_free(s); -+ s->ps.sps = NULL; -+ s->ps.vps = NULL; -+ -+ if (sps == NULL) -+ return 0; -+ -+ if (!is_sps_supported(sps)) -+ return AVERROR_DECODER_NOT_FOUND; -+ -+ ret = pic_arrays_init(s, sps); -+ if (ret < 0) -+ goto fail; -+ -+ export_stream_params(s->avctx, &s->ps, sps); -+ -+ s->avctx->pix_fmt = pix_fmt; -+ -+ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth); -+ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth); -+ -+ // * We don't support cross_component_prediction_enabled_flag but as that -+ // must be 0 unless we have 4:4:4 there is no point testing for it as we -+ // only deal with sand which is never 4:4:4 -+ // [support wouldn't be hard] -+ -+ rpi_hevc_qpu_set_fns(s, sps->bit_depth); -+ -+ av_freep(&s->sao_pixel_buffer_h[0]); -+ av_freep(&s->sao_pixel_buffer_v[0]); -+ -+ if (sps->sao_enabled) -+ { -+ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1; -+ unsigned int c_idx; -+ size_t vsize[3] = {0}; -+ size_t hsize[3] = {0}; -+ -+ for(c_idx = 0; c_idx < c_count; c_idx++) { -+ int w = sps->width >> ctx_hshift(s, c_idx); -+ int h = sps->height >> ctx_vshift(s, c_idx); -+ // ctb height & width are a min of 8 so this must a multiple of 16 -+ // so no point rounding up! -+ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; -+ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; -+ } -+ -+ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] -+ // when we have plaited chroma -+ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); -+ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); -+ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; -+ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; -+ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; -+ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; -+ } -+ -+ s->ps.sps = sps; -+ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; -+ -+ return 0; -+ -+fail: -+ pic_arrays_free(s); -+ s->ps.sps = NULL; -+ return ret; -+} -+ -+static inline int qp_offset_valid(const int qp_offset) -+{ -+ return qp_offset >= -12 && qp_offset <= 12; -+} -+ -+static int hls_slice_header(HEVCRpiContext * const s) -+{ -+ GetBitContext * const gb = &s->HEVClc->gb; -+ RpiSliceHeader * const sh = &s->sh; -+ int i, ret; -+ -+ // Coded parameters -+ sh->first_slice_in_pic_flag = get_bits1(gb); -+ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) { -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ if (IS_IDR(s)) -+ ff_hevc_rpi_clear_refs(s); -+ } -+ sh->no_output_of_prior_pics_flag = 0; -+ if (IS_IRAP(s)) -+ sh->no_output_of_prior_pics_flag = get_bits1(gb); -+ -+ sh->pps_id = get_ue_golomb_long(gb); -+ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) { -+ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id); -+ return AVERROR_INVALIDDATA; -+ } -+ if (!sh->first_slice_in_pic_flag && -+ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) { -+ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n"); -+ return AVERROR_INVALIDDATA; -+ } 
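// Editorial sketch, not part of the deleted patch: nearly every syntax
// element parsed in hls_slice_header() (pps_id above, slice_type,
// num_entry_point_offsets, ...) is an unsigned Exp-Golomb ue(v) code.
// As a reminder of what get_ue_golomb_long() decodes, here is a
// bit-exact standalone sketch over a plain MSB-first byte buffer
// (BitSketch, sketch_bit and sketch_ue are hypothetical helpers, not
// FFmpeg API, and do no bounds checking):

#include <stdint.h>
#include <stddef.h>

typedef struct { const uint8_t *buf; size_t pos; } BitSketch;

static unsigned sketch_bit(BitSketch * const br)
{
    // Read one bit, MSB first.
    const unsigned b = (br->buf[br->pos >> 3] >> (7 - (br->pos & 7))) & 1;
    ++br->pos;
    return b;
}

static uint32_t sketch_ue(BitSketch * const br)
{
    // ue(v): n leading zero bits, a terminating 1, then n suffix bits;
    // value = 2^n - 1 + suffix.
    unsigned n = 0;
    uint32_t suffix = 0;
    while (sketch_bit(br) == 0)
        ++n;
    for (unsigned i = 0; i != n; ++i)
        suffix = (suffix << 1) | sketch_bit(br);
    return ((uint32_t)1 << n) - 1 + suffix;
}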
-+ s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data; -+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1) -+ sh->no_output_of_prior_pics_flag = 1; -+ -+ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { -+ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; -+ const HEVCRpiSPS *last_sps = s->ps.sps; -+ enum AVPixelFormat pix_fmt; -+ -+ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { -+ if (sps->width != last_sps->width || sps->height != last_sps->height || -+ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering != -+ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering) -+ sh->no_output_of_prior_pics_flag = 0; -+ } -+ ff_hevc_rpi_clear_refs(s); -+ -+ ret = set_sps(s, sps, sps->pix_fmt); -+ if (ret < 0) -+ return ret; -+ -+ pix_fmt = get_format(s, sps); -+ if (pix_fmt < 0) -+ return pix_fmt; -+ -+// ret = set_sps(s, sps, pix_fmt); -+// if (ret < 0) -+// return ret; -+ -+ s->avctx->pix_fmt = pix_fmt; -+ -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ } -+ -+ sh->dependent_slice_segment_flag = 0; -+ if (!sh->first_slice_in_pic_flag) { -+ int slice_address_length; -+ -+ if (s->ps.pps->dependent_slice_segments_enabled_flag) -+ sh->dependent_slice_segment_flag = get_bits1(gb); -+ -+ slice_address_length = av_ceil_log2(s->ps.sps->ctb_size); -+ sh->slice_segment_addr = get_bitsz(gb, slice_address_length); -+ if (sh->slice_segment_addr >= s->ps.sps->ctb_size) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid slice segment address: %u.\n", -+ sh->slice_segment_addr); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (!sh->dependent_slice_segment_flag) { -+ sh->slice_addr = sh->slice_segment_addr; -+ s->slice_idx++; -+ } -+ } else { -+ sh->slice_segment_addr = sh->slice_addr = 0; -+ s->slice_idx = 0; -+ s->slice_initialized = 0; -+ } -+ -+ if (!sh->dependent_slice_segment_flag) { -+ s->slice_initialized = 0; -+ -+ for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++) -+ skip_bits(gb, 1); // slice_reserved_undetermined_flag[] -+ -+ sh->slice_type = get_ue_golomb_long(gb); -+ if (!(sh->slice_type == HEVC_SLICE_I || -+ sh->slice_type == HEVC_SLICE_P || -+ sh->slice_type == HEVC_SLICE_B)) { -+ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n", -+ sh->slice_type); -+ return AVERROR_INVALIDDATA; -+ } -+ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) { -+ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // when flag is not present, picture is inferred to be output -+ sh->pic_output_flag = 1; -+ if (s->ps.pps->output_flag_present_flag) -+ sh->pic_output_flag = get_bits1(gb); -+ -+ if (s->ps.sps->separate_colour_plane_flag) -+ sh->colour_plane_id = get_bits(gb, 2); -+ -+ if (!IS_IDR(s)) { -+ int poc, pos; -+ -+ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb); -+ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type); -+ if (!sh->first_slice_in_pic_flag && poc != s->poc) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc); -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return AVERROR_INVALIDDATA; -+ poc = s->poc; -+ } -+ s->poc = poc; -+ -+ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb); -+ pos = get_bits_left(gb); -+ if (!sh->short_term_ref_pic_set_sps_flag) { -+ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1); -+ if (ret < 0) -+ 
return ret; -+ -+ sh->short_term_rps = &sh->slice_rps; -+ } else { -+ int numbits, rps_idx; -+ -+ if (!s->ps.sps->nb_st_rps) { -+ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ numbits = av_ceil_log2(s->ps.sps->nb_st_rps); -+ rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0; -+ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx]; -+ } -+ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb); -+ -+ pos = get_bits_left(gb); -+ ret = decode_lt_rps(s, &sh->long_term_rps, gb); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n"); -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return AVERROR_INVALIDDATA; -+ } -+ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb); -+ -+ if (s->ps.sps->sps_temporal_mvp_enabled_flag) -+ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb); -+ else -+ sh->slice_temporal_mvp_enabled_flag = 0; -+ } else { -+ s->sh.short_term_rps = NULL; -+ s->poc = 0; -+ } -+ -+ /* 8.3.1 */ -+ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 && -+ s->nal_unit_type != HEVC_NAL_TRAIL_N && -+ s->nal_unit_type != HEVC_NAL_TSA_N && -+ s->nal_unit_type != HEVC_NAL_STSA_N && -+ s->nal_unit_type != HEVC_NAL_RADL_N && -+ s->nal_unit_type != HEVC_NAL_RADL_R && -+ s->nal_unit_type != HEVC_NAL_RASL_N && -+ s->nal_unit_type != HEVC_NAL_RASL_R) -+ s->pocTid0 = s->poc; -+ -+ if (s->ps.sps->sao_enabled) { -+ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb); -+ if (ctx_cfmt(s) != 0) { -+ sh->slice_sample_adaptive_offset_flag[1] = -+ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb); -+ } -+ } else { -+ sh->slice_sample_adaptive_offset_flag[0] = 0; -+ sh->slice_sample_adaptive_offset_flag[1] = 0; -+ sh->slice_sample_adaptive_offset_flag[2] = 0; -+ } -+ -+ sh->nb_refs[L0] = sh->nb_refs[L1] = 0; -+ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) { -+ int nb_refs; -+ -+ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active; -+ -+ if (get_bits1(gb)) { // num_ref_idx_active_override_flag -+ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1; -+ } -+ if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) { -+ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n", -+ sh->nb_refs[L0], sh->nb_refs[L1]); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->rpl_modification_flag[0] = 0; -+ sh->rpl_modification_flag[1] = 0; -+ nb_refs = ff_hevc_rpi_frame_nb_refs(s); -+ if (!nb_refs) { -+ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) { -+ sh->rpl_modification_flag[0] = get_bits1(gb); -+ if (sh->rpl_modification_flag[0]) { -+ for (i = 0; i < sh->nb_refs[L0]; i++) -+ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs)); -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) { -+ sh->rpl_modification_flag[1] = get_bits1(gb); -+ if (sh->rpl_modification_flag[1] == 1) -+ for (i = 0; i < sh->nb_refs[L1]; i++) -+ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs)); -+ } -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->mvd_l1_zero_flag = get_bits1(gb); -+ -+ if (s->ps.pps->cabac_init_present_flag) -+ sh->cabac_init_flag = get_bits1(gb); -+ else -+ sh->cabac_init_flag = 0; -+ -+ sh->collocated_ref_idx = 0; -+ if 
(sh->slice_temporal_mvp_enabled_flag) { -+ sh->collocated_list = L0; -+ if (sh->slice_type == HEVC_SLICE_B) -+ sh->collocated_list = !get_bits1(gb); -+ -+ if (sh->nb_refs[sh->collocated_list] > 1) { -+ sh->collocated_ref_idx = get_ue_golomb_long(gb); -+ if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid collocated_ref_idx: %d.\n", -+ sh->collocated_ref_idx); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ } -+ -+ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) || -+ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) -+ { -+ if ((ret = pred_weight_table(s, gb)) != 0) -+ return ret; -+ } -+ else -+ { -+ // Give us unit weights -+ default_pred_weight_table(s); -+ } -+ -+ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); -+ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid number of merging MVP candidates: %d.\n", -+ sh->max_num_merge_cand); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ sh->slice_qp_delta = get_se_golomb(gb); -+ -+ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) { -+ sh->slice_cb_qp_offset = get_se_golomb(gb); -+ sh->slice_cr_qp_offset = get_se_golomb(gb); -+ if (!qp_offset_valid(sh->slice_cb_qp_offset) || -+ !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) || -+ !qp_offset_valid(sh->slice_cr_qp_offset) || -+ !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset)) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d\n", -+ sh->slice_cr_qp_offset, sh->slice_cr_qp_offset, -+ s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset); -+ return AVERROR_INVALIDDATA; -+ } -+ } else -+ { -+ sh->slice_cb_qp_offset = 0; -+ sh->slice_cr_qp_offset = 0; -+ } -+ -+ if (s->ps.pps->chroma_qp_offset_list_enabled_flag) -+ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb); -+ else -+ sh->cu_chroma_qp_offset_enabled_flag = 0; -+ -+ if (s->ps.pps->deblocking_filter_control_present_flag) { -+ int deblocking_filter_override_flag = 0; -+ -+ if (s->ps.pps->deblocking_filter_override_enabled_flag) -+ deblocking_filter_override_flag = get_bits1(gb); -+ -+ if (deblocking_filter_override_flag) { -+ sh->disable_deblocking_filter_flag = get_bits1(gb); -+ if (!sh->disable_deblocking_filter_flag) { -+ int beta_offset_div2 = get_se_golomb(gb); -+ int tc_offset_div2 = get_se_golomb(gb) ; -+ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 || -+ tc_offset_div2 < -6 || tc_offset_div2 > 6) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Invalid deblock filter offsets: %d, %d\n", -+ beta_offset_div2, tc_offset_div2); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->beta_offset = beta_offset_div2 * 2; -+ sh->tc_offset = tc_offset_div2 * 2; -+ } -+ } else { -+ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf; -+ sh->beta_offset = s->ps.pps->beta_offset; -+ sh->tc_offset = s->ps.pps->tc_offset; -+ } -+ } else { -+ sh->disable_deblocking_filter_flag = 0; -+ sh->beta_offset = 0; -+ sh->tc_offset = 0; -+ } -+ -+ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag && -+ (sh->slice_sample_adaptive_offset_flag[0] || -+ sh->slice_sample_adaptive_offset_flag[1] || -+ !sh->disable_deblocking_filter_flag)) { -+ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb); -+ } else { -+ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag; -+ } -+ sh->no_dblk_boundary_flags = -+ (sh->slice_loop_filter_across_slices_enabled_flag ? 
0 : -+ BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) | -+ (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 : -+ BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE); -+ -+ -+ } else if (!s->slice_initialized) { -+ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->num_entry_point_offsets = 0; -+ sh->offload_wpp = 0; -+ sh->offload_tiles = 0; -+ -+ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { -+ unsigned num_entry_point_offsets = get_ue_golomb_long(gb); -+ // It would be possible to bound this tighter but this here is simpler -+ if (num_entry_point_offsets > get_bits_left(gb)) { -+ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ sh->num_entry_point_offsets = num_entry_point_offsets; -+ if (sh->num_entry_point_offsets > 0) { -+ int offset_len = get_ue_golomb_long(gb) + 1; -+ -+ if (offset_len < 1 || offset_len > 32) { -+ sh->num_entry_point_offsets = 0; -+ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); -+ return ret; -+ } -+ -+ for (i = 0; i < sh->num_entry_point_offsets; i++) { -+ uint32_t val_minus1 = get_bits_long(gb, offset_len); -+ if (val_minus1 > (1 << 28)) -+ { -+ // We can declare offsets of > 2^28 bad without loss of generality -+ // Will check actual bounds wrt NAL later, but this keeps -+ // the values within bounds we can deal with easily -+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size -+ } -+ -+ // Do we want to offload this -+ if (s->threads_type != 0) -+ { -+ sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) && -+ s->ps.pps->num_tile_columns > 1; -+ // * We only cope with WPP in a single column -+ // Probably want to deal with that case as tiles rather than WPP anyway -+ // ?? 
Not actually sure that the main code deals with WPP + multi-col correctly -+ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->ps.pps->num_tile_columns == 1; -+ } -+ } -+ } -+ -+ if (s->ps.pps->slice_header_extension_present_flag) { -+ unsigned int length = get_ue_golomb_long(gb); -+ if (length*8LL > get_bits_left(gb)) { -+ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ for (i = 0; i < length; i++) -+ skip_bits(gb, 8); // slice_header_extension_data_byte -+ } -+ -+ // Inferred parameters -+ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta; -+ if (sh->slice_qp > 51 || -+ sh->slice_qp < -s->ps.sps->qp_bd_offset) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "The slice_qp %d is outside the valid range " -+ "[%d, 51].\n", -+ sh->slice_qp, -+ -s->ps.sps->qp_bd_offset); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (get_bits_left(gb) < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Overread slice header by %d bits\n", -get_bits_left(gb)); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ s->slice_initialized = 1; -+ return 0; -+} -+ -+static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry) -+{ -+ RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width; -+ int c_idx, i; -+ -+ if (s->sh.slice_sample_adaptive_offset_flag[0] || -+ s->sh.slice_sample_adaptive_offset_flag[1]) { -+ if ((lc->ctb_avail & AVAIL_L) != 0) -+ { -+ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); -+ if (sao_merge_left_flag) { -+ *sao = sao[-1]; -+ return; -+ } -+ } -+ if ((lc->ctb_avail & AVAIL_U) != 0) -+ { -+ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); -+ if (sao_merge_up_flag) { -+ *sao = sao[-(int)s->ps.sps->ctb_width]; -+ return; -+ } -+ } -+ } -+ -+ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) { -+ const unsigned int log2_sao_offset_scale = c_idx == 0 ? 
s->ps.pps->log2_sao_offset_scale_luma : -+ s->ps.pps->log2_sao_offset_scale_chroma; -+ int offset_abs[4]; -+ char offset_sign[4] = {0}; -+ -+ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { -+ sao->type_idx[c_idx] = SAO_NOT_APPLIED; -+ continue; -+ } -+ -+ if (c_idx == 2) { -+ sao->type_idx[2] = sao->type_idx[1]; -+ sao->eo_class[2] = sao->eo_class[1]; -+ } else { -+ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc); -+ } -+ -+ // ** Could use BY22 here quite plausibly - this is all bypass stuff -+ // though only per CTB so not very timing critical -+ -+ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) -+ continue; -+ -+ for (i = 0; i < 4; i++) -+ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc); -+ -+ if (sao->type_idx[c_idx] == SAO_BAND) { -+ for (i = 0; i < 4; i++) { -+ if (offset_abs[i] != 0) -+ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc); -+ } -+ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc); -+ } else if (c_idx != 2) { -+ sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc); -+ } -+ -+ // Inferred parameters -+ sao->offset_val[c_idx][0] = 0; -+ for (i = 0; i < 4; i++) { -+ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; -+ if (sao->type_idx[c_idx] == SAO_EDGE) { -+ if (i > 1) -+ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; -+ } else if (offset_sign[i]) { -+ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; -+ } -+ } -+ } -+} -+ -+#if 0 -+static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) { -+ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4 -+ -+ if (log2_res_scale_abs_plus1 != 0) { -+ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx); -+ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * -+ (1 - 2 * res_scale_sign_flag); -+ } else { -+ lc->tu.res_scale_val = 0; -+ } -+ -+ -+ return 0; -+} -+#endif -+ -+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) -+{ -+ return jb->intra.cmds + jb->intra.n++; -+} -+ -+#define A0(x, y, U, L, UL, UR, DL) \ -+ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? 
AVAIL_DL : 0)) -+ -+#define A1(x, y, U, L, UL, UR, DL) \ -+ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) -+ -+#define A2(x, y, U, L, UL, UR, DL) \ -+ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) -+ -+#define A3(x, y, U, L, UL, UR, DL) \ -+ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) -+ -+#define A4(x, y, U, L, UL, UR, DL) \ -+ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ -+ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) -+ -+static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; -+ -+unsigned int ff_hevc_rpi_tb_avail_flags( -+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) -+{ -+ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size; -+ const unsigned int tb_x = x & ~ctb_mask; -+ const unsigned int tb_y = y & ~ctb_mask; -+ const unsigned int ctb_avail = lc->ctb_avail; -+ -+ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; -+ -+ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); -+ -+ // This deals with both the U & L edges -+ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0) -+ f |= AVAIL_UL; -+ -+ if (x + w < lc->end_of_ctb_x) -+ f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR; -+ else if (tb_y == 0) -+ f |= (ctb_avail & AVAIL_UR); -+#if AVAIL_S_U - AVAIL_S_UR < 0 -+#error Shift problem -+#endif -+ -+ // Never any D if Y beyond eoctb -+ if (y + h < lc->end_of_ctb_y) -+ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL; -+#if AVAIL_S_DL - AVAIL_S_L < 0 -+#error Shift problem -+#endif -+ -+// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, -+// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], -+// lc->end_of_ctb_x, lc->end_of_ctb_y); -+ -+ return f; -+} -+ -+#undef A0 -+#undef A1 -+#undef A2 -+#undef A3 -+#undef A4 -+ -+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, -+ unsigned int avail) -+{ -+ // If rpi_enabled then sand - U & V done on U call -+ if (c_idx <= 1) -+ { -+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_INTRA + c_idx; -+ cmd->size = log2_trafo_size; -+ cmd->avail = avail; -+ cmd->i_pred.x = x0; -+ cmd->i_pred.y = y0; -+ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ -+// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); -+ } -+} -+ -+#define CBF_CB0_S 0 -+#define CBF_CB1_S 1 // CB1 must be CB0 + 1 -+#define CBF_CR0_S 2 -+#define CBF_CR1_S 3 -+ -+#define CBF_CB0 (1 << CBF_CB0_S) -+#define CBF_CR0 (1 << CBF_CR0_S) -+#define CBF_CB1 (1 << CBF_CB1_S) -+#define CBF_CR1 (1 << CBF_CR1_S) -+ -+// * Only good for chroma_idx == 1 -+static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size, const unsigned int log2_trafo_size, -+ const unsigned int blk_idx, const int cbf_luma, -+ const unsigned int cbf_chroma) -+{ -+ const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1); -+ const unsigned int x0_c = x0 & ~7; -+ const unsigned int y0_c = y0 & ~7; -+ -+ enum ScanType scan_idx = SCAN_DIAG; -+ enum ScanType scan_idx_c = SCAN_DIAG; -+ -+ if (lc->cu.pred_mode == MODE_INTRA) -+ { -+ const unsigned int trafo_size = 1 << log2_trafo_size; -+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size); -+ -+ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail); -+ -+ if (log2_trafo_size > 2) -+ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail); -+ else if (blk_idx == 3) -+ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, -+ ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8)); -+ -+ if (log2_trafo_size < 4) { -+ if (lc->tu.intra_pred_mode >= 6 && -+ lc->tu.intra_pred_mode <= 14) { -+ scan_idx = SCAN_VERT; -+ } else if (lc->tu.intra_pred_mode >= 22 && -+ lc->tu.intra_pred_mode <= 30) { -+ scan_idx = SCAN_HORIZ; -+ } -+ -+ if (lc->tu.intra_pred_mode_c >= 6 && -+ lc->tu.intra_pred_mode_c <= 14) { -+ scan_idx_c = SCAN_VERT; -+ } else if (lc->tu.intra_pred_mode_c >= 22 && -+ lc->tu.intra_pred_mode_c <= 30) { -+ scan_idx_c = SCAN_HORIZ; -+ } -+ } -+ } -+ -+ if (!cbf_luma && cbf_chroma == 0) -+ return 0; -+ -+ if (lc->tu.is_cu_qp_delta_wanted) -+ { -+ const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc); -+ const unsigned int cb_mask = ~0U << log2_cb_size; -+ -+ if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) || -+ qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1))) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "The cu_qp_delta %d is outside the valid range " -+ "[%d, %d].\n", -+ qp_delta, -+ -(26 + (s->ps.sps->qp_bd_offset >> 1)), -+ (25 + (s->ps.sps->qp_bd_offset >> 1))); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ lc->tu.is_cu_qp_delta_wanted = 0; -+ lc->tu.cu_qp_delta = qp_delta; -+ ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask); -+ } -+ -+ // * Not main profile & untested due to no conform streams -+ if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma && -+ !lc->cu.cu_transquant_bypass_flag) { -+ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc); -+ if (cu_chroma_qp_offset_flag) { -+ int cu_chroma_qp_offset_idx = 0; -+ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { -+ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc); -+ } -+ lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; -+ lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; -+ } -+ lc->tu.cu_chroma_qp_offset_wanted = 0; -+ } -+ -+ if (cbf_luma) -+ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); -+ -+ if (log2_trafo_size > 2 || blk_idx == 3) -+ { -+ if ((cbf_chroma & CBF_CB0) != 0) -+ 
ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, -+ log2_trafo_size_c, scan_idx_c, 1); -+ if ((cbf_chroma & CBF_CR0) != 0) -+ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, -+ log2_trafo_size_c, scan_idx_c, 2); -+ } -+ -+ return 0; -+} -+ -+static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size) -+{ -+ set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3); -+} -+ -+ -+static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_trafo_size, -+ const unsigned int trafo_depth, const unsigned int blk_idx, -+ const unsigned int cbf_c0) -+{ -+ // When trafo_size == 2 hls_transform_unit uses c0 so put in c1 -+ unsigned int cbf_c1 = cbf_c0; -+ int split_transform_flag; -+ int ret; -+ -+ if (lc->cu.intra_split_flag) { -+ if (trafo_depth == 1) { -+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx]; -+ if (ctx_cfmt(s) == 3) { -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx]; -+ } else { -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; -+ } -+ } -+ } else { -+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0]; -+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; -+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; -+ } -+ -+ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size && -+ log2_trafo_size > s->ps.sps->log2_min_tb_size && -+ trafo_depth < lc->cu.max_trafo_depth && -+ !(lc->cu.intra_split_flag && trafo_depth == 0)) -+ { -+ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size); -+ } else { -+ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && -+ lc->cu.pred_mode == MODE_INTER && -+ lc->cu.part_mode != PART_2Nx2N && -+ trafo_depth == 0; -+ -+ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size || -+ (lc->cu.intra_split_flag && trafo_depth == 0) || -+ inter_split; -+ } -+ -+ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) -+ { -+ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3); -+ cbf_c1 = 0; -+ -+ if ((cbf_c0 & CBF_CB0) != 0) -+ { -+ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S; -+ if (wants_c1) -+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S; -+ } -+ -+ if ((cbf_c0 & CBF_CR0) != 0) -+ { -+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S; -+ if (wants_c1) -+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S; -+ } -+ } -+ -+ if (split_transform_flag) { -+ const int trafo_size_split = 1 << (log2_trafo_size - 1); -+ const int x1 = x0 + trafo_size_split; -+ const int y1 = y0 + trafo_size_split; -+ -+#define SUBDIVIDE(x, y, idx) \ -+do { \ -+ ret = hls_transform_tree(s, lc, x, y, \ -+ log2_trafo_size - 1, trafo_depth + 1, idx, \ -+ cbf_c1); \ -+ if (ret < 0) \ -+ return ret; \ -+} while (0) -+ -+ SUBDIVIDE(x0, y0, 0); -+ SUBDIVIDE(x1, y0, 1); -+ SUBDIVIDE(x0, y1, 2); -+ SUBDIVIDE(x1, y1, 3); -+ -+#undef SUBDIVIDE -+ } else { -+ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have -+ // trafo_size == 2 with depth == 0 the issue is moot -+ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) || -+ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth)); -+ -+ ret = 
hls_transform_unit(s, lc, x0, y0, -+ log2_trafo_size + trafo_depth, log2_trafo_size, -+ blk_idx, cbf_luma, cbf_c1); -+ if (ret < 0) -+ return ret; -+ -+ if (!s->sh.disable_deblocking_filter_flag) { -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma); -+ } -+ } -+ return 0; -+} -+ -+ -+static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) -+{ -+ GetBitContext gb; -+ int ret; -+ -+ ret = init_get_bits(&gb, pcm, length); -+ if (ret < 0) -+ return ret; -+ -+ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), -+ frame_stride1(s->frame, 0), -+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); -+ -+ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)), -+ s->frame->linesize[1], -+ cb_size >> ctx_hshift(s, 1), -+ cb_size >> ctx_vshift(s, 1), -+ &gb, s->ps.sps->pcm.bit_depth_chroma); -+ -+ return 0; -+} -+ -+ -+// x * 2^(y*2) -+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) -+{ -+ return x << (y * 2); -+} -+ -+static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) -+{ -+ // Length in bits -+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + -+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) + -+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2)); -+ -+ const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3); -+ -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ -+ // Copy coeffs -+ { -+ const int blen = (length + 7) >> 3; -+ // Round allocated bytes up to nearest 32 to avoid alignment confusion -+ // Allocation is in int16_t s -+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per -+ // sample this rounding doesn't affect the total size we need to allocate for -+ // the coeff buffer -+ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); -+ memcpy(coeffs, pcm, blen); -+ -+ // Our coeff stash assumes that any partially allocated 64byte lump -+ // is zeroed so make that true. 
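-+        // Worked example of the length calculation above (illustrative):
-+        // a 32x32 PCM block (log2_cb_size == 5) in a 4:2:0 stream with 8-bit
-+        // PCM depth for luma and chroma gives
-+        //   luma:   xyexp2(8, 5)     = 8 << 10 = 8192 bits
-+        //   chroma: xyexp2(8, 5 - 1) = 8 << 8  = 2048 bits each for Cb and Cr
-+        //   length = 8192 + 2048 + 2048 = 12288 bits, i.e. 1536 bytes skipped
-+        // by ff_hevc_rpi_cabac_skip_bytes().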
-+ { -+ uint8_t * const eopcm = (uint8_t *)coeffs + blen; -+ if ((-(intptr_t)eopcm & 63) != 0) -+ memset(eopcm, 0, -(intptr_t)eopcm & 63); -+ } -+ -+ // Add command -+ { -+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_I_PCM; -+ cmd->size = log2_cb_size; -+ cmd->i_pcm.src = coeffs; -+ cmd->i_pcm.x = x0; -+ cmd->i_pcm.y = y0; -+ cmd->i_pcm.src_len = length; -+ } -+ return 0; -+ } -+} -+ -+ -+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, -+ const MvXY xy, const int y0, const int height) -+{ -+ if (s->threads_type != 0) { -+ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); -+ -+ // Progress has to be attached to current job as the actual wait -+ // is in worker_core which can't use lc -+ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no; -+ if (*pr < y) { -+ *pr = y; -+ } -+ } -+} -+ -+static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, const int nPbW, -+ const int nPbH, -+ HEVCRpiMvField * const mv) -+{ -+ enum InterPredIdc inter_pred_idc = PRED_L0; -+ int mvp_flag; -+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); -+ -+ mv->pred_flag = 0; -+ if (s->sh.slice_type == HEVC_SLICE_B) -+ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); -+ -+ if (inter_pred_idc != PRED_L1) { -+ MvXY mvd; -+ -+ if (s->sh.nb_refs[L0]) -+ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); -+ -+ mv->pred_flag = PF_L0; -+ mvd = ff_hevc_rpi_hls_mvd_coding(lc); -+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, -+ mv, mvp_flag, 0); -+ mv->xy[0] = mvxy_add(mv->xy[0], mvd); -+ } -+ -+ if (inter_pred_idc != PRED_L0) { -+ MvXY mvd = 0; -+ -+ if (s->sh.nb_refs[L1]) -+ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); -+ -+ if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) -+ mvd = ff_hevc_rpi_hls_mvd_coding(lc); -+ -+ mv->pred_flag += PF_L1; -+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, -+ mv, mvp_flag, 1); -+ mv->xy[1] = mvxy_add(mv->xy[1], mvd); -+ } -+} -+ -+ -+static HEVCRpiInterPredQ * -+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) -+{ -+ HEVCRpiInterPredQ * yp = NULL; -+ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr; -+ const unsigned int max_fill = ipe->max_fill; -+ unsigned int load = UINT_MAX; -+ -+ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) { -+ // We will always have enough room between the Qs but if we are -+ // running critically low due to poor scheduling then use fill size -+ // rather than load to determine QPU. This has obvious dire -+ // performance implications but (a) it is better than crashing -+ // and (b) it should (almost) never happen -+ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; -+ const unsigned int tload = tfill > max_fill ? 
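-+        /* Sketch of the selection policy computed here (illustrative): each of
-+         * the n_grp candidate queues is scored by accumulated load, but once a
-+         * queue's fill exceeds max_fill its score is bumped by 0x1000000 so any
-+         * queue that has not overflowed wins. Roughly:
-+         *
-+         *   score(q) = fill(q) > max_fill ? fill(q) + BIG : load(q);
-+         *   pick the q with minimum score;   // BIG dwarfs any real load
-+         */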
tfill + 0x1000000 : ypt->load; -+ -+ if (tload < load) -+ { -+ yp = ypt; -+ load = tload; -+ } -+ } -+ -+ yp->load += load_val; -+ ipe->used_grp = 1; -+ qpu_mc_link_set(yp->qpu_mc_curr, fn); -+ -+ return yp; -+} -+ -+ -+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) -+{ -+ for (unsigned int i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; -+ -+ qpu_mc_link_set(q->qpu_mc_curr, q->code_sync); -+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1); -+ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage -+ } -+} -+ -+// Returns 0 on success -+// We no longer check for Q fullness as we have emergency code in ctu alloc -+// * However it might be an idea to have some means of spotting that we've used it -+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) -+{ -+ if (!ipe->used_grp) -+ return 0; -+ -+ if ((ipe->curr += ipe->n_grp) >= ipe->n) -+ { -+ ipe->curr = 0; -+ rpi_inter_pred_sync(ipe); -+ } -+ ipe->used = 1; -+ ipe->used_grp = 0; -+ -+ return 0; -+} -+ -+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ -+ ipe->curr = 0; -+ ipe->used = 0; -+ ipe->used_grp = 0; -+ for (i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base; -+ q->load = 0; -+ q->last_l0 = NULL; -+ q->last_l1 = NULL; -+ } -+} -+ -+static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, -+ const unsigned int n_max, const unsigned int n_grp, -+ const unsigned int total_size, const unsigned int min_gap) -+{ -+ int rv; -+ -+ memset(ipe, 0, sizeof(*ipe)); -+ if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL) -+ return AVERROR(ENOMEM); -+ -+ ipe->n_grp = n_grp; -+ ipe->min_gap = min_gap; -+ -+ if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0) -+ av_freep(&ipe->q); -+ return rv; -+} -+ -+ -+#if RPI_QPU_EMU_Y -+#define get_mc_address_y(f) ((f)->data[0]) -+#else -+#define get_mc_address_y(f) get_vc_address_y(f) -+#endif -+#if RPI_QPU_EMU_C -+#define get_mc_address_u(f) ((f)->data[1]) -+#else -+#define get_mc_address_u(f) get_vc_address_u(f) -+#endif -+ -+static inline uint32_t pack_wo_p(const int off, const int mul) -+{ -+ return PACK2(off * 2 + 1, mul); -+} -+ -+static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) -+{ -+ return PACK2(off0 + off1 + 1, mul); -+} -+ -+ -+static void -+rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const MvXY mv_xy, -+ const int weight_mul, -+ const int weight_offset, -+ AVFrame *const src_frame) -+{ -+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const unsigned int mx = MV_X(mv_xy) & 3; -+ const unsigned int my = MV_Y(mv_xy) & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); -+ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; -+ const uint32_t wo = pack_wo_p(weight_offset, weight_mul); -+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); -+ -+ if (my_mx == 0) -+ { -+ const int x1 = x0 + (MV_X(mv_xy) >> 2); -+ const int y1 = y0 + (MV_Y(mv_xy) >> 2); -+ const int bh = nPbH; -+ -+ for (int start_x = 0; start_x < nPbW; start_x += 16) -+ { -+ const int bw =
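-+            /* Worked example (illustrative): this unfiltered (my_mx == 0) path
-+             * emits one QPU command per strip of up to 16 pels, so a 24-wide PU
-+             * becomes two strips:
-+             *   start_x = 0  -> bw = FFMIN(24 -  0, 16) = 16
-+             *   start_x = 16 -> bw = FFMIN(24 - 16, 16) = 8
-+             */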
FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; -+ -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ ++ts->y_pred1_x0y0; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src_vc_address_y; -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->wo1 = wo; -+ cmd_y->dst_addr = dst_addr + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+ else -+ { -+ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; -+ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3; -+ const unsigned int bh = nPbH; -+ int start_x = 0; -+ -+#if 1 -+ // As Y-pred operates on two independent 8-wide src blocks we can merge -+ // this pred with the previous one if the previous one is 8 pel wide, -+ // the same height as the current block, immediately to the left of our -+ // current dest block and mono-pred. -+ -+ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p; -+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) -+ { -+ const int bw = FFMIN(nPbW, 8); -+ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; -+ -+ last_y8_src2->x = x1_m3; -+ last_y8_src2->y = y1_m3; -+ last_y8_src2->base = src_vc_address_y; -+ last_y8_p->w += bw; -+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); -+ last_y8_p->wo2 = wo; -+ -+ jb->last_y8_p = NULL; -+ jb->last_y8_l1 = NULL; -+ start_x = bw; -+#if RPI_TSTATS -+ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge; -+#endif -+ } -+#endif -+ -+ for (; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ if (mx == 0 && my == 0) -+ ++ts->y_pred1_x0y0; -+ else if (mx == 0) -+ ++ts->y_pred1_x0; -+ else if (my == 0) -+ ++ts->y_pred1_y0; -+ else -+ ++ts->y_pred1_xy; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } -+#endif -+ src1->x = x1_m3 + start_x; -+ src1->y = y1_m3; -+ src1->base = src_vc_address_y; -+ if (bw <= 8) -+ { -+ src2->x = MC_DUMMY_X; -+ src2->y = MC_DUMMY_Y; -+#if RPI_QPU_EMU_Y -+ src2->base = s->qpu_dummy_frame_emu; -+#else -+ src2->base = s->qpu_dummy_frame_qpu; -+#endif -+ } -+ else -+ { -+ src2->x = x1_m3 + start_x + 8; -+ src2->y = y1_m3; -+ src2->base = src_vc_address_y; -+ } -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo; -+ cmd_y->wo2 = wo; -+ cmd_y->dst_addr = dst_addr + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ -+ if (bw == 8) { -+ jb->last_y8_l1 = src2; -+ jb->last_y8_p = cmd_y; -+ } -+ } -+ } -+} -+ -+static void -+rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const struct HEVCRpiMvField *const mv_field, -+ const AVFrame *const src_frame, -+ const
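-+/* Sketch of the merge optimisation used in rpi_pred_y above (illustrative;
-+ * fill_in_second_src stands in for the src2/mymx21/wo2 updates shown there):
-+ * a filtered luma command carries two independent 8-wide source descriptors,
-+ * so an 8-wide mono-pred block can be folded into the previous command when
-+ * that command is also 8 wide, the same height, and writes immediately to
-+ * its left:
-+ *
-+ *   if (last != NULL && last->h == bh &&
-+ *       last->dst_addr + (8 << xshl) == dst_addr) {
-+ *       fill_in_second_src(last, x1_m3, y1_m3, src_base);
-+ *       last->w += 8;            // one command now covers 16 pels
-+ *   }
-+ *
-+ * halving the command count over runs of adjacent 8-wide blocks.
-+ */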
AVFrame *const src_frame2) -+{ -+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const MvXY mv = mv_field->xy[0]; -+ const MvXY mv2 = mv_field->xy[1]; -+ -+ const unsigned int mx = MV_X(mv) & 3; -+ const unsigned int my = MV_Y(mv) & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = MV_X(mv2) & 3; -+ const unsigned int my2 = MV_Y(mv2) & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]); -+ -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); -+ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; -+ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); -+ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; -+ -+ if (my2_mx2_my_mx == 0) -+ { -+ const int x1 = x0 + (MV_X(mv) >> 2); -+ const int y1 = y0 + (MV_Y(mv) >> 2); -+ const int x2 = x0 + (MV_X(mv2) >> 2); -+ const int y2 = y0 + (MV_Y(mv2) >> 2); -+ const int bh = nPbH; -+ -+ // Can do chunks a full 16 wide if we don't want the H filter -+ for (int start_x=0; start_x < nPbW; start_x += 16) -+ { -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ ++ts->y_pred2_x0y0; -+ -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 16); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = 0; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+ else -+ { -+ // Filter requires a run-up of 3 -+ const int x1 = x0 + (MV_X(mv) >> 2) - 3; -+ const int y1 = y0 + (MV_Y(mv) >> 2) - 3; -+ const int x2 = x0 + (MV_X(mv2) >> 2) - 3; -+ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3; -+ const int bh = nPbH; -+ -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ { // B blocks work 8 at a time -+ // B weights aren't doubled as the QPU code does the same -+ // amount of work as it does for P -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; -+ -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src1_base; -+ src2->x = x2 + 
start_x; -+ src2->y = y2; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 8); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } -+ } -+} -+ -+// h/v shifts fixed at one as that is all the qasm copes with -+static void -+rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const unsigned int lx, const int x0_c, const int y0_c, -+ const int nPbW_c, const int nPbH_c, -+ const MvXY mv, -+ const int16_t * const c_weights, -+ const int16_t * const c_offsets, -+ AVFrame * const src_frame) -+{ -+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = 1; // = s->ps.sps->hshift[1]; -+ const int vshift = 1; // = s->ps.sps->vshift[1]; -+ -+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; -+ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]); -+ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]); -+ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; -+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; -+ const unsigned int bh = nPbH_c; -+ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; -+ -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); -+ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; -+ qpu_mc_src_t ** const plast_lx = (lx == 0) ? 
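-+        /* Worked example for the coefficient lookup above (illustrative;
-+         * 4:2:0, so hshift == vshift == 1): MVs are in units of 1/4 luma pel,
-+         * i.e. 1/8 chroma pel, so the low 2 + hshift = 3 bits select the
-+         * chroma interpolation phase:
-+         *   MV_X(mv) = 13 -> av_mod_uintp2(13, 3) = 5 -> phase 5/8 pel
-+         * and rpi_filter_coefs[5] holds the packed taps for that phase.
-+         */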
&cp->last_l0 : &cp->last_l1; -+ qpu_mc_src_t * const last_lx = *plast_lx; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ last_lx->x = x1_c + start_x; -+ last_lx->y = y1_c; -+ last_lx->base = src_base_u; -+ cmd_c->h = bh; -+ cmd_c->w = bw; -+ cmd_c->coeffs_x = x_coeffs; -+ cmd_c->coeffs_y = y_coeffs; -+ cmd_c->wo_u = wo_u; -+ cmd_c->wo_v = wo_v; -+ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); -+ *plast_lx = &cmd_c->next_src; -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); -+ } -+ return; -+} -+ -+// h/v shifts fixed at one as that is all the qasm copes with -+static void -+rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const int x0_c, const int y0_c, -+ const int nPbW_c, const int nPbH_c, -+ const struct HEVCRpiMvField * const mv_field, -+ const int16_t * const c_weights, -+ const int16_t * const c_offsets, -+ const int16_t * const c_weights2, -+ const int16_t * const c_offsets2, -+ AVFrame * const src_frame, -+ AVFrame * const src_frame2) -+{ -+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = 1; // s->ps.sps->hshift[1]; -+ const int vshift = 1; // s->ps.sps->vshift[1]; -+ const MvXY mv = mv_field->xy[0]; -+ const MvXY mv2 = mv_field->xy[1]; -+ -+ const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift); -+ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; -+ -+ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector -+ -+ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; -+ -+ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); -+ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]); -+ -+ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; -+ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); -+ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; -+ const unsigned int bh = nPbH_c; -+ -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) -+ { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); -+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; -+ qpu_mc_src_t * const src_l0 = cp->last_l0; -+ qpu_mc_src_t * const src_l1 = cp->last_l1; -+ -+ src_l0->x = x1_c + start_x; -+ src_l0->y = y1_c; -+ src_l0->base = src1_base; -+ src_l1->x = x2_c + start_x; -+ src_l1->y = y2_c; -+ src_l1->base = src2_base; -+ -+ u[0].h = bh; -+ u[0].w = bw; -+ u[0].coeffs_x1 = coefs0_x; -+ u[0].coeffs_y1 = coefs0_y; -+ u[0].weight_u1 = c_weights[0]; // Weight L0 U -+ u[0].weight_v1 = c_weights[1]; // Weight L0 V -+ u[0].coeffs_x2 = coefs1_x; -+ u[0].coeffs_y2 = coefs1_y; -+ u[0].wo_u2 = wo_u2; -+ u[0].wo_v2 = wo_v2; -+ u[0].dst_addr_c = dst_base_u + (start_x << 
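-+        /* Why pack_wo_b() packs off0 + off1 + 1 while pack_wo_p() packs
-+         * off * 2 + 1 (illustrative): the HEVC weighted-prediction equations
-+         * can be written in a common pre-rounded form (log2WD >= 1):
-+         *   mono: pred = (p*w + ((2*o + 1) << (log2WD - 1))) >> log2WD
-+         *   bi:   pred = (p0*w0 + p1*w1 + ((o0 + o1 + 1) << log2WD)) >> (log2WD + 1)
-+         * so the offsets only ever enter as a single combined term and one
-+         * packed (offset-term, weight) word per source is enough.
-+         */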
xshl); -+ -+ cp->last_l0 = &u[0].next_src1; -+ cp->last_l1 = &u[0].next_src2; -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); -+ } -+} -+ -+ -+static inline void -+col_stash(const HEVCRpiContext * const s, -+ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0, -+ const HEVCRpiMvField * const mvf) -+{ -+ ColMvField * const col_mvf = s->ref->col_mvf; -+ const unsigned int x = (x0 + 15) >> 4; -+ const unsigned int y = (y0 + 15) >> 4; -+ const unsigned int w = ((x0 + 15 + w0) >> 4) - x; -+ const unsigned int h = ((y0 + 15 + h0) >> 4) - y; -+ -+ if (col_mvf != NULL && w != 0 && h != 0) -+ { -+ // Only record MV from the top left of the 16x16 block -+ -+ const RefPicList * const rpl = s->refPicList; -+ const ColMvField cmv = { -+ .L = { -+ { -+ .poc = (mvf->pred_flag & PF_L0) == 0 ? -+ COL_POC_INTRA : -+ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]), -+ .xy = mvf->xy[0] -+ }, -+ { -+ .poc = (mvf->pred_flag & PF_L1) == 0 ? -+ COL_POC_INTRA : -+ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]), -+ .xy = mvf->xy[1] -+ } -+ } -+ }; -+ -+ ColMvField * p = col_mvf + y * s->col_mvf_stride + x; -+ const unsigned int stride = s->col_mvf_stride - w; -+ unsigned int j = h; -+ -+ do -+ { -+ unsigned int k = w; -+ do -+ { -+ *p++ = cmv; -+ } while (--k != 0); -+ p += stride; -+ } while (--j != 0); -+ } -+} -+ -+static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) -+{ -+ HEVCRpiJob * const jb = lc->jb0; -+ -+ struct HEVCRpiMvField current_mv = {{0}}; -+ const RefPicList *const refPicList = s->refPicList; -+ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL; -+ -+ if (lc->cu.pred_mode != MODE_SKIP) -+ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc); -+ -+ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) { -+ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 
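-+    /* Worked example for col_stash() above (illustrative): the collocated MV
-+     * store has one entry per 16x16 grid cell, and only cells whose top-left
-+     * corner falls inside the PU are written:
-+     *   x0 = 8, w0 = 8  -> x = (8+15)>>4 = 1, w = ((8+15+8)>>4) - 1 = 0
-+     *     (the PU spans 8..15: no 16-aligned corner, nothing recorded)
-+     *   x0 = 8, w0 = 24 -> x = 1,              w = ((8+15+24)>>4) - 1 = 1
-+     *     (the PU spans 8..31: the cell with corner at 16 is recorded)
-+     */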
0 : -+ ff_hevc_rpi_merge_idx_decode(s, lc); -+ -+ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, -+ partIdx, merge_idx, &current_mv); -+ } else { -+ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv); -+ } -+ -+ { -+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); -+ unsigned int i, j; -+ -+ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++) -+ { -+ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++) -+ p[i] = current_mv; -+ p += MVF_STASH_WIDTH_PU; -+ } -+ } -+ -+ col_stash(s, x0, y0, nPbW, nPbH, &current_mv); -+ -+ if (current_mv.pred_flag & PF_L0) { -+ ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; -+ if (!ref0) -+ return; -+ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); -+ } -+ if (current_mv.pred_flag & PF_L1) { -+ ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; -+ if (!ref1) -+ return; -+ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH); -+ } -+ -+ if (current_mv.pred_flag == PF_L0) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0], -+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], -+ ref0->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0], -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], -+ ref0->frame); -+ return; -+ } -+ } else if (current_mv.pred_flag == PF_L1) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1], -+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], -+ ref1->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1], -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], -+ ref1->frame); -+ return; -+ } -+ } else if (current_mv.pred_flag == PF_BI) { -+ const int x0_c = x0 >> ctx_hshift(s, 1); -+ const int y0_c = y0 >> ctx_vshift(s, 1); -+ const int nPbW_c = nPbW >> ctx_hshift(s, 1); -+ const int nPbH_c = nPbH >> ctx_vshift(s, 1); -+ -+ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame); -+ -+ if (ctx_cfmt(s) != 0) { -+ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, -+ &current_mv, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], -+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], -+ ref0->frame, -+ ref1->frame); -+ return; -+ } -+ } -+} -+ -+static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size, -+ const unsigned int ipm) -+{ -+ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE; -+ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE; -+ -+ { -+ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); -+ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm); -+ } -+ -+ // If IRAP then everything is Intra & we avoid ever looking at these -+ // stashes so don't bother setting them -+ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA) -+ { -+ if (s->is_intra
!= NULL) -+ { -+ set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE); -+ } -+ -+ { -+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); -+ const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1 -+ unsigned int n = size_in_pus; -+ -+ do -+ { -+ memset(p, 0, size_in_pus * sizeof(*p)); -+ p += MVF_STASH_WIDTH_PU; -+ } while (--n != 0); -+ } -+ -+ -+ if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0) -+ { -+ // Only record top left stuff -+ // Blocks should always be aligned on size boundaries -+ // so cannot have overflow from a small block -+ -+ ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4); -+ const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4)); -+ const unsigned int stride = s->col_mvf_stride - size_in_col; -+ unsigned int j = size_in_col; -+ -+ do -+ { -+ unsigned int k = size_in_col; -+ do -+ { -+ p->L[0].poc = COL_POC_INTRA; -+ p->L[0].xy = 0; -+ p->L[1].poc = COL_POC_INTRA; -+ p->L[1].xy = 0; -+ ++p; -+ } while (--k != 0); -+ p += stride; -+ } while (--j != 0); -+ } -+ } -+} -+ -+static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size) -+{ -+ set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC); -+} -+ -+ -+/** -+ * 8.4.1 -+ */ -+static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ int x0, int y0, int log2_pu_size, -+ int prev_intra_luma_pred_flag, -+ const unsigned int idx) -+{ -+ const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; -+ const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; -+ -+ // Up does not cross boundaries so as we always scan 1 slice-tile-line in an -+ // lc we can just keep 1 CTB lR stashes -+ // Left is reset to DC @ Start of Line/Tile/Slice in fill_job -+ const unsigned int cand_up = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu]; -+ const unsigned int cand_left = lc->ipm_left[yb_pu]; -+ -+ unsigned int intra_pred_mode; -+ unsigned int a, b, c; -+ -+ if (cand_left == cand_up) { -+ if (cand_left < 2) { -+ a = INTRA_PLANAR; -+ b = INTRA_DC; -+ c = INTRA_ANGULAR_26; -+ } else { -+ a = cand_left; -+ b = 2 + ((cand_left - 2 - 1 + 32) & 31); -+ c = 2 + ((cand_left - 2 + 1) & 31); -+ } -+ } else { -+ a = cand_left; -+ b = cand_up; -+ c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ? -+ INTRA_PLANAR : -+ (cand_left != INTRA_DC && cand_up != INTRA_DC) ? -+ INTRA_DC : -+ INTRA_ANGULAR_26; -+ } -+ -+ if (prev_intra_luma_pred_flag) { -+ intra_pred_mode = idx == 0 ? a : idx == 1 ?
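-+    /* Worked example of the candidate derivation above (illustrative): with
-+     * cand_left == cand_up == 10 (an angular mode) the three most probable
-+     * modes are the mode itself plus its two angular neighbours:
-+     *   a = 10
-+     *   b = 2 + ((10 - 2 - 1 + 32) & 31) = 2 + 7 = 9
-+     *   c = 2 + ((10 - 2 + 1) & 31)      = 2 + 9 = 11
-+     * while distinct candidates give {left, up} plus the first of
-+     * PLANAR / DC / ANGULAR_26 not already present.
-+     */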
b : c; -+ } else { -+ // Sort lowest 1st -+ if (a > b) -+ FFSWAP(int, a, b); -+ if (a > c) -+ FFSWAP(int, a, c); -+ if (b > c) -+ FFSWAP(int, b, c); -+ -+ intra_pred_mode = idx; -+ if (intra_pred_mode >= a) -+ intra_pred_mode++; -+ if (intra_pred_mode >= b) -+ intra_pred_mode++; -+ if (intra_pred_mode >= c) -+ intra_pred_mode++; -+ } -+ -+ /* write the intra prediction units into the mv array */ -+ set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode); -+ return intra_pred_mode; -+} -+ -+static const uint8_t tab_mode_idx[] = { -+ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, -+ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; -+ -+static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_cb_size) -+{ -+ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; -+ uint8_t prev_intra_luma_pred_flag[4]; -+ int split = lc->cu.part_mode == PART_NxN; -+ const unsigned int split_size = (1 << (log2_cb_size - 1)); -+ int chroma_mode; -+ const unsigned int n = split ? 4 : 1; -+ unsigned int i; -+ -+ for (i = 0; i != n; i++) -+ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc); -+ -+ for (i = 0; i < n; i++) { -+ // depending on mode idx is mpm or luma_pred_mode -+ const unsigned int idx = prev_intra_luma_pred_flag[i] ? -+ ff_hevc_rpi_mpm_idx_decode(lc) : -+ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc); -+ -+ lc->pu.intra_pred_mode[i] = -+ luma_intra_pred_mode(s, lc, -+ x0 + ((i & 1) == 0 ? 0 : split_size), -+ y0 + ((i & 2) == 0 ? 0 : split_size), -+ log2_cb_size - split, -+ prev_intra_luma_pred_flag[i], idx); -+ } -+ -+ if (ctx_cfmt(s) == 3) { -+ for (i = 0; i < n; i++) { -+ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode]) -+ lc->pu.intra_pred_mode_c[i] = 34; -+ else -+ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode]; -+ } else { -+ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i]; -+ } -+ } -+ } else if (ctx_cfmt(s) == 2) { -+ int mode_idx; -+ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) -+ mode_idx = 34; -+ else -+ mode_idx = intra_chroma_table[chroma_mode]; -+ } else { -+ mode_idx = lc->pu.intra_pred_mode[0]; -+ } -+ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; -+ } else if (ctx_cfmt(s) != 0) { -+ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); -+ if (chroma_mode != 4) { -+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) -+ lc->pu.intra_pred_mode_c[0] = 34; -+ else -+ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode]; -+ } else { -+ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0]; -+ } -+ } -+} -+ -+static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size) -+{ -+ const unsigned int cb_size = 1 << log2_cb_size; -+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -+ const unsigned int min_cb_width = s->ps.sps->min_cb_width; -+ const unsigned int x_cb = x0 >> log2_min_cb_size; -+ const unsigned int y_cb = y0 >> log2_min_cb_size; -+ const unsigned int idx = log2_cb_size - 2; -+ const unsigned int qp_block_mask = (1 << 
s->ps.pps->log2_min_cu_qp_delta_size) - 1; -+ int skip_flag = 0; -+ -+ lc->cu.x = x0; -+ lc->cu.y = y0; -+ lc->cu.x_split = x0; -+ lc->cu.y_split = y0; -+ -+ lc->cu.pred_mode = MODE_INTRA; -+ lc->cu.part_mode = PART_2Nx2N; -+ lc->cu.intra_split_flag = 0; -+ lc->cu.cu_transquant_bypass_flag = 0; -+ lc->pu.intra_pred_mode[0] = 1; -+ lc->pu.intra_pred_mode[1] = 1; -+ lc->pu.intra_pred_mode[2] = 1; -+ lc->pu.intra_pred_mode[3] = 1; -+ -+ if (s->ps.pps->transquant_bypass_enable_flag) { -+ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc); -+ if (lc->cu.cu_transquant_bypass_flag) -+ set_deblocking_bypass(s, x0, y0, log2_cb_size); -+ } -+ -+ if (s->sh.slice_type != HEVC_SLICE_I) { -+ lc->cu.pred_mode = MODE_INTER; -+ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); -+ } -+ -+ if (skip_flag) { -+ lc->cu.pred_mode = MODE_SKIP; -+ -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ } else { -+ int pcm_flag = 0; -+ -+ if (s->sh.slice_type != HEVC_SLICE_I) -+ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc); -+ if (lc->cu.pred_mode != MODE_INTRA || -+ log2_cb_size == s->ps.sps->log2_min_cb_size) { -+ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size); -+ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && -+ lc->cu.pred_mode == MODE_INTRA; -+ } -+ -+ if (lc->cu.pred_mode == MODE_INTRA) { -+ if (lc->cu.part_mode == PART_2Nx2N && -+ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled -+ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && -+ ff_hevc_rpi_pcm_flag_decode(lc) != 0) -+ { -+ int ret; -+ pcm_flag = 1; -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0) -+ return ret; -+ -+ if (s->ps.sps->pcm.loop_filter_disable_flag) -+ set_deblocking_bypass(s, x0, y0, log2_cb_size); -+ } else { -+ intra_prediction_unit(s, lc, x0, y0, log2_cb_size); -+ } -+ } else { -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ switch (lc->cu.part_mode) { -+ case PART_2Nx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -+ break; -+ case PART_2NxN: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); -+ break; -+ case PART_Nx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); -+ lc->cu.x_split = x0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); -+ break; -+ case PART_2NxnU: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 4; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx); -+ break; -+ case PART_2NxnD: -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx); -+ lc->cu.y_split = y0 + cb_size / 4 * 3; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx); -+ break; -+ case PART_nLx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); -+ lc->cu.x_split = x0 + cb_size / 
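-+            /* Partition geometry for the asymmetric cases handled here
-+             * (illustrative, for a 32x32 CB):
-+             *   PART_2NxnU: 32x8  PU on top,  32x24 PU below
-+             *   PART_2NxnD: 32x24 PU on top,  32x8  PU below
-+             *   PART_nLx2N: 8x32  PU left,    24x32 PU right
-+             *   PART_nRx2N: 24x32 PU left,    8x32  PU right
-+             * x_split / y_split record where the internal PU edge lies.
-+             */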
4; -+ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); -+ break; -+ case PART_nRx2N: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2); -+ lc->cu.x_split = x0 + cb_size / 4 * 3; -+ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); -+ break; -+ case PART_NxN: -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); -+ lc->cu.x_split = x0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); -+ lc->cu.y_split = y0 + cb_size / 2; -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1); -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); -+ break; -+ } -+ } -+ -+ if (!pcm_flag) { -+ int rqt_root_cbf = 1; -+ -+ if (lc->cu.pred_mode != MODE_INTRA && -+ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { -+ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc); -+ } -+ if (rqt_root_cbf) { -+ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0); -+ int ret; -+ -+ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ? -+ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag : -+ s->ps.sps->max_transform_hierarchy_depth_inter; -+ // transform_tree does deblock_boundary_strengths -+ ret = hls_transform_tree(s, lc, x0, y0, -+ log2_cb_size, 0, 0, cbf_c); -+ if (ret < 0) -+ return ret; -+ } else { -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); -+ } -+ } -+ } -+ -+ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here -+ if (lc->tu.is_cu_qp_delta_wanted) -+ ff_hevc_rpi_set_qPy(s, lc, x0, y0); -+ -+ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 && -+ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) { -+ lc->qPy_pred = lc->qp_y; -+ } -+ -+ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff); -+ -+ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); -+ -+ return 0; -+} -+ -+// Returns: -+// < 0 Error -+// 0 More data wanted -+// 1 EoSlice / EoPicture -+static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, -+ const int log2_cb_size, const unsigned int cb_depth) -+{ -+ const int cb_size = 1 << log2_cb_size; -+ int ret; -+ int split_cu; -+ -+ lc->ct_depth = cb_depth; -+ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); -+ if (x0 + cb_size <= s->ps.sps->width && -+ y0 + cb_size <= s->ps.sps->height && -+ split_cu) -+ { -+ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0); -+ } -+ -+ // Qp delta (and offset) need to remain wanted if cb_size < min until -+ // a coded block is found so we set the initial state at depth 0 (outside -+ // this fn) and only reset here -+ if (s->ps.pps->cu_qp_delta_enabled_flag && -+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) -+ { -+ lc->tu.is_cu_qp_delta_wanted = 1; -+ lc->tu.cu_qp_delta = 0; -+ } -+ if (s->sh.cu_chroma_qp_offset_enabled_flag && -+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) -+ { -+ lc->tu.cu_chroma_qp_offset_wanted = 1; -+ } -+ -+ lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0]; -+ lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset; -+ lc->tu.qp_divmod6[2] =
s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset; -+ -+ if (split_cu) { -+ int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; -+ const int cb_size_split = cb_size >> 1; -+ const int x1 = x0 + cb_size_split; -+ const int y1 = y0 + cb_size_split; -+ -+ int more_data = 0; -+ -+ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ -+ if (more_data && x1 < s->ps.sps->width) { -+ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ if (more_data && y1 < s->ps.sps->height) { -+ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ if (more_data && x1 < s->ps.sps->width && -+ y1 < s->ps.sps->height) { -+ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1); -+ if (more_data < 0) -+ return more_data; -+ } -+ -+ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 && -+ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) -+ lc->qPy_pred = lc->qp_y; -+ -+ if (more_data) -+ return ((x1 + cb_size_split) < s->ps.sps->width || -+ (y1 + cb_size_split) < s->ps.sps->height); -+ else -+ return 0; -+ } else { -+ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size); -+ if (ret < 0) -+ return ret; -+ if ((!((x0 + cb_size) % -+ (1 << (s->ps.sps->log2_ctb_size))) || -+ (x0 + cb_size >= s->ps.sps->width)) && -+ (!((y0 + cb_size) % -+ (1 << (s->ps.sps->log2_ctb_size))) || -+ (y0 + cb_size >= s->ps.sps->height))) { -+ int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc); -+ return !end_of_slice_flag; -+ } else { -+ return 1; -+ } -+ } -+ -+ return 0; // NEVER -+} -+ -+static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x_ctb, const int y_ctb, const int ctb_addr_ts) -+{ -+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice -+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; -+ const unsigned int line_w = s->ps.sps->ctb_width; -+ -+ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; -+ -+ lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width); -+ lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); -+ -+ lc->boundary_flags = 0; -+ -+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0) -+ lc->boundary_flags |= BOUNDARY_LEFT_TILE; -+ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) -+ lc->boundary_flags |= BOUNDARY_LEFT_SLICE; -+ if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0) -+ lc->boundary_flags |= BOUNDARY_UPPER_TILE; -+ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) -+ lc->boundary_flags |= BOUNDARY_UPPER_SLICE; -+ -+ // Use line width rather than tile width for addr_in_slice test as -+ // addr_in_slice is in raster units -+ -+ lc->ctb_avail = -+ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) | -+ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) | -+ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && -+ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) | -+ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && -+ (ctb_addr_rs_in_slice + 1 >= line_w) ?
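-+        /* Illustrative reading of the derivation above, for a CTB away from
-+         * picture edges:
-+         *   AVAIL_L  - left neighbour in the same slice and tile
-+         *   AVAIL_U  - above neighbour in the same slice and tile
-+         *   AVAIL_UL - needs a full CTB line plus one already decoded in the
-+         *              slice (ctb_addr_rs_in_slice > line_w)
-+         *   AVAIL_UR - needs the above-right CTB, hence not at the end of a
-+         *              tile line and at least line_w - 1 CTBs decoded before
-+         *              this one in the slice
-+         */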
AVAIL_UR : 0); -+ // Down-left never avail at CTB level -+} -+ -+ -+static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds, -+ (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0); -+ -+ // Signal -+ if (y > 0) { -+ // Cast away const as progress is held in s, but this really shouldn't confuse anything -+ ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1); -+ } -+ -+ // Job done now -+ // ? Move outside this fn -+ job_free(s->jbc, jb); -+} -+ -+// I-pred, transform_and_add for all block types done here -+// All ARM -+static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ unsigned int i; -+ HEVCRpiIntraPredEnv * const iap = &jb->intra; -+ const HEVCPredCmd *cmd = iap->cmds; -+ -+#if !RPI_WORKER_WAIT_PASS_0 -+ rpi_sem_wait(&jb->sem); -+ rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1 -+#endif -+ -+ for (i = iap->n; i > 0; i--, cmd++) -+ { -+ switch (cmd->type) -+ { -+ case RPI_PRED_INTRA: -+ s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); -+ break; -+ case RPI_PRED_INTRA_C: -+ s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); -+ break; -+ case RPI_PRED_ADD_RESIDUAL: -+ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+ break; -+ case RPI_PRED_ADD_DC: -+ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_C: -+ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+ break; -+ case RPI_PRED_ADD_DC_U: -+ case RPI_PRED_ADD_DC_V: -+ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); -+ break; -+ -+ case RPI_PRED_I_PCM: -+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); -+ break; -+ -+ default: -+ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); -+ abort(); -+ } -+ } -+ -+ // Mark done -+ iap->n = 0; -+} -+ -+ -+// Set initial uniform job values & zero ctu_count -+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first) -+{ -+ unsigned int i; -+ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; -+ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; -+ const HEVCRpiSPS * const sps = s->ps.sps; -+ -+ const uint16_t pic_width_y = sps->width; -+ const uint16_t pic_height_y = sps->height; -+ -+ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1); -+ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1); -+ -+ // We expect the pointer to change if we use another sps -+ if (sps != jb->sps) -+ { -+ worker_pic_free_one(jb); -+ -+ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma); -+ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma); -+ -+ { -+ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH; -+ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1)); -+ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma); -+
} -+ -+ jb->sps = sps; -+ } -+ -+ jb->waited = 0; -+ jb->ctu_ts_first = ctu_ts_first; -+ jb->ctu_ts_last = -1; -+ -+ rpi_inter_pred_reset(cipe); -+ for (i = 0; i < cipe->n; i++) { -+ HEVCRpiInterPredQ * const cp = cipe->q + i; -+ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; -+ -+ u->next_src1.x = 0; -+ u->next_src1.y = 0; -+ u->next_src1.base = 0; -+ u->pic_cw = pic_width_c; -+ u->pic_ch = pic_height_c; -+ u->stride2 = av_rpi_sand_frame_stride2(s->frame); -+ u->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ cp->last_l0 = &u->next_src1; -+ -+ u->next_fn = 0; -+ u->next_src2.x = 0; -+ u->next_src2.y = 0; -+ u->next_src2.base = 0; -+ cp->last_l1 = &u->next_src2; -+ -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); -+ } -+ -+ rpi_inter_pred_reset(yipe); -+ for (i = 0; i < yipe->n; i++) { -+ HEVCRpiInterPredQ * const yp = yipe->q + i; -+ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; -+ -+ y->next_src1.x = 0; -+ y->next_src1.y = 0; -+ y->next_src1.base = 0; -+ y->next_src2.x = 0; -+ y->next_src2.y = 0; -+ y->next_src2.base = 0; -+ y->pic_h = pic_height_y; -+ y->pic_w = pic_width_y; -+ y->stride2 = av_rpi_sand_frame_stride2(s->frame); -+ y->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ y->next_fn = 0; -+ yp->last_l0 = &y->next_src1; -+ yp->last_l1 = &y->next_src2; -+ -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); -+ } -+ -+ jb->last_y8_p = NULL; -+ jb->last_y8_l1 = NULL; -+ -+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { -+ jb->progress_req[i] = -1; -+ } -+ -+ worker_pic_reset(&jb->coeffs); -+} -+ -+ -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s, -+ const vpu_qpu_job_h vqj, -+ rpi_cache_flush_env_t * const rfe, -+ HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; -+ unsigned int max_block = 0; -+ -+ if (!ipe->used) { -+ return 0; -+ } -+ -+ if (ipe->curr != 0) { -+ rpi_inter_pred_sync(ipe); -+ } -+ -+ // Add final commands to Q -+ for(i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const yp = ipe->q + i; -+ qpu_mc_src_t *const p0 = yp->last_l0; -+ qpu_mc_src_t *const p1 = yp->last_l1; -+ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; -+ -+ if (block_size > max_block) -+ max_block = block_size; -+ -+ qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit); -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->x = MC_DUMMY_X; -+ p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame_qpu; -+ p1->x = MC_DUMMY_X; -+ p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame_qpu; -+ -+ yp->last_l0 = NULL; -+ yp->last_l1 = NULL; -+ -+ // Add to mailbox list -+ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); -+ mail[i][1] = yp->code_setup; -+ } -+ -+ // We don't need invalidate here as the uniforms aren't changed by the QPU -+ // and leaving them in ARM cache avoids (pointless) pre-reads when writing -+ // new values which seems to give us a small performance advantage -+ // -+ // In most cases we will not have a completely packed set of uniforms and as -+ // we have a 2d invalidate we writeback all uniform Qs to the depth of the -+ // fullest -+ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, -+ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, -+ ipe->n, ipe->max_fill + ipe->min_gap); -+ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); -+ -+ return 1; -+} -+#endif -+ -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C 
-+static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s, -+ const vpu_qpu_job_h vqj, -+ rpi_cache_flush_env_t * const rfe, -+ HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; -+ if (!ipe->used) { -+ return 0; -+ } -+ -+ if (ipe->curr != 0) { -+ rpi_inter_pred_sync(ipe); -+ } -+ -+ // Add final commands to Q -+ for(i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const yp = ipe->q + i; -+ qpu_mc_src_t *const p0 = yp->last_l0; -+ qpu_mc_src_t *const p1 = yp->last_l1; -+ -+ yp->qpu_mc_curr->data[-1] = yp->code_exit; -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->x = MC_DUMMY_X; -+ p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame_emu; -+ p1->x = MC_DUMMY_X; -+ p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame_emu; -+ -+ yp->last_l0 = NULL; -+ yp->last_l1 = NULL; -+ } -+ -+ return 1; -+} -+#endif -+ -+ -+#if RPI_QPU_EMU_Y -+#define mc_terminate_add_y mc_terminate_add_emu -+#else -+#define mc_terminate_add_y mc_terminate_add_qpu -+#endif -+#if RPI_QPU_EMU_C -+#define mc_terminate_add_c mc_terminate_add_emu -+#else -+#define mc_terminate_add_c mc_terminate_add_qpu -+#endif -+ -+ -+static void flush_frame(HEVCRpiContext *s,AVFrame *frame) -+{ -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ rpi_cache_flush_finish(rfe); -+} -+ -+static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; -+ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; -+ const unsigned int ctb_width = s->ps.sps->ctb_width; -+ RpiBlk *const bounds = &jb->bounds; -+ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); -+ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; -+ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; -+ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; -+ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; -+} -+ -+#if RPI_PASSES == 2 -+static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s, jb); -+ -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s, jb); -+} -+#endif -+ -+// Core execution tasks -+static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb) -+{ -+ int pred_y, pred_c; -+ vpu_qpu_job_env_t qvbuf; -+ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); -+#if RPI_WORKER_WAIT_PASS_0 -+ int do_wait; -+#endif -+ -+ { -+ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ if (cf->s[3].n + cf->s[2].n != 0) -+ { -+ const unsigned int csize = sizeof(cf->s[3].buf[0]); -+ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; -+ unsigned int n16 = (cf->s[2].n >> 8); -+ unsigned int n32 = (cf->s[3].n >> 10); -+#if RPI_COMPRESS_COEFFS -+ if (cf->s[2].packed) { -+ n16 = n16 | (n16<<16); -+ } else { -+ const unsigned int npack16 = (cf->s[2].packed_n>>8); -+ n16 = n16 | (npack16<<16); -+ } -+ if (cf->s[3].packed) { -+ n32 = n32 | (n32<<16); -+ } else { -+ const unsigned int npack32 = (cf->s[3].packed_n>>10); -+ n32 = n32 | (npack32<<16); -+ } -+#endif -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(s->ps.sps->bit_depth), -+ vpu_get_constants(), -+ cf->gptr.vc, -+ n16, -+ cf->gptr.vc + offset32, -+ n32, -+ 0); -+ -+ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, 
RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
-+            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
-+        }
-+    }
-+
-+    pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip);
-+
-+// We could take a sync here and try to locally overlap QPU processing with ARM
-+// but testing showed a slightly negative benefit with noticeable extra complexity
-+
-+    pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip);
-+
-+    // Returns 0 if nothing to do, 1 if sync added
-+#if RPI_WORKER_WAIT_PASS_0
-+    do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
-+#else
-+    if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
-+        sem_post(&jb->sem);
-+#endif
-+
-+    rpi_cache_flush_execute(jb->rfe);
-+
-+    // Await progress as required
-+    // jb->waited will only be clear if we have already tested the progress values
-+    // (in worker_submit_job) and found we don't have to wait
-+    if (jb->waited)
-+    {
-+        unsigned int i;
-+        for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
-+            if (jb->progress_req[i] >= 0) {
-+                ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
-+            }
-+        }
-+    }
-+
-+    vpu_qpu_job_finish(vqj);
-+
-+    // We always work on a rectangular block
-+    if (pred_y || pred_c)
-+    {
-+        rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
-+                                        jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
-+                                        ctx_vshift(s, 1), pred_y, pred_c);
-+    }
-+
-+    // If we have emulated VPU ops - do it here
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+    if (av_rpi_is_sand8_frame(s->frame))
-+    {
-+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
-+        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
-+#elif RPI_QPU_EMU_Y
-+        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
-+#else
-+        ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
-+#endif
-+    }
-+    else
-+    {
-+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
-+        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
-+#elif RPI_QPU_EMU_Y
-+        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
-+#else
-+        ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
-+#endif
-+    }
-+#endif
-+
-+#if RPI_WORKER_WAIT_PASS_0
-+    if (do_wait)
-+        rpi_sem_wait(&jb->sem);
-+    rpi_cache_flush_execute(jb->rfe);
-+#endif
-+}
-+
-+
-+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
-+{
-+    av_freep(&ipe->q);
-+    gpu_free(&ipe->gptr);
-+}
-+
-+static HEVCRpiJob * job_new(void)
-+{
-+    HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
-+
-+    if (jb == NULL)
-+        return NULL;
-+
-+    sem_init(&jb->sem, 0, 0);
-+    jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
-+    ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
-+
-+    jb->intra.n = 0;
-+    if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
-+        goto fail1;
-+
-+    // * Sizeof the union structure might be overkill but at the moment it
-+    //   is correct (it certainly isn't going to be too small)
-+    // Set max fill to slack/2 from the end of the Q
-+    // If we exceed this in any Q then we will schedule by size (which should
-+    // mean that we never use that Q again apart from syncs)
-+    // * Given how aggressive the overflow response is we could maybe put the
-+    //   threshold even nearer the end, but I don't expect us to ever hit
-+    //   it on any real stream anyway.
-+ -+ if (rpi_inter_pred_alloc(&jb->chroma_ip, -+ QPU_N_MAX, QPU_N_GRP, -+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t), -+ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0) -+ goto fail2; -+ if (rpi_inter_pred_alloc(&jb->luma_ip, -+ QPU_N_MAX, QPU_N_GRP, -+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t), -+ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0) -+ goto fail3; -+ -+ return jb; -+ -+fail3: -+ rpi_free_inter_pred(&jb->luma_ip); -+fail2: -+ av_freep(&jb->intra.cmds); -+fail1: -+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); -+ rpi_cache_flush_finish(jb->rfe); -+ sem_destroy(&jb->sem); -+ return NULL; -+} -+ -+static void job_delete(HEVCRpiJob * const jb) -+{ -+ worker_pic_free_one(jb); -+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); -+ rpi_free_inter_pred(&jb->chroma_ip); -+ rpi_free_inter_pred(&jb->luma_ip); -+ av_freep(&jb->intra.cmds); -+ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing -+ sem_destroy(&jb->sem); -+ av_free(jb); -+} -+ -+static void jbg_delete(HEVCRpiJobGlobal * const jbg) -+{ -+ HEVCRpiJob * jb; -+ -+ if (jbg == NULL) -+ return; -+ -+ jb = jbg->free1; -+ while (jb != NULL) -+ { -+ HEVCRpiJob * const jb2 = jb; -+ jb = jb2->next; -+ job_delete(jb2); -+ } -+ -+ pthread_mutex_destroy(&jbg->lock); -+ av_free(jbg); -+} -+ -+static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) -+{ -+ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); -+ if (jbg == NULL) -+ return NULL; -+ -+ pthread_mutex_init(&jbg->lock, NULL); -+ -+ while (job_count-- != 0) -+ { -+ HEVCRpiJob * const jb = job_new(); -+ if (jb == NULL) -+ goto fail; -+ -+ jb->next = jbg->free1; -+ jbg->free1 = jb; -+ } -+ -+ return jbg; -+ -+fail: -+ jbg_delete(jbg); -+ return NULL; -+} -+ -+static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) -+{ -+ HEVCRpiJobGlobal * jbg; -+ -+ if (jbc == NULL) -+ return; -+ -+ jbg = jbc->jbg; -+ -+ if (jbc->jb1 != NULL) -+ job_delete(jbc->jb1); -+ -+ pthread_mutex_destroy(&jbc->in_lock); -+ sem_destroy(&jbc->sem_out); -+ av_free(jbc); -+ -+ // Deref the global job context -+ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) -+ jbg_delete(jbg); -+} -+ -+static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg) -+{ -+ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl)); -+ -+ if (jbc == NULL) -+ return NULL; -+ -+ jbc->jbg = jbg; -+ atomic_fetch_add(&jbg->ref_count, 1); -+ -+ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS); -+ pthread_mutex_init(&jbc->in_lock, NULL); -+ -+ if ((jbc->jb1 = job_new()) == NULL) -+ goto fail; -+ jbc->jb1->jbc_local = jbc; -+ -+ return jbc; -+ -+fail: -+ rpi_job_ctl_delete(jbc); -+ return NULL; -+} -+ -+ -+ -+static av_cold void hevc_init_worker(HEVCRpiContext * const s) -+{ -+#if RPI_PASSES == 2 -+ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1); -+#elif RPI_PASSES == 3 -+ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2); -+ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1); -+#else -+#error Passes confused -+#endif -+ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0); -+ -+ pass_queues_start_all(s); -+} -+ -+static av_cold void hevc_exit_worker(HEVCRpiContext *s) -+{ -+ pass_queues_term_all(s); -+ -+ pass_queues_kill_all(s); -+ -+ rpi_job_ctl_delete(s->jbc); -+ s->jbc = NULL; -+} -+ -+ -+static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc) -+{ -+ const int 
ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
-+    const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
-+
-+    // Check for obvious disasters
-+    if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    // If dependent then ctb_addr_ts != 0 from previous check
-+    if (s->sh.dependent_slice_segment_flag) {
-+        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
-+        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
-+            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+    }
-+
-+    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
-+        tile_id + s->sh.num_entry_point_offsets >= tiles)
-+    {
-+        av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    // Tiled stuff must start at start of tile if it has multiple entry points
-+    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
-+        s->sh.num_entry_point_offsets != 0 &&
-+        ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
-+    {
-+        av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    ff_hevc_rpi_cabac_init_decoder(lc);
-+
-+    // Setup any required decode vars
-+    lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
-+
-+//    printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot);
-+    lc->qp_y = s->sh.slice_qp;
-+
-+    // General setup
-+    lc->bt_line_no = 0;
-+    lc->ts = ctb_addr_ts;
-+    return 0;
-+}
-+
-+static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
-+{
-+    const GetBitContext * const gb = &s->HEVClc->gb;
-+    RpiSliceHeader * const sh = &s->sh;
-+    int i, j;
-+
-+    const unsigned int length = nal->size;
-+    unsigned int offset = ((gb->index) >> 3) + 1;  // We have a bit & align still to come = +1 byte
-+    unsigned int cmpt;
-+    unsigned int startheader;
-+
-+    if (sh->num_entry_point_offsets == 0) {
-+        s->data = NULL;
-+        return 0;
-+    }
-+
-+    // offset in slice header includes emulation prevention bytes.
-+    // Unfortunately those have been removed by the time we get here so we
-+    // have to compensate.  The nal layer keeps track of where they were.
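-+    // cmpt counts the emulation prevention bytes that fell inside the current
-+    // entry's span of the original byte stream; each one makes the de-escaped
-+    // data one byte shorter, so it is subtracted from the offsets/sizes below.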
-+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) { -+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { -+ startheader--; -+ cmpt++; -+ } -+ } -+ -+ for (i = 1; i < sh->num_entry_point_offsets; i++) { -+ offset += (sh->entry_point_offset[i - 1] - cmpt); -+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) { -+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { -+ startheader--; -+ cmpt++; -+ } -+ } -+ if (sh->entry_point_offset[i] <= cmpt) { -+ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt; -+ sh->offset[i - 1] = offset; -+ } -+ -+ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt; -+ if (length < offset) { -+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ sh->size[sh->num_entry_point_offsets - 1] = length - offset; -+ sh->offset[sh->num_entry_point_offsets - 1] = offset; -+ -+ // Remember data start pointer as we won't have nal later -+ s->data = nal->data; -+ return 0; -+} -+ -+ -+// Return -+// < 0 Error -+// 0 OK -+// -+// jb->ctu_ts_last < 0 Job still filling -+// jb->ctu_ts_last >= 0 Job ready -+ -+static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) -+{ -+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; -+ const unsigned int ctb_size = (1 << log2_ctb_size); -+ HEVCRpiJob * const jb = lc->jb0; -+ int more_data = 1; -+ unsigned int ctb_addr_ts = lc->ts; -+ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size; -+ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size; -+ -+ lc->unit_done = 0; -+ -+ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) -+ { -+ int q_full; -+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; -+ -+ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); -+ -+ ff_hevc_rpi_cabac_init(s, lc, ctb_flags); -+ -+ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size); -+ -+ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; -+ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; -+ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+ -+ // Zap stashes if navail -+ if ((lc->ctb_avail & AVAIL_U) == 0) -+ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3); -+ if ((lc->ctb_avail & AVAIL_L) == 0) -+ { -+ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE); -+ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3); -+ } -+#if MVF_STASH_WIDTH > 64 -+ // Restore left mvf stash at start of tile if not at start of line -+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) -+ { -+ unsigned int i; -+ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0); -+ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); -+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) -+ { -+ *dst = *src++; -+ dst += MVF_STASH_WIDTH_PU; -+ } -+ } -+#endif -+ -+ // Set initial tu states -+ lc->tu.cu_qp_delta = 0; -+ lc->tu.is_cu_qp_delta_wanted = 0; -+ lc->tu.cu_chroma_qp_offset_wanted = 0; -+ -+ // Decode -+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0); -+ -+ if 
(ff_hevc_rpi_cabac_overflow(lc))
-+        {
-+            av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n ");
-+            more_data = AVERROR_INVALIDDATA;
-+        }
-+
-+        if (more_data < 0) {
-+            s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN;  // Mark slice as broken
-+            return more_data;
-+        }
-+
-+        if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
-+             (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
-+        {
-+            if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
-+                ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
-+            {
-+                av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n ");
-+                return -1;
-+            }
-+        }
-+
-+        // --- Post CTB processing
-+
-+        // Stash rpl top/left for deblock that needs to remember such things cross-slice
-+        s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
-+        s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
-+
-+        if (!s->is_irap)
-+        {
-+            // Copy MVF up to up-left & stash to up
-+            {
-+                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
-+                HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
-+
-+//                printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
-+
-+                lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
-+                memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
-+            }
-+            // Stash sideways if end of tile line but not end of line (no point)
-+            // ** Could/should do this @ end of fn
-+#if MVF_STASH_WIDTH > 64
-+            if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
-+#endif
-+            {
-+                unsigned int i;
-+                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
-+                HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
-+                for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
-+                {
-+                    *dst++ = *src;
-+                    src += MVF_STASH_WIDTH_PU;
-+                }
-+            }
-+        }
-+
-+        if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
-+            ff_hevc_rpi_save_states(s, lc);
-+
-+        // Report progress so we can use our MVs in other frames
-+        if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
-+            ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
-+
-+        // End of line || End of tile line || End of tile
-+        // (EoL covers end of frame for our purposes here)
-+        q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
-+
-+        // Allocate QPU chunks on fixed size 64 pel boundaries rather than
-+        // whatever ctb_size is today.
-+        // * We might quite like to continue to 64 pel vertical too but that
-+        //   currently confuses WPP
-+        if (((x_ctb + ctb_size) & 63) == 0 || q_full)
-+        {
-+            int overflow = 0;
-+            if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
-+                overflow = 1;
-+            if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
-+                overflow = 1;
-+            if (overflow)
-+            {
-+                // * This is very annoying (and slow) to cope with in WPP so
-+                //   we treat it as an error there (no known stream triggers this
-+                //   with the current buffer sizes).  Non-wpp should cope fine.
-+                av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
-+                q_full = 1;
-+            }
-+        }
-+
-+        // Inc TS to next.
-+        ctb_addr_ts++;
-+        ctb_addr_rs++;
-+        x_ctb += ctb_size;
-+
-+        if (q_full)
-+        {
-+            // Do job
-+            // Prep for submission
-+            jb->ctu_ts_last = ctb_addr_ts - 1;  // Was pre-inced
-+            job_gen_bounds(s, jb);
-+            break;
-+        }
-+
-+        // If max_blocks started as 0 then this will never be true
-+        if (--max_blocks == 0)
-+            break;
-+    }
-+
-+    lc->unit_done = (more_data <= 0);
-+    lc->ts = ctb_addr_ts;
-+    return 0;
-+}
-+
-+static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
-+{
-+    lc->context = s;
-+    lc->jb0 = NULL;
-+    lc->lc_n = n;
-+    lc->bt_terminate = 0;
-+    lc->bt_psem_out = NULL;
-+    sem_init(&lc->bt_sem_in, 0, 0);
-+}
-+
-+#define TRACE_WPP 0
-+#if RPI_EXTRA_BIT_THREADS > 0
-+static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
-+{
-+    unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
-+    return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
-+}
-+
-+// Move local context parameters from an aux bit thread back to the main
-+// thread at the end of a slice as processing is going to continue there.
-+static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
-+{
-+    if (src_lc == dst_lc) {
-+        return;
-+    }
-+
-+    // Move the job
-+    // We will still have an active job if the final line terminates early
-+    // Dest should always be null by now
-+    av_assert1(dst_lc->jb0 == NULL);
-+    dst_lc->jb0 = src_lc->jb0;
-+    src_lc->jb0 = NULL;
-+
-+    // Always need to store where we are in the bitstream
-+    dst_lc->ts = src_lc->ts;
-+    dst_lc->gb = src_lc->gb;
-+    // Cabac init request will be built at start of next slice
-+
-+    // Need to store context if we might have a dependent seg
-+    if (is_dep)
-+    {
-+        dst_lc->qPy_pred = src_lc->qPy_pred;
-+        memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
-+        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
-+        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
-+    }
-+}
-+
-+static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
-+{
-+    rpi_sem_wait(&lc->bt_sem_in);
-+    return lc->bt_terminate;
-+}
-+
-+// Do one WPP line
-+// Will not work correctly over horizontal tile boundaries - vertical should be OK
-+static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
-+{
-+    const int is_tile = lc->bt_is_tile;
-+    const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
-+    const unsigned int line = lc->bt_line_no;
-+    const unsigned int line_inc = lc->bt_line_inc;
-+    const int is_last = (line >= lc->bt_last_line);
-+
-+    const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
-+    const unsigned int ts_next =
-+        line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
-+            INT_MAX :
-+        is_tile ?
-+            s->ps.pps->tile_pos_ts[tile_id + line_inc] :
-+            lc->ts + lc->bt_line_width * line_inc;
-+    // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
-+    const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
-+    unsigned int ts_prev;
-+    int loop_n = 0;
-+    int err = 0;
-+
-+    av_assert1(line <= s->sh.num_entry_point_offsets);
-+
-+#if TRACE_WPP
-+    printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
-+           lc->lc_n, is_tile ?
"Tile" : "WPP", tile_id, -+ line, lc->bt_last_line, s->sh.num_entry_point_offsets, -+ lc->ts, ts_eol, ts_next, partial_size, lc->jb0); -+#endif -+ if (line != 0) -+ { -+ const uint8_t * const data = s->data + s->sh.offset[line - 1]; -+ const unsigned int len = s->sh.size[line - 1]; -+ if ((err = init_get_bits8(&lc->gb, data, len)) < 0) -+ return err; -+ -+ ff_init_cabac_decoder(&lc->cc, data, len); -+ } -+ -+ // We should never be processing a dependent slice here so reset is good -+ // ?? These probably shouldn't be needed (as they should be set by later -+ // logic) but do seem to be required -+ lc->qp_y = s->sh.slice_qp; -+ -+ do -+ { -+ if (!is_last && loop_n > 1) { -+#if TRACE_WPP -+ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out); -+#endif -+ sem_post(lc->bt_psem_out); -+ } -+ // The wait for loop_n == 0 has been done in bit_thread -+ if (!is_first && loop_n != 0) -+ { -+#if TRACE_WPP -+ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in); -+#endif -+ if (wait_bt_sem_in(lc) != 0) -+ return AVERROR_EXIT; -+ } -+ -+#if TRACE_WPP -+ { -+ int n; -+ sem_getvalue(&lc->bt_sem_in, &n); -+ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in); -+ } -+#endif -+ -+ ts_prev = lc->ts; -+ -+ // If we have had an error - do no further decode but do continue -+ // moving signals around so the other threads continue to operate -+ // correctly (or at least as correctly as they can with this line missing) -+ // -+ // Errors in WPP/Tile are less fatal than normal as we have a good idea -+ // of how to restart on the next line so there is no need to give up totally -+ if (err != 0) -+ { -+ lc->unit_done = 0; -+ lc->ts += partial_size; -+ } -+ else -+ { -+ worker_pass0_ready(s, lc); -+ -+ if ((err = fill_job(s, lc, partial_size)) < 0 || -+ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done))) -+ { -+ if (err == 0) { -+ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); -+ err = AVERROR_INVALIDDATA; -+ } -+ worker_free(s, lc); -+ lc->ts = ts_prev + partial_size; // Pretend we did all that -+ lc->unit_done = 0; -+ } -+ else if (is_tile) -+ { -+ worker_submit_job(s, lc); -+ } -+ } -+ -+ ++loop_n; -+ } while (lc->ts < ts_eol && !lc->unit_done); -+ -+ // If we are on the last line & we didn't get a whole line we must wait for -+ // and sink the sem_posts from the line above / tile to the left. 
-+    while ((ts_prev += partial_size) < ts_eol)
-+    {
-+#if TRACE_WPP
-+        printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
-+#endif
-+        if (wait_bt_sem_in(lc) != 0)
-+            return AVERROR_EXIT;
-+    }
-+
-+    lc->bt_line_no += line_inc;
-+
-+    if (!is_tile && err == 0)
-+        worker_submit_job(s, lc);
-+
-+    if (!is_last) {
-+        lc->ts = ts_next;
-+
-+#if TRACE_WPP
-+        printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
-+#endif
-+        sem_post(lc->bt_psem_out);
-+        if (loop_n > 1) {
-+#if TRACE_WPP
-+            printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
-+#endif
-+            sem_post(lc->bt_psem_out);
-+        }
-+    }
-+    else
-+    {
-+        movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);  // * & not EoT
-+#if MVF_STASH_WIDTH > 64
-+        // Horrid calculations to work out what we want but luckily this should almost never execute
-+        // **** Move to movlc
-+        if (!s->is_irap)
-+        {
-+            const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
-+            if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0)  // If EOTL then we have already stashed mvf
-+            {
-+                const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
-+                unsigned int i;
-+                const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
-+                HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
-+
-+                for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
-+                {
-+                    *d_mvf = *s_mvf;
-+                    d_mvf += MVF_STASH_WIDTH_PU;
-+                    s_mvf += MVF_STASH_WIDTH_PU;
-+                }
-+
-+            }
-+        }
-+#endif
-+        // When all done poke the thread 0 sem_in one final time
-+#if TRACE_WPP
-+        printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
-+#endif
-+        sem_post(&s->HEVClcList[0]->bt_sem_in);
-+    }
-+
-+#if TRACE_WPP
-+    printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
-+#endif
-+    return err;
-+}
-+
-+static void wpp_setup_lcs(HEVCRpiContext * const s)
-+{
-+    unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    const unsigned int line_width = line_ts_width(s, ts);
-+
-+    for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
-+    {
-+        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
-+        lc->ts = ts;
-+        lc->bt_is_tile = 0;
-+        lc->bt_line_no = i;
-+        lc->bt_line_width = line_width;
-+        lc->bt_last_line = s->sh.num_entry_point_offsets;
-+        lc->bt_line_inc = RPI_BIT_THREADS;
-+        ts += line_width;
-+    }
-+}
-+
-+
-+// Can only process a single tile row at once
-+static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
-+{
-+    const HEVCRpiPPS * const pps = s->ps.pps;
-+    const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    const unsigned int tile0 = pps->tile_id[ts0];
-+    const unsigned int col0 = tile0 % pps->num_tile_columns;
-+
-+    const unsigned int col = (slice_row == 0) ?
col0 : 0; -+ unsigned int line = slice_row * pps->num_tile_columns - col0 + col; -+ const unsigned int last_line = FFMIN( -+ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets); -+ -+ const unsigned int par = -+ FFMIN(RPI_BIT_THREADS, last_line + 1 - line); -+#if TRACE_WPP -+ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row, -+ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line); -+#endif -+ for (unsigned int i = 0; i != par; ++i, ++line) -+ { -+ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; -+ const unsigned int tile = tile0 + line; -+ -+ lc->ts = pps->tile_pos_ts[tile]; -+ lc->bt_line_no = line; -+ lc->bt_is_tile = 1; -+ lc->bt_line_width = line_ts_width(s, lc->ts); -+ lc->bt_last_line = last_line; -+ lc->bt_line_inc = par; -+ } -+} -+ -+ -+static void * bit_thread(void * v) -+{ -+ HEVCRpiLocalContext * const lc = v; -+ HEVCRpiContext *const s = lc->context; -+ -+ while (wait_bt_sem_in(lc) == 0) -+ { -+ int err; -+ -+ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp -+ if (lc->bt_terminate) { -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__); -+ break; -+ } -+ av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err); -+ } -+ } -+ -+ return NULL; -+} -+ -+static int bit_threads_start(HEVCRpiContext * const s) -+{ -+ if (s->bt_started) -+ return 0; -+ -+ for (int i = 1; i < RPI_BIT_THREADS; ++i) -+ { -+ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS] -+ if (s->HEVClcList[i] == NULL) { -+ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL) -+ return -1; -+ } -+ -+ bt_lc_init(s, s->HEVClcList[i], i); -+ job_lc_init(s->HEVClcList[i]); -+ } -+ -+ // Link the sems in a circle -+ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i) -+ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in; -+ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in; -+ -+ // Init all lc before starting any threads -+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) -+ { -+ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0) -+ return -1; -+ } -+ -+ s->bt_started = 1; -+ return 0; -+} -+ -+static int bit_threads_kill(HEVCRpiContext * const s) -+{ -+ if (!s->bt_started) -+ return 0; -+ s->bt_started = 0; -+ -+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) -+ { -+ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1]; -+ if (lc == NULL) -+ break; -+ -+ lc->bt_terminate = 1; -+ sem_post(&lc->bt_sem_in); -+ pthread_join(s->bit_threads[i], NULL); -+ -+ sem_destroy(&lc->bt_sem_in); -+ job_lc_kill(lc); -+ } -+ return 0; -+} -+#endif -+ -+ -+// If we are at EoT and the row is shorter than the number of jobs -+// we can Q we have to wait for it finish otherwise we risk cache/QPU -+// disasters -+static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n) -+{ -+ return -+ s->ps.pps->tile_wpp_inter_disable >= 2 && -+ s->sh.slice_type != HEVC_SLICE_I && -+ n >= 0 && -+ (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT; -+} -+ -+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread) -+{ -+ HEVCRpiContext * const s = avctxt->priv_data; -+ HEVCRpiLocalContext * const lc = s->HEVClc; -+ int err; -+ -+ // Start of slice -+ if ((err = slice_start(s, lc)) != 0) -+ return err; -+ -+#if RPI_EXTRA_BIT_THREADS > 0 -+ -+ if (s->sh.offload_tiles) -+ { -+ unsigned int slice_row = 0; 
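-+        // Tile rows are handled one at a time: the main thread decodes the
-+        // first tile of the row, the extra bit threads (woken via the
-+        // bt_sem_in ring) take the rest, and we loop until every entry
-+        // point has been consumed.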
-+
-+#if TRACE_WPP
-+        printf("%s: Do Tiles\n", __func__);
-+#endif
-+        // Generate & start extra bit threads if they aren't already running
-+        bit_threads_start(s);
-+
-+        do
-+        {
-+            // Reset lc lines etc.
-+            tile_one_row_setup_lcs(s, slice_row);
-+
-+#if TRACE_WPP
-+            printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
-+                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
-+#endif
-+
-+            rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
-+#if TRACE_WPP
-+            printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
-+                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
-+#endif
-+
-+            while (lc->bt_line_no <= lc->bt_last_line) {
-+                rpi_sem_wait(&lc->bt_sem_in);
-+                rpi_run_one_line(s, lc, 0);
-+            }
-+#if TRACE_WPP
-+            printf("%s: Done body\n", __func__);
-+#endif
-+
-+            // Wait for everything else to finish
-+            rpi_sem_wait(&lc->bt_sem_in);
-+
-+            ++slice_row;
-+        } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
-+
-+
-+#if TRACE_WPP
-+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
-+#endif
-+    }
-+    else if (s->sh.offload_wpp)
-+    {
-+#if TRACE_WPP
-+        printf("%s: Do WPP\n", __func__);
-+#endif
-+        // Generate & start extra bit threads if they aren't already running
-+        bit_threads_start(s);
-+
-+        // Reset lc lines etc.
-+        wpp_setup_lcs(s);
-+
-+        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
-+#if TRACE_WPP
-+        printf("%s: Done 1st\n", __func__);
-+#endif
-+
-+        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
-+            rpi_sem_wait(&lc->bt_sem_in);
-+            rpi_run_one_line(s, lc, 0);
-+        }
-+#if TRACE_WPP
-+        printf("%s: Done body\n", __func__);
-+#endif
-+
-+        // Wait for everything else to finish
-+        rpi_sem_wait(&lc->bt_sem_in);
-+
-+#if TRACE_WPP
-+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
-+#endif
-+    }
-+    else
-+#endif
-+    {
-+#if TRACE_WPP
-+        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
-+#endif
-+        // Single bit thread
-+        do {
-+            // Make sure we have space to prepare the next job
-+            worker_pass0_ready(s, lc);
-+
-+            if ((err = fill_job(s, lc, 0)) < 0)
-+                goto fail;
-+
-+            worker_submit_job(s, lc);
-+
-+            if (tile_needs_wait(s, lc->ts - 1))
-+                worker_wait(s, lc);
-+
-+        } while (!lc->unit_done);
-+
-+#if TRACE_WPP
-+        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
-+#endif
-+    }
-+
-+    // If we have reached the end of the frame then wait for the worker
-+    // to finish all its jobs
-+    if (lc->ts >= s->ps.sps->ctb_size)
-+        worker_wait(s, lc);
-+
-+#if RPI_TSTATS
-+    {
-+        HEVCRpiStats *const ts = &s->tstats;
-+
-+        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
-+               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
-+               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
-+               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
-+               ts->y_pred2_hgt16, ts->y_pred2_hle16);
-+        memset(ts, 0, sizeof(*ts));
-+    }
-+#endif
-+
-+    return lc->ts;
-+
-+fail:
-+    // Cleanup
-+    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
-+    // Free our job & wait for termination
-+    worker_free(s, lc);
-+    worker_wait(s, lc);
-+    return err;
-+}
-+
-+
-+static void set_no_backward_pred(HEVCRpiContext * const s)
-+{
-+    int i, j;
-+    const RefPicList *const refPicList = s->refPicList;
-+
-+    s->no_backward_pred_flag = 0;
-+    if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
-+        return;
-+
-+    for (j = 0; j < 2; j++) {
-+        for (i = 0; i < refPicList[j].nb_refs;
i++) { -+ if (refPicList[j].list[i] > s->poc) { -+ s->no_backward_pred_flag = 1; -+ return; -+ } -+ } -+ } -+} -+ -+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal) -+{ -+ int err; -+ if ((err = gen_entry_points(s, nal)) < 0) -+ return err; -+ -+ set_no_backward_pred(s); -+ -+ return rpi_decode_entry(s->avctx, NULL); -+} -+ -+static int set_side_data(HEVCRpiContext *s) -+{ -+ AVFrame *out = s->ref->frame; -+ -+ if (s->sei.frame_packing.present && -+ s->sei.frame_packing.arrangement_type >= 3 && -+ s->sei.frame_packing.arrangement_type <= 5 && -+ s->sei.frame_packing.content_interpretation_type > 0 && -+ s->sei.frame_packing.content_interpretation_type < 3) { -+ AVStereo3D *stereo = av_stereo3d_create_side_data(out); -+ if (!stereo) -+ return AVERROR(ENOMEM); -+ -+ switch (s->sei.frame_packing.arrangement_type) { -+ case 3: -+ if (s->sei.frame_packing.quincunx_subsampling) -+ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX; -+ else -+ stereo->type = AV_STEREO3D_SIDEBYSIDE; -+ break; -+ case 4: -+ stereo->type = AV_STEREO3D_TOPBOTTOM; -+ break; -+ case 5: -+ stereo->type = AV_STEREO3D_FRAMESEQUENCE; -+ break; -+ } -+ -+ if (s->sei.frame_packing.content_interpretation_type == 2) -+ stereo->flags = AV_STEREO3D_FLAG_INVERT; -+ -+ if (s->sei.frame_packing.arrangement_type == 5) { -+ if (s->sei.frame_packing.current_frame_is_frame0_flag) -+ stereo->view = AV_STEREO3D_VIEW_LEFT; -+ else -+ stereo->view = AV_STEREO3D_VIEW_RIGHT; -+ } -+ } -+ -+ if (s->sei.display_orientation.present && -+ (s->sei.display_orientation.anticlockwise_rotation || -+ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) { -+ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16); -+ AVFrameSideData *rotation = av_frame_new_side_data(out, -+ AV_FRAME_DATA_DISPLAYMATRIX, -+ sizeof(int32_t) * 9); -+ if (!rotation) -+ return AVERROR(ENOMEM); -+ -+ av_display_rotation_set((int32_t *)rotation->data, angle); -+ av_display_matrix_flip((int32_t *)rotation->data, -+ s->sei.display_orientation.hflip, -+ s->sei.display_orientation.vflip); -+ } -+ -+ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 -+ // so the side data persists for the entire coded video sequence. 
-+    if (s->sei.mastering_display.present > 0 &&
-+        IS_IRAP(s) && s->no_rasl_output_flag) {
-+        s->sei.mastering_display.present--;
-+    }
-+    if (s->sei.mastering_display.present) {
-+        // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
-+        const int mapping[3] = {2, 0, 1};
-+        const int chroma_den = 50000;
-+        const int luma_den = 10000;
-+        int i;
-+        AVMasteringDisplayMetadata *metadata =
-+            av_mastering_display_metadata_create_side_data(out);
-+        if (!metadata)
-+            return AVERROR(ENOMEM);
-+
-+        for (i = 0; i < 3; i++) {
-+            const int j = mapping[i];
-+            metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
-+            metadata->display_primaries[i][0].den = chroma_den;
-+            metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
-+            metadata->display_primaries[i][1].den = chroma_den;
-+        }
-+        metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
-+        metadata->white_point[0].den = chroma_den;
-+        metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
-+        metadata->white_point[1].den = chroma_den;
-+
-+        metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
-+        metadata->max_luminance.den = luma_den;
-+        metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
-+        metadata->min_luminance.den = luma_den;
-+        metadata->has_luminance = 1;
-+        metadata->has_primaries = 1;
-+
-+        av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
-+        av_log(s->avctx, AV_LOG_DEBUG,
-+               "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
-+               av_q2d(metadata->display_primaries[0][0]),
-+               av_q2d(metadata->display_primaries[0][1]),
-+               av_q2d(metadata->display_primaries[1][0]),
-+               av_q2d(metadata->display_primaries[1][1]),
-+               av_q2d(metadata->display_primaries[2][0]),
-+               av_q2d(metadata->display_primaries[2][1]),
-+               av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
-+        av_log(s->avctx, AV_LOG_DEBUG,
-+               "min_luminance=%f, max_luminance=%f\n",
-+               av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
-+    }
-+    // Decrement the content light level flag when IRAP frame has no_rasl_output_flag=1
-+    // so the side data persists for the entire coded video sequence.
-+ if (s->sei.content_light.present > 0 && -+ IS_IRAP(s) && s->no_rasl_output_flag) { -+ s->sei.content_light.present--; -+ } -+ if (s->sei.content_light.present) { -+ AVContentLightMetadata *metadata = -+ av_content_light_metadata_create_side_data(out); -+ if (!metadata) -+ return AVERROR(ENOMEM); -+ metadata->MaxCLL = s->sei.content_light.max_content_light_level; -+ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level; -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n"); -+ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n", -+ metadata->MaxCLL, metadata->MaxFALL); -+ } -+ -+ if (s->sei.a53_caption.a53_caption) { -+ AVFrameSideData* sd = av_frame_new_side_data(out, -+ AV_FRAME_DATA_A53_CC, -+ s->sei.a53_caption.a53_caption_size); -+ if (sd) -+ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size); -+ av_freep(&s->sei.a53_caption.a53_caption); -+ s->sei.a53_caption.a53_caption_size = 0; -+ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS; -+ } -+ -+ if (s->sei.alternative_transfer.present && -+ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) && -+ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) { -+ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics; -+ } -+ -+ return 0; -+} -+ -+static int hevc_frame_start(HEVCRpiContext * const s) -+{ -+ int ret; -+ -+ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too -+ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); -+ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address)); -+ -+ // Only need to remember intra for CIP -+ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap) -+ s->is_intra = NULL; -+ else -+ { -+ s->is_intra = s->is_intra_store; -+ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); -+ } -+ -+ s->is_decoded = 0; -+ s->first_nal_type = s->nal_unit_type; -+ -+ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); -+ -+ if (s->pkt.nb_nals > s->rpl_tab_size) -+ { -+ // In most cases it will be faster to free & realloc as that doesn't -+ // require (an unwanted) copy -+ av_freep(&s->rpl_tab); -+ s->rpl_tab_size = 0; -+ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) -+ goto fail; -+ s->rpl_tab_size = s->pkt.nb_nals; -+ } -+ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab)); -+ -+ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc); -+ if (ret < 0) -+ goto fail; -+ -+ // Resize rpl_tab to max that we might want -+ ret = ff_hevc_rpi_frame_rps(s); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n"); -+ goto fail; -+ } -+ -+ s->ref->frame->key_frame = IS_IRAP(s); -+ -+ ret = set_side_data(s); -+ if (ret < 0) -+ goto fail; -+ -+ s->frame->pict_type = 3 - s->sh.slice_type; -+ -+ if (!IS_IRAP(s)) -+ ff_hevc_rpi_bump_frame(s); -+ -+ av_frame_unref(s->output_frame); -+ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0); -+ if (ret < 0) -+ goto fail; -+ -+ ff_thread_finish_setup(s->avctx); -+ -+ return 0; -+ -+fail: -+ if (s->ref) -+ ff_hevc_rpi_unref_frame(s, s->ref, ~0); -+ s->ref = NULL; -+ return ret; -+} -+ -+static inline int is_non_ref_unit_type(const unsigned int nal_unit_type) -+{ -+ // From Table 7-1 -+ return (nal_unit_type & ~0xe) == 0; // True for 0, 2, 4, 6, 8, 10, 12, 14 -+} -+ -+static int 
decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal) -+{ -+ GetBitContext * const gb = &s->HEVClc->gb; -+ int ctb_addr_ts, ret; -+ -+ *gb = nal->gb; -+ s->nal_unit_type = nal->type; -+ s->temporal_id = nal->temporal_id; -+ -+ switch (s->nal_unit_type) { -+ case HEVC_NAL_VPS: -+ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_SPS: -+ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps, -+ s->apply_defdispwin); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_PPS: -+ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_SEI_PREFIX: -+ case HEVC_NAL_SEI_SUFFIX: -+ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type); -+ if (ret < 0) -+ goto fail; -+ break; -+ case HEVC_NAL_TRAIL_R: -+ case HEVC_NAL_TRAIL_N: -+ case HEVC_NAL_TSA_N: -+ case HEVC_NAL_TSA_R: -+ case HEVC_NAL_STSA_N: -+ case HEVC_NAL_STSA_R: -+ case HEVC_NAL_BLA_W_LP: -+ case HEVC_NAL_BLA_W_RADL: -+ case HEVC_NAL_BLA_N_LP: -+ case HEVC_NAL_IDR_W_RADL: -+ case HEVC_NAL_IDR_N_LP: -+ case HEVC_NAL_CRA_NUT: -+ case HEVC_NAL_RADL_N: -+ case HEVC_NAL_RADL_R: -+ case HEVC_NAL_RASL_N: -+ case HEVC_NAL_RASL_R: -+ ret = hls_slice_header(s); -+ if (ret < 0) -+ return ret; -+ -+ // The definition of _N unit types is "non-reference for other frames -+ // with the same temporal_id" so they may/will be ref frames for pics -+ // with a higher temporal_id. -+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || -+ !is_non_ref_unit_type(s->nal_unit_type); -+ s->offload_recon = s->threads_type != 0 && s->used_for_ref; -+ s->is_irap = IS_IRAP(s); -+ -+#if DEBUG_DECODE_N -+ { -+ static int z = 0; -+ if (IS_IDR(s)) { -+ z = 1; -+ } -+ if (z != 0 && z++ > DEBUG_DECODE_N) { -+ s->is_decoded = 0; -+ break; -+ } -+ } -+#endif -+ if ( -+ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || -+ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) || -+ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || -+ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s))) -+ { -+ s->is_decoded = 0; -+ break; -+ } -+ -+ if (s->sh.first_slice_in_pic_flag) { -+ if (s->max_ra == INT_MAX) { -+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { -+ s->max_ra = s->poc; -+ } else { -+ if (IS_IDR(s)) -+ s->max_ra = INT_MIN; -+ } -+ } -+ -+ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) && -+ s->poc <= s->max_ra) { -+ s->is_decoded = 0; -+ break; -+ } else { -+ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra) -+ s->max_ra = INT_MIN; -+ } -+ -+ ret = hevc_frame_start(s); -+ if (ret < 0) -+ return ret; -+ } else if (!s->ref) { -+ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n"); -+ goto fail; -+ } -+ -+ if (s->nal_unit_type != s->first_nal_type) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Non-matching NAL types of the VCL NALUs: %d %d\n", -+ s->first_nal_type, s->nal_unit_type); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (!s->sh.dependent_slice_segment_flag && -+ s->sh.slice_type != HEVC_SLICE_I) { -+ ret = ff_hevc_rpi_slice_rpl(s); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Error constructing the reference lists for the current slice.\n"); -+ goto fail; -+ } -+ } -+ -+ ctb_addr_ts = hls_slice_data(s, nal); -+ if (ctb_addr_ts >= s->ps.sps->ctb_size) { -+ s->is_decoded = 1; -+ } -+ -+ if (ctb_addr_ts < 0) { -+ ret = ctb_addr_ts; -+ goto fail; -+ } -+ 
break; -+ case HEVC_NAL_EOS_NUT: -+ case HEVC_NAL_EOB_NUT: -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ break; -+ case HEVC_NAL_AUD: -+ case HEVC_NAL_FD_NUT: -+ break; -+ default: -+ av_log(s->avctx, AV_LOG_INFO, -+ "Skipping NAL unit %d\n", s->nal_unit_type); -+ } -+ -+ return 0; -+fail: -+ if (s->avctx->err_recognition & AV_EF_EXPLODE) -+ return ret; -+ return 0; -+} -+ -+static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length) -+{ -+ int i, ret = 0; -+ int eos_at_start = 1; -+ -+ s->ref = NULL; -+ s->last_eos = s->eos; -+ s->eos = 0; -+ -+ /* split the input packet into NAL units, so we know the upper bound on the -+ * number of slices in the frame */ -+ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, -+ s->nal_length_size, s->avctx->codec_id, 0, 0); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_ERROR, -+ "Error splitting the input into NAL units.\n"); -+ return ret; -+ } -+ -+ for (i = 0; i < s->pkt.nb_nals; i++) { -+ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT || -+ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) { -+ if (eos_at_start) { -+ s->last_eos = 1; -+ } else { -+ s->eos = 1; -+ } -+ } else { -+ eos_at_start = 0; -+ } -+ } -+ -+ /* decode the NAL units */ -+ for (i = 0; i < s->pkt.nb_nals; i++) { -+ ret = decode_nal_unit(s, &s->pkt.nals[i]); -+ if (ret < 0) { -+ av_log(s->avctx, AV_LOG_WARNING, -+ "Error parsing NAL unit #%d.\n", i); -+ goto fail; -+ } -+ } -+ -+fail: // Also success path -+ if (s->ref != NULL) { -+ if (s->used_for_ref && s->threads_type != 0) { -+ ff_hevc_rpi_progress_signal_all_done(s); -+ } -+ else { -+ // Flush frame to real memory as we expect to be able to pass -+ // it straight on to mmal -+ flush_frame(s, s->frame); -+ } -+ } -+ return ret; -+} -+ -+static void print_md5(void *log_ctx, int level, uint8_t md5[16]) -+{ -+ int i; -+ for (i = 0; i < 16; i++) -+ av_log(log_ctx, level, "%02"PRIx8, md5[i]); -+} -+ -+static int verify_md5(HEVCRpiContext *s, AVFrame *frame) -+{ -+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); -+ int pixel_shift; -+ int i, j; -+ -+ if (!desc) -+ return AVERROR(EINVAL); -+ -+ pixel_shift = desc->comp[0].depth > 8; -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ", -+ s->poc); -+ -+ /* the checksums are LE, so we have to byteswap for >8bpp formats -+ * on BE arches */ -+#if HAVE_BIGENDIAN -+ if (pixel_shift && !s->checksum_buf) { -+ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size, -+ FFMAX3(frame->linesize[0], frame->linesize[1], -+ frame->linesize[2])); -+ if (!s->checksum_buf) -+ return AVERROR(ENOMEM); -+ } -+#endif -+ -+ for (i = 0; frame->data[i]; i++) { -+ int width = s->avctx->coded_width; -+ int height = s->avctx->coded_height; -+ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width; -+ int h = (i == 1 || i == 2) ? 
(height >> desc->log2_chroma_h) : height; -+ uint8_t md5[16]; -+ -+ av_md5_init(s->md5_ctx); -+ for (j = 0; j < h; j++) { -+ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); -+#if HAVE_BIGENDIAN -+ if (pixel_shift) { -+ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf, -+ (const uint16_t *) src, w); -+ src = s->checksum_buf; -+ } -+#endif -+ av_md5_update(s->md5_ctx, src, w << pixel_shift); -+ } -+ av_md5_final(s->md5_ctx, md5); -+ -+ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { -+ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); -+ print_md5(s->avctx, AV_LOG_DEBUG, md5); -+ av_log (s->avctx, AV_LOG_DEBUG, "; "); -+ } else { -+ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i); -+ print_md5(s->avctx, AV_LOG_ERROR, md5); -+ av_log (s->avctx, AV_LOG_ERROR, " != "); -+ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]); -+ av_log (s->avctx, AV_LOG_ERROR, "\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ av_log(s->avctx, AV_LOG_DEBUG, "\n"); -+ -+ return 0; -+} -+ -+static int all_sps_supported(const HEVCRpiContext * const s) -+{ -+ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ if (s->ps.sps_list[i] != NULL) -+ { -+ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; -+ if (!is_sps_supported(sps)) -+ return 0; -+ } -+ } -+ return 1; -+} -+ -+static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first) -+{ -+ int ret, i; -+ -+ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff, -+ &s->nal_length_size, s->avctx->err_recognition, -+ s->apply_defdispwin, s->avctx); -+ if (ret < 0) -+ return ret; -+ -+ /* export stream parameters from the first SPS */ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ if (first && s->ps.sps_list[i]) { -+ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; -+ export_stream_params(s->avctx, &s->ps, sps); -+ break; -+ } -+ } -+ -+ return 0; -+} -+ -+static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output, -+ AVPacket *avpkt) -+{ -+ int ret; -+ int new_extradata_size; -+ uint8_t *new_extradata; -+ HEVCRpiContext *s = avctx->priv_data; -+ -+ if (!avpkt->size) { -+ ret = ff_hevc_rpi_output_frame(s, data, 1); -+ if (ret < 0) -+ return ret; -+ -+ *got_output = ret; -+ return 0; -+ } -+ -+ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, -+ &new_extradata_size); -+ if (new_extradata && new_extradata_size > 0) { -+ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0); -+ if (ret < 0) -+ return ret; -+ } -+ -+ s->ref = NULL; -+ ret = decode_nal_units(s, avpkt->data, avpkt->size); -+ if (ret < 0) -+ return ret; -+ -+ /* verify the SEI checksum */ -+ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded && -+ s->sei.picture_hash.is_md5) { -+ ret = verify_md5(s, s->ref->frame); -+ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) { -+ ff_hevc_rpi_unref_frame(s, s->ref, ~0); -+ return ret; -+ } -+ } -+ s->sei.picture_hash.is_md5 = 0; -+ -+ if (s->is_decoded) { -+ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc); -+ s->is_decoded = 0; -+ } -+ -+ if (s->output_frame->buf[0]) { -+ av_frame_move_ref(data, s->output_frame); -+ *got_output = 1; -+ } -+ -+ return avpkt->size; -+} -+ -+static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src) -+{ -+ int ret; -+ -+ ret = ff_thread_ref_frame(&dst->tf, &src->tf); -+ if (ret < 0) -+ return ret; -+ -+ if 
(src->col_mvf_buf != NULL) -+ { -+ dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf); -+ if (!dst->col_mvf_buf) -+ goto fail; -+ } -+ dst->col_mvf = src->col_mvf; -+ -+ dst->poc = src->poc; -+ dst->flags = src->flags; -+ dst->sequence = src->sequence; -+ return 0; -+ -+fail: -+ ff_hevc_rpi_unref_frame(s, dst, ~0); -+ return AVERROR(ENOMEM); -+} -+ -+ -+static av_cold int hevc_decode_free(AVCodecContext *avctx) -+{ -+ HEVCRpiContext * const s = avctx->priv_data; -+ int i; -+ -+ pic_arrays_free(s); -+ -+ av_freep(&s->md5_ctx); -+ -+ av_freep(&s->cabac_save); -+ -+#if RPI_EXTRA_BIT_THREADS -+ bit_threads_kill(s); -+#endif -+ -+ hevc_exit_worker(s); -+ for (i = 0; i != 2; ++i) { -+ ff_hevc_rpi_progress_kill_state(s->progress_states + i); -+ } -+ job_lc_kill(s->HEVClc); -+ -+ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] -+ av_freep(&s->sao_pixel_buffer_v[0]); -+ av_frame_free(&s->output_frame); -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); -+ av_frame_free(&s->DPB[i].frame); -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) -+ av_buffer_unref(&s->ps.vps_list[i]); -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) -+ av_buffer_unref(&s->ps.sps_list[i]); -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) -+ av_buffer_unref(&s->ps.pps_list[i]); -+ s->ps.sps = NULL; -+ s->ps.pps = NULL; -+ s->ps.vps = NULL; -+ -+ // Free separately from sLists as used that way by RPI WPP -+ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) { -+ av_freep(s->HEVClcList + i); -+ } -+ s->HEVClc = NULL; // Allocated as part of HEVClcList -+ -+ ff_h2645_packet_uninit(&s->pkt); -+ -+ if (s->qpu_init_ok) -+ vpu_qpu_term(); -+ s->qpu_init_ok = 0; -+ -+ return 0; -+} -+ -+ -+static av_cold int hevc_init_context(AVCodecContext *avctx) -+{ -+ HEVCRpiContext *s = avctx->priv_data; -+ int i; -+ -+ s->avctx = avctx; -+ -+ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext)); -+ if (!s->HEVClc) -+ goto fail; -+ s->HEVClcList[0] = s->HEVClc; -+ -+ // Whilst FFmpegs init fn is only called once the close fn is called as -+ // many times as we have threads (init_thread_copy is called for the -+ // threads). 
So to match init & term put the init here where it will be -+ // called by both init & copy -+ -+ if (vpu_qpu_init() != 0) -+ goto fail; -+ s->qpu_init_ok = 1; -+ -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ { -+ static const uint32_t dframe[1] = {0x80808080}; -+ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; -+ } -+#endif -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+ s->qpu_dummy_frame_qpu = qpu_dummy(); -+#endif -+ -+ bt_lc_init(s, s->HEVClc, 0); -+ job_lc_init(s->HEVClc); -+ -+ for (i = 0; i != 2; ++i) { -+ ff_hevc_rpi_progress_init_state(s->progress_states + i); -+ } -+ -+ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL) -+ goto fail; -+ -+ if ((s->output_frame = av_frame_alloc()) == NULL) -+ goto fail; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ s->DPB[i].frame = av_frame_alloc(); -+ if (!s->DPB[i].frame) -+ goto fail; -+ s->DPB[i].tf.f = s->DPB[i].frame; -+ s->DPB[i].dpb_no = i; -+ } -+ -+ s->max_ra = INT_MAX; -+ -+ if ((s->md5_ctx = av_md5_alloc()) == NULL) -+ goto fail; -+ -+ s->context_initialized = 1; -+ s->eos = 0; -+ -+ ff_hevc_rpi_reset_sei(&s->sei); -+ -+ return 0; -+ -+fail: -+ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__); -+ hevc_decode_free(avctx); -+ return AVERROR(ENOMEM); -+} -+ -+#if HAVE_THREADS -+static int hevc_update_thread_context(AVCodecContext *dst, -+ const AVCodecContext *src) -+{ -+ HEVCRpiContext *s = dst->priv_data; -+ HEVCRpiContext *s0 = src->priv_data; -+ int i, ret; -+ -+ if (!s->context_initialized) { -+ ret = hevc_init_context(dst); -+ if (ret < 0) -+ return ret; -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); -+ if (s0->DPB[i].frame->buf[0]) { -+ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]); -+ if (ret < 0) -+ return ret; -+ } -+ } -+ -+ if (s->ps.sps != s0->ps.sps) -+ s->ps.sps = NULL; -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) { -+ av_buffer_unref(&s->ps.vps_list[i]); -+ if (s0->ps.vps_list[i]) { -+ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]); -+ if (!s->ps.vps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { -+ av_buffer_unref(&s->ps.sps_list[i]); -+ if (s0->ps.sps_list[i]) { -+ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]); -+ if (!s->ps.sps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) { -+ av_buffer_unref(&s->ps.pps_list[i]); -+ if (s0->ps.pps_list[i]) { -+ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]); -+ if (!s->ps.pps_list[i]) -+ return AVERROR(ENOMEM); -+ } -+ } -+ -+ if (s->ps.sps != s0->ps.sps) -+ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0) -+ return ret; -+ -+ s->seq_decode = s0->seq_decode; -+ s->seq_output = s0->seq_output; -+ s->pocTid0 = s0->pocTid0; -+ s->max_ra = s0->max_ra; -+ s->eos = s0->eos; -+ s->no_rasl_output_flag = s0->no_rasl_output_flag; -+ -+ s->is_nalff = s0->is_nalff; -+ s->nal_length_size = s0->nal_length_size; -+ -+ s->threads_type = s0->threads_type; -+ -+ if (s0->eos) { -+ s->seq_decode = (s->seq_decode + 1) & 0xff; -+ s->max_ra = INT_MAX; -+ } -+ -+ s->sei.frame_packing = s0->sei.frame_packing; -+ s->sei.display_orientation = s0->sei.display_orientation; -+ s->sei.mastering_display = s0->sei.mastering_display; -+ s->sei.content_light = s0->sei.content_light; -+ s->sei.alternative_transfer = s0->sei.alternative_transfer; -+ -+ // * We do this here as it allows us to easily locate our parents -+ // global job pool, but there really should be a less nasty way 
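-+    // (The first frame thread creates the pool in hevc_decode_init; thread
-+    // copies arrive here with jbc == NULL and attach to the parent's pool.)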
-+    if (s->jbc == NULL)
-+    {
-+        av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
-+        hevc_init_worker(s);
-+    }
-+
-+    return 0;
-+}
-+#endif
-+
-+#include <sys/stat.h>
-+static int qpu_ok(void)
-+{
-+    static int is_pi3 = -1;
-+    if (is_pi3 == -1)
-+    {
-+        struct stat sb;
-+        is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0);
-+    }
-+    return is_pi3;
-+}
-+
-+static av_cold int hevc_decode_init(AVCodecContext *avctx)
-+{
-+    HEVCRpiContext *s = avctx->priv_data;
-+    int ret;
-+
-+    if (!qpu_ok())
-+        return -1;
-+
-+    avctx->internal->allocate_progress = 1;
-+
-+    if ((ret = hevc_init_context(avctx)) < 0)
-+        return ret;
-+
-+    // Job allocation requires VCSM alloc to work so ensure that we have it
-+    // initialised by this point
-+    {
-+        HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
-+        if (jbg == NULL)
-+        {
-+            av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
-+            return -1;
-+        }
-+
-+        if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL)
-+        {
-+            av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
-+            return -1;
-+        }
-+    }
-+
-+    hevc_init_worker(s);
-+
-+    s->sei.picture_timing.picture_struct = 0;
-+    s->eos = 1;
-+
-+    atomic_init(&s->wpp_err, 0);
-+
-+    if (avctx->extradata_size > 0 && avctx->extradata) {
-+        ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
-+
-+        if (ret == 0 && !all_sps_supported(s))
-+            ret = AVERROR_DECODER_NOT_FOUND;
-+
-+        if (ret < 0)
-+        {
-+            hevc_decode_free(avctx);
-+            return ret;
-+        }
-+    }
-+
-+    if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
-+        s->threads_type = FF_THREAD_FRAME;
-+    else
-+        s->threads_type = 0;
-+
-+    return 0;
-+}
-+
-+#if HAVE_THREADS
-+static av_cold int hevc_init_thread_copy(AVCodecContext *avctx)
-+{
-+    HEVCRpiContext *s = avctx->priv_data;
-+    int ret;
-+
-+    memset(s, 0, sizeof(*s));
-+
-+    ret = hevc_init_context(avctx);
-+    if (ret < 0)
-+        return ret;
-+
-+    return 0;
-+}
-+#endif
-+
-+static void hevc_decode_flush(AVCodecContext *avctx)
-+{
-+    HEVCRpiContext *s = avctx->priv_data;
-+    ff_hevc_rpi_flush_dpb(s);
-+    s->max_ra = INT_MAX;
-+    s->eos = 1;
-+}
-+
-+typedef struct hwaccel_rpi3_qpu_env_s {
-+    const AVClass *av_class;
-+    AVZcEnvPtr zc;
-+} hwaccel_rpi3_qpu_env_t;
-+
-+static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame)
-+{
-+    hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data;
-+    int rv;
-+
-+    if (av_rpi_zc_in_use(s))
-+    {
-+        rv = s->get_buffer2(s, frame, 0);
-+    }
-+    else
-+    {
-+        rv = av_rpi_zc_get_buffer(r3->zc, frame);
-+        if (rv == 0)
-+            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);  // actually do the alloc
-+    }
-+
-+    if (rv == 0 &&
-+        (rv = ff_attach_decode_data(frame)) < 0)
-+    {
-+        av_frame_unref(frame);
-+    }
-+
-+    return rv;
-+}
-+
-+static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx)
-+{
-+    hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
-+    av_rpi_zc_int_env_freep(&r3->zc);
-+    return 0;
-+}
-+
-+static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx)
-+{
-+    hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
-+
-+    if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL)
-+        goto fail;
-+
-+    return 0;
-+
-+fail:
-+    av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n");
-+    hwaccel_rpi3_qpu_free(avctx);
-+    return AVERROR(ENOMEM);
-+}
-+
-+
-+#define OFFSET(x) offsetof(HEVCRpiContext, x)
-+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
-+
-+
-+static const AVOption options[] = {
-+    { "apply_defdispwin", "Apply default display window
from VUI", OFFSET(apply_defdispwin), -+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, -+ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin), -+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, -+ { NULL }, -+}; -+ -+static const AVClass hevc_rpi_decoder_class = { -+ .class_name = "HEVC RPI decoder", -+ .item_name = av_default_item_name, -+ .option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+}; -+ -+static const enum AVPixelFormat hevc_rpi_pix_fmts[] = { -+ AV_PIX_FMT_SAND128, -+ AV_PIX_FMT_SAND64_10, -+ AV_PIX_FMT_NONE -+}; -+ -+ -+static const AVHWAccel hwaccel_rpi3_qpu = { -+ .name = "Pi3 QPU Hwaccel", -+ .alloc_frame = hwaccel_alloc_frame, -+ .init = hwaccel_rpi3_qpu_init, -+ .uninit = hwaccel_rpi3_qpu_free, -+ .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t), -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -+ -+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 = -+{ -+ .public = { -+ .pix_fmt = AV_PIX_FMT_SAND128, -+ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, -+ .device_type = AV_HWDEVICE_TYPE_NONE, -+ }, -+ .hwaccel = &hwaccel_rpi3_qpu -+}; -+static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 = -+{ -+ .public = { -+ .pix_fmt = AV_PIX_FMT_SAND64_10, -+ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, -+ .device_type = AV_HWDEVICE_TYPE_NONE, -+ }, -+ .hwaccel = &hwaccel_rpi3_qpu -+}; -+ -+ -+static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = { -+ &hevc_rpi_hw_config_sand128, -+ &hevc_rpi_hw_config_sand64_10, -+ NULL -+}; -+ -+ -+AVCodec ff_hevc_rpi_decoder = { -+ .name = "hevc_rpi", -+ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"), -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .priv_data_size = sizeof(HEVCRpiContext), -+ .priv_class = &hevc_rpi_decoder_class, -+ .init = hevc_decode_init, -+ .close = hevc_decode_free, -+ .decode = hevc_rpi_decode_frame, -+ .flush = hevc_decode_flush, -+ .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context), -+ .init_thread_copy = ONLY_IF_THREADS_ENABLED(hevc_init_thread_copy), -+ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | -+ AV_CODEC_CAP_HARDWARE | -+ AV_CODEC_CAP_AVOID_PROBING | -+#if 0 -+ // Debugging is often easier without threads getting in the way -+ 0, -+#warning H265 threading turned off -+#else -+ // We only have decent optimisation for frame - so only admit to that -+ AV_CODEC_CAP_FRAME_THREADS, -+#endif -+ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_EXPORTS_CROPPING, -+ .pix_fmts = hevc_rpi_pix_fmts, -+ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), -+ .hw_configs = hevc_rpi_hw_configs, -+// .wrapper_name = "hevc_rpi", -+}; -+ -diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h -new file mode 100644 -index 0000000000..5001a3853b ---- /dev/null -+++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,1093 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCDEC_H
-+#define AVCODEC_RPI_HEVCDEC_H
-+
-+#include "config.h"
-+
-+#include <stdatomic.h>
-+
-+#include "libavutil/buffer.h"
-+
-+#include "avcodec.h"
-+#include "bswapdsp.h"
-+#include "cabac.h"
-+#include "get_bits.h"
-+#include "rpi_hevcpred.h"
-+#include "h2645_parse.h"
-+#include "hevc.h"
-+#include "rpi_hevc_mv.h"
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+#include "rpi_hevcdsp.h"
-+#include "internal.h"
-+#include "thread.h"
-+#include "videodsp.h"
-+
-+#if ARCH_ARM
-+#include "arm/rpi_hevc_misc_neon.h"
-+#endif
-+
-+#define MAX_NB_THREADS 16
-+#define SHIFT_CTB_WPP 2
-+
-+//TODO: check if this is really the maximum
-+#define MAX_TRANSFORM_DEPTH 5
-+
-+#define MAX_TB_SIZE 32
-+#define MAX_QP 51
-+#define DEFAULT_INTRA_TC_OFFSET 2
-+
-+#define HEVC_CONTEXTS 199
-+
-+#define MRG_MAX_NUM_CANDS 5
-+
-+#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64
-+
-+// Size of DPB array
-+#define HEVC_DPB_ELS 32
-+
-+#define L0 0
-+#define L1 1
-+
-+#define EPEL_EXTRA_BEFORE 1
-+#define EPEL_EXTRA_AFTER 2
-+#define EPEL_EXTRA 3
-+#define QPEL_EXTRA_BEFORE 3
-+#define QPEL_EXTRA_AFTER 4
-+#define QPEL_EXTRA 7
-+
-+#define EDGE_EMU_BUFFER_STRIDE 80
-+
-+#include <semaphore.h>
-+#include "rpi_qpu.h"
-+
-+// Max jobs per frame thread. Actual usage will be limited by the size
-+// of the global job pool
-+// ?? Limits
-+#define RPI_MAX_JOBS 8
-+
-+// This is the number of _extra_ bit threads - we will have
-+// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
-+//
-+// 0 is legitimate and will disable our WPP processing
-+//#define RPI_EXTRA_BIT_THREADS 0
-+#define RPI_EXTRA_BIT_THREADS 2
-+
-+// Number of separate threads/passes in worker
-+// 2 and 3 are the currently valid numbers
-+// At the moment 3 seems fractionally faster
-+//#define RPI_PASSES 2
-+#define RPI_PASSES 3
-+
-+// Print out various usage stats
-+#define RPI_TSTATS 0
-+
-+// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
-+#define RPI_COMPRESS_COEFFS 1
-+
-+// Wait for VPU/QPU to finish in worker pass 0
-+// If 0 then the wait is in pass 1
-+//
-+// One might expect the better place to wait would be in pass 1; however,
-+// testing shows that pass 0 produces an overall faster decode.
-+// Interestingly it is QPU/VPU-limited streams that seem to suffer
-+// from pass 1 waits; CPU-limited ones tend to show a very mild gain.
-+// This define exists so it is easy to test this.
-+#define RPI_WORKER_WAIT_PASS_0 1
-+
-+// Use ARM emulation of QPU pred
-+// These are for debug only as the emulation makes only limited
-+// effort to be fast
-+#define RPI_QPU_EMU_Y 0
-+#define RPI_QPU_EMU_C 0
-+
-+// Max width & height we are prepared to consider
-+// Sand frame shape calc becomes confused with large frames
-+// Some buffer alloc also depends on this
-+#define HEVC_RPI_MAX_WIDTH 2048
-+#define HEVC_RPI_MAX_HEIGHT 1088
-+
-+
-+// Min CTB size is 16
-+#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16)
-+
-+/**
-+ * Value of the luma sample at position (x, y) in the 2D array tab.
-+ */ -+#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)]) -+#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)]) -+ -+#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP) -+#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \ -+ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP) -+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23) -+ -+enum RPSType { -+ ST_CURR_BEF = 0, -+ ST_CURR_AFT, -+ ST_FOLL, -+ LT_CURR, -+ LT_FOLL, -+ NB_RPS_TYPE, -+}; -+ -+enum SyntaxElement { -+ SAO_MERGE_FLAG = 0, -+ SAO_TYPE_IDX, -+ SAO_EO_CLASS, -+ SAO_BAND_POSITION, -+ SAO_OFFSET_ABS, -+ SAO_OFFSET_SIGN, -+ END_OF_SLICE_FLAG, -+ SPLIT_CODING_UNIT_FLAG, -+ CU_TRANSQUANT_BYPASS_FLAG, -+ SKIP_FLAG, -+ CU_QP_DELTA, -+ PRED_MODE_FLAG, -+ PART_MODE, -+ PCM_FLAG, -+ PREV_INTRA_LUMA_PRED_FLAG, -+ MPM_IDX, -+ REM_INTRA_LUMA_PRED_MODE, -+ INTRA_CHROMA_PRED_MODE, -+ MERGE_FLAG, -+ MERGE_IDX, -+ INTER_PRED_IDC, -+ REF_IDX_L0, -+ REF_IDX_L1, -+ ABS_MVD_GREATER0_FLAG, -+ ABS_MVD_GREATER1_FLAG, -+ ABS_MVD_MINUS2, -+ MVD_SIGN_FLAG, -+ MVP_LX_FLAG, -+ NO_RESIDUAL_DATA_FLAG, -+ SPLIT_TRANSFORM_FLAG, -+ CBF_LUMA, -+ CBF_CB_CR, -+ TRANSFORM_SKIP_FLAG, -+ EXPLICIT_RDPCM_FLAG, -+ EXPLICIT_RDPCM_DIR_FLAG, -+ LAST_SIGNIFICANT_COEFF_X_PREFIX, -+ LAST_SIGNIFICANT_COEFF_Y_PREFIX, -+ LAST_SIGNIFICANT_COEFF_X_SUFFIX, -+ LAST_SIGNIFICANT_COEFF_Y_SUFFIX, -+ SIGNIFICANT_COEFF_GROUP_FLAG, -+ SIGNIFICANT_COEFF_FLAG, -+ COEFF_ABS_LEVEL_GREATER1_FLAG, -+ COEFF_ABS_LEVEL_GREATER2_FLAG, -+ COEFF_ABS_LEVEL_REMAINING, -+ COEFF_SIGN_FLAG, -+ LOG2_RES_SCALE_ABS, -+ RES_SCALE_SIGN_FLAG, -+ CU_CHROMA_QP_OFFSET_FLAG, -+ CU_CHROMA_QP_OFFSET_IDX, -+}; -+ -+enum PartMode { -+ PART_2Nx2N = 0, -+ PART_2NxN = 1, -+ PART_Nx2N = 2, -+ PART_NxN = 3, -+ PART_2NxnU = 4, -+ PART_2NxnD = 5, -+ PART_nLx2N = 6, -+ PART_nRx2N = 7, -+}; -+ -+enum PredMode { -+ MODE_INTER = 0, -+ MODE_INTRA, -+ MODE_SKIP, -+}; -+ -+enum InterPredIdc { -+ PRED_L0 = 0, -+ PRED_L1, -+ PRED_BI, -+}; -+ -+enum PredFlag { -+ PF_INTRA = 0, -+ PF_L0, -+ PF_L1, -+ PF_BI, -+}; -+ -+enum SAOType { -+ SAO_NOT_APPLIED = 0, -+ SAO_BAND, -+ SAO_EDGE, -+ SAO_APPLIED -+}; -+ -+enum SAOEOClass { -+ SAO_EO_HORIZ = 0, -+ SAO_EO_VERT, -+ SAO_EO_135D, -+ SAO_EO_45D, -+}; -+ -+enum ScanType { -+ SCAN_DIAG = 0, -+ SCAN_HORIZ, -+ SCAN_VERT, -+}; -+ -+typedef struct RefPicList { -+ struct HEVCRpiFrame *ref[HEVC_MAX_REFS]; -+ int list[HEVC_MAX_REFS]; -+ uint8_t isLongTerm[HEVC_MAX_REFS]; -+ int nb_refs; -+} RefPicList; -+ -+typedef struct RefPicListTab { -+ RefPicList refPicList[2]; -+} RefPicListTab; -+ -+typedef struct RpiCodingUnit { -+ unsigned int x; // Passed to deblock -+ unsigned int y; -+ unsigned int x_split; -+ unsigned int y_split; -+ -+ enum PredMode pred_mode; ///< PredMode -+ enum PartMode part_mode; ///< PartMode -+ -+ // Inferred parameters -+ uint8_t intra_split_flag; ///< IntraSplitFlag -+ uint8_t max_trafo_depth; ///< MaxTrafoDepth -+ uint8_t cu_transquant_bypass_flag; -+} RpiCodingUnit; -+ -+typedef struct RpiPredictionUnit { -+ uint8_t intra_pred_mode[4]; -+ uint8_t intra_pred_mode_c[4]; -+ uint8_t chroma_mode_c[4]; -+ uint8_t merge_flag; -+} RpiPredictionUnit; -+ -+typedef struct HEVCRpiTransformUnit { -+ int8_t cu_qp_delta; -+ -+ // Inferred parameters; -+ uint8_t intra_pred_mode; -+ uint8_t intra_pred_mode_c; -+ uint8_t chroma_mode_c; -+ uint8_t is_cu_qp_delta_wanted; -+ uint8_t cu_chroma_qp_offset_wanted; -+ const int8_t * qp_divmod6[3]; -+} 
HEVCRpiTransformUnit; -+ -+typedef struct DBParams { -+ int8_t beta_offset; // -12 to +12 -+ int8_t tc_offset; // -12 to +12 -+} DBParams; -+ -+#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) -+#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1) -+#define HEVC_FRAME_FLAG_LONG_REF (1 << 2) -+#define HEVC_FRAME_FLAG_BUMPING (1 << 3) -+ -+struct HEVCRpiJob; -+ -+typedef struct HEVCRpiFrame { -+ AVFrame *frame; -+ ThreadFrame tf; -+ ColMvField *col_mvf; -+ int poc; -+ struct HEVCRpiFrame *collocated_ref; -+ -+ AVBufferRef *col_mvf_buf; -+ -+ /** -+ * A sequence counter, so that old frames are output first -+ * after a POC reset -+ */ -+ uint16_t sequence; -+ -+ /** -+ * A combination of HEVC_FRAME_FLAG_* -+ */ -+ uint8_t flags; -+ -+ // Entry no in DPB - can be used as a small unique -+ // frame identifier (within the current thread) -+ uint8_t dpb_no; -+} HEVCRpiFrame; -+ -+typedef struct HEVCRpiLocalContext { -+ HEVCRpiTransformUnit tu; -+ -+ CABACContext cc; -+ -+ // Vars that allow us to locate everything from just an lc -+ struct HEVCRpiContext * context; // ??? make const ??? -+ unsigned int lc_n; // lc list el no -+ -+ // Job wait links -+ struct HEVCRpiLocalContext * jw_next; -+ struct HEVCRpiLocalContext * jw_prev; -+ struct HEVCRpiLocalContext * ljw_next; -+ struct HEVCRpiLocalContext * ljw_prev; -+ struct HEVCRpiJob * volatile jw_job; -+ sem_t jw_sem; -+ -+ // ?? Wrap in structure ?? -+ sem_t bt_sem_in; -+ sem_t * bt_psem_out; -+ volatile int bt_terminate; -+ unsigned int ts; -+ unsigned int bt_last_line; // Last line in this bit_thread chunk -+ unsigned int bt_line_no; -+ unsigned int bt_line_width; -+ unsigned int bt_line_inc; -+ -+ struct HEVCRpiJob * jb0; -+ char unit_done; // Set once we have dealt with this slice -+ char bt_is_tile; -+ char last_progress_good; -+ char cabac_init_req; -+ -+ uint8_t cabac_state[HEVC_CONTEXTS]; -+ uint8_t stat_coeff[4]; -+ GetBitContext gb; -+ -+ uint8_t ct_depth; -+ int8_t qp_y; -+ int8_t curr_qp_y; -+ int8_t qPy_pred; -+ -+// N.B. 
Used by asm (neon) - do not change -+#define AVAIL_S_UR 0 -+#define AVAIL_S_U 1 -+#define AVAIL_S_UL 2 -+#define AVAIL_S_L 3 -+#define AVAIL_S_DL 4 -+ -+#define AVAIL_U (1 << AVAIL_S_U) -+#define AVAIL_L (1 << AVAIL_S_L) -+#define AVAIL_UL (1 << AVAIL_S_UL) -+#define AVAIL_UR (1 << AVAIL_S_UR) -+#define AVAIL_DL (1 << AVAIL_S_DL) -+ -+// Intra filters - same number space as avail -+#define FILTER_LIGHT 0x40 -+#define FILTER_STRONG 0x80 -+#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) -+ -+ uint8_t ctb_avail; -+ int end_of_ctb_x; -+ int end_of_ctb_y; -+ -+ RpiCodingUnit cu; -+ RpiPredictionUnit pu; -+ -+#define BOUNDARY_LEFT_SLICE (1 << 0) -+#define BOUNDARY_LEFT_TILE (1 << 1) -+#define BOUNDARY_UPPER_SLICE (1 << 2) -+#define BOUNDARY_UPPER_TILE (1 << 3) -+ /* properties of the boundary of the current CTB for the purposes -+ * of the deblocking filter */ -+ unsigned int boundary_flags; -+ -+#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE) -+ uint8_t ipm_left[IPM_TAB_SIZE]; -+ uint8_t ipm_up[IPM_TAB_SIZE]; -+ -+//#define MVF_STASH_WIDTH 128 -+#define MVF_STASH_WIDTH 64 -+#define MVF_STASH_HEIGHT 64 -+#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE) -+#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE) -+ HEVCRpiMvField mvf_ul[1]; -+ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; -+ -+ /* +7 is for subpixel interpolation, *2 for high bit depths */ -+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; -+ /* The extended size between the new edge emu buffer is abused by SAO */ -+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; -+// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); -+ -+} HEVCRpiLocalContext; -+ -+// Each block can have an intra prediction and an add_residual command -+// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH -+ -+// Sand only has 2 planes (Y/C) -+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4)) -+ -+// Command for intra prediction and transform_add of predictions to coefficients -+enum rpi_pred_cmd_e -+{ -+ RPI_PRED_ADD_RESIDUAL, -+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx -+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx -+ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V -+ RPI_PRED_ADD_DC, -+ RPI_PRED_ADD_DC_U, // Both U & V are effectively C -+ RPI_PRED_ADD_DC_V, -+ RPI_PRED_INTRA, -+ RPI_PRED_INTRA_C, -+ RPI_PRED_I_PCM, -+ RPI_PRED_CMD_MAX -+}; -+ -+typedef struct HEVCPredCmd { -+ uint8_t type; -+ uint8_t size; // log2 "size" used by all variants -+ uint8_t avail; // i_pred - but left here as they pack well -+ uint8_t dummy; -+ union { -+ struct { // TRANSFORM_ADD -+ uint8_t * dst; -+ const int16_t * buf; -+ uint16_t stride; // Should be good enough for all pic fmts we use -+ int16_t dc; -+ } ta; -+ struct { -+ uint8_t * dst; -+ uint32_t stride; -+ int dc; -+ } dc; -+ struct { // INTRA -+ uint16_t x; -+ uint16_t y; -+ enum IntraPredMode mode; -+ } i_pred; -+ struct { // I_PCM -+ uint16_t x; -+ uint16_t y; -+ const void * src; -+ uint32_t src_len; -+ } i_pcm; -+ }; -+} HEVCPredCmd; -+ -+union qpu_mc_pred_cmd_s; -+struct qpu_mc_pred_y_p_s; -+struct qpu_mc_src_s; -+ -+typedef struct HEVCRpiInterPredQ -+{ -+ union qpu_mc_pred_cmd_u *qpu_mc_base; -+ union qpu_mc_pred_cmd_u *qpu_mc_curr; -+ struct qpu_mc_src_s *last_l0; -+ struct qpu_mc_src_s *last_l1; -+ unsigned int load; -+ uint32_t code_setup; -+ uint32_t 
code_sync; -+ uint32_t code_exit; -+} HEVCRpiInterPredQ; -+ -+typedef struct HEVCRpiInterPredEnv -+{ -+ HEVCRpiInterPredQ * q; -+ uint8_t n; // Number of Qs -+ uint8_t n_grp; // Number of Q in a group -+ uint8_t curr; // Current Q number (0..n-1) -+ uint8_t used; // 0 if nothing in any Q, 1 otherwise -+ uint8_t used_grp; // 0 if nothing in any Q in the current group -+ unsigned int max_fill; -+ unsigned int min_gap; -+ GPU_MEM_PTR_T gptr; -+} HEVCRpiInterPredEnv; -+ -+typedef struct HEVCRpiIntraPredEnv { -+ unsigned int n; // Number of commands -+ HEVCPredCmd * cmds; -+} HEVCRpiIntraPredEnv; -+ -+typedef struct HEVCRpiCoeffEnv { -+ unsigned int n; -+#if RPI_COMPRESS_COEFFS -+ unsigned int packed; // Equal to 1 if coefficients should be being packed -+ unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0 -+#endif -+ int16_t * buf; -+} HEVCRpiCoeffEnv; -+ -+typedef struct HEVCRpiCoeffsEnv { -+ HEVCRpiCoeffEnv s[4]; -+ GPU_MEM_PTR_T gptr; -+ void * mptr; -+} HEVCRpiCoeffsEnv; -+ -+typedef struct HEVCRpiFrameProgressWait { -+ int req; -+ struct HEVCRpiFrameProgressWait * next; -+ sem_t sem; -+} HEVCRpiFrameProgressWait; -+ -+typedef struct HEVCRpiFrameProgressState { -+ struct HEVCRpiFrameProgressWait * first; -+ struct HEVCRpiFrameProgressWait * last; -+ pthread_mutex_t lock; -+} HEVCRpiFrameProgressState; -+ -+typedef struct RpiBlk -+{ -+ unsigned int x; -+ unsigned int y; -+ unsigned int w; -+ unsigned int h; -+} RpiBlk; -+ -+typedef struct HEVCRpiJob { -+ struct HEVCRpiJob * next; // Free chain -+ struct HEVCRpiJobCtl * jbc_local; -+ const HEVCRpiSPS * sps; // sps used to set up this job -+ -+ int waited; -+ int ctu_ts_first; -+ int ctu_ts_last; -+ RpiBlk bounds; // Bounding box of job -+ -+ struct qpu_mc_pred_y_p_s * last_y8_p; -+ struct qpu_mc_src_s * last_y8_l1; -+ rpi_cache_flush_env_t * rfe; -+ -+ HEVCRpiInterPredEnv chroma_ip; -+ HEVCRpiInterPredEnv luma_ip; -+ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no -+ HEVCRpiIntraPredEnv intra; -+ HEVCRpiCoeffsEnv coeffs; -+ HEVCRpiFrameProgressWait progress_wait; -+ sem_t sem; -+ rpi_cache_buf_t flush_buf; -+} HEVCRpiJob; -+ -+struct HEVCRpiContext; -+ -+typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb); -+ -+typedef struct HEVCRpiPassQueue -+{ -+// int pending; -+ volatile int terminate; -+ sem_t sem_in; -+ sem_t * psem_out; -+ unsigned int job_n; -+ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread -+ HEVCRpiWorkerFn * worker; -+ pthread_t thread; -+ uint8_t pass_n; // Pass number - debug -+ uint8_t started; -+} HEVCRpiPassQueue; -+ -+ -+struct HEVCRpiJobGlobal; -+ -+typedef struct HEVCRpiJobCtl -+{ -+ sem_t sem_out; -+ -+ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated -+ struct HEVCRpiJobGlobal * jbg; -+ -+ HEVCRpiLocalContext * lcw_head; -+ HEVCRpiLocalContext * lcw_tail; -+ -+ pthread_mutex_t in_lock; -+ int offload_in; -+ -+ HEVCRpiJob *offloadq[RPI_MAX_JOBS]; -+} HEVCRpiJobCtl; -+ -+ -+typedef struct HEVCRpiJobGlobal -+{ -+ intptr_t ref_count; -+ pthread_mutex_t lock; -+ HEVCRpiJob * free1; // Singly linked list of free jobs -+ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job -+ HEVCRpiLocalContext * wait_good; // Last good tail -+ HEVCRpiLocalContext * wait_tail; -+ -+} HEVCRpiJobGlobal; -+ -+#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) -+ -+#if RPI_TSTATS 
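// Gloss (inferred from the field names; the patch itself does not document them):
// the y_pred1_* counters appear to tally unidirectional luma prediction calls
// by shape -- *_xy / *_x0 / *_y0 / *_x0y0 by which fractional-MV components are
// zero, *_wle8 / *_wgt8 by width <= 8 or > 8, *_hle16 / *_hgt16 by height <= 16
// or > 16 -- with y_pred2_* doing the same for bi-pred, and y_pred1_y8_merge
// counting adjacent 8-wide predictions merged into a single QPU command.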
-+typedef struct HEVCRpiStats { -+ int y_pred1_y8_merge; -+ int y_pred1_xy; -+ int y_pred1_x0; -+ int y_pred1_y0; -+ int y_pred1_x0y0; -+ int y_pred1_wle8; -+ int y_pred1_wgt8; -+ int y_pred1_hle16; -+ int y_pred1_hgt16; -+ int y_pred2_xy; -+ int y_pred2_x0; -+ int y_pred2_y0; -+ int y_pred2_x0y0; -+ int y_pred2_hle16; -+ int y_pred2_hgt16; -+} HEVCRpiStats; -+#endif -+ -+typedef struct HEVCRpiCabacState -+{ -+ uint8_t rice[4]; -+ uint8_t state[HEVC_CONTEXTS]; -+} HEVCRpiCabacState; -+ -+#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels -+#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) -+#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte -+#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el -+#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) -+#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row -+#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) -+ -+typedef struct HEVCRpiContext { -+ const AVClass *c; // needed by private avoptions -+ AVCodecContext *avctx; -+ -+ uint8_t threads_type; -+ char qpu_init_ok; -+ -+ /** 1 if the independent slice segment header was successfully parsed */ -+ uint8_t slice_initialized; -+ char used_for_ref; // rpi -+ char is_irap; -+ char offload_recon; -+ uint8_t eos; ///< current packet contains an EOS/EOB NAL -+ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL -+ uint8_t no_backward_pred_flag; -+ uint8_t is_decoded; -+ uint8_t no_rasl_output_flag; -+ -+ -+ /** -+ * Sequence counters for decoded and output frames, so that old -+ * frames are output first after a POC reset -+ */ -+ uint16_t seq_decode; -+ uint16_t seq_output; -+ -+ int width; -+ int height; -+ -+ HEVCRpiJobCtl * jbc; -+ // cabac stash -+ // b0 skip flag -+ // b1+ ct_depth -+ uint8_t * cabac_stash_left; -+ uint8_t * cabac_stash_up; -+ -+ // Function pointers -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ const uint8_t * qpu_dummy_frame_emu; -+#endif -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory -+#endif -+ HEVCRpiQpu qpu; -+ -+ HEVCRpiFrameProgressState progress_states[2]; -+ -+ HEVCRpiCabacState *cabac_save; -+ -+ AVFrame *frame; -+ AVFrame *output_frame; -+ uint8_t *sao_pixel_buffer_h[3]; -+ uint8_t *sao_pixel_buffer_v[3]; -+ -+ unsigned int col_mvf_stride; -+ AVBufferPool *col_mvf_pool; -+ -+ RpiSAOParams *sao; -+ DBParams *deblock; -+ enum HEVCNALUnitType nal_unit_type; -+ int temporal_id; ///< temporal_id_plus1 - 1 -+ HEVCRpiFrame *ref; -+ int poc; -+ int pocTid0; -+ int slice_idx; ///< number of the slice being currently decoded -+ int max_ra; -+ -+ int8_t *qp_y_tab; -+ -+ // Deblocking block strength bitmaps -+ unsigned int bs_stride2; -+ unsigned int bs_size; -+ uint8_t *bs_horizontal; -+ uint8_t *bs_vertical; -+ uint8_t *bsf_stash_up; -+ uint8_t *bsf_stash_left; -+ -+#if HEVC_RPI_MAX_CTBS >= 0xffff -+#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0 -+ uint32_t *tab_slice_address; -+#else -+#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0 -+ uint16_t *tab_slice_address; -+#endif -+ -+ // Bitfield 1 bit per 8 pels (min pcm size) -+ uint8_t *is_pcm; -+ // Bitfield 1 bit per 8 pels (min cb size) -+ // Only needed for CIP as CIP processing is async to the main 
thread -+ uint8_t *is_intra; -+ -+ // PU -+ HEVCRpiMvField *mvf_up; -+ HEVCRpiMvField *mvf_left; -+ -+ const RefPicList **rpl_up; -+ const RefPicList **rpl_left; -+ RefPicList * refPicList; -+ -+ // CTB-level flags affecting loop filter operation -+ uint8_t *filter_slice_edges; -+ -+ /** used on BE to byteswap the lines for checksumming */ -+ uint8_t *checksum_buf; -+ int checksum_buf_size; -+ -+ atomic_int wpp_err; -+ -+ const uint8_t *data; -+ -+ H2645Packet pkt; -+ // type of the first VCL NAL of the current frame -+ enum HEVCNALUnitType first_nal_type; -+ -+ uint8_t context_initialized; -+ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated -+ ///< as a format defined in 14496-15 -+ int apply_defdispwin; -+ -+ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) -+ int nuh_layer_id; -+ -+ struct AVMD5 *md5_ctx; -+ -+ RefPicListTab * rpl_tab; -+ unsigned int rpl_tab_size; -+ -+ uint8_t *is_intra_store; -+ -+ RpiSliceHeader sh; -+ -+ HEVCRpiParamSets ps; -+ -+ HEVCRpiLocalContext *HEVClc; -+ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; -+ -+ HEVCRpiFrame DPB[HEVC_DPB_ELS]; -+ -+ ///< candidate references for the current frame -+ RefPicList rps[5]; -+ -+ HEVCRpiPredContext hpc; -+ HEVCDSPContext hevcdsp; -+ -+ HEVCSEIContext sei; -+ -+ // Put structures that allocate non-trivial storage at the end -+ // These are mostly used indirectly so position in the structure doesn't matter -+ HEVCRpiPassQueue passq[RPI_PASSES]; -+#if RPI_EXTRA_BIT_THREADS > 0 -+ int bt_started; -+ // This simply contains thread descriptors - task setup is held elsewhere -+ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; -+#endif -+#if RPI_TSTATS -+ HEVCRpiStats tstats; -+#endif -+} HEVCRpiContext; -+ -+/** -+ * Mark all frames in DPB as unused for reference. -+ */ -+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s); -+ -+/** -+ * Drop all frames currently in DPB. -+ */ -+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); -+ -+/** -+ * Construct the reference picture sets for the current frame. -+ */ -+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s); -+ -+/** -+ * Construct the reference picture list(s) for the current slice. -+ */ -+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); -+ -+ -+/** -+ * Get the number of candidate references for the current frame. -+ */ -+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s); -+ -+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); -+ -+/** -+ * Find next frame in output order and put a reference to it in frame. 
-+ * @return 1 if a frame was output, 0 otherwise -+ */ -+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush); -+ -+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); -+ -+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); -+ -+unsigned int ff_hevc_rpi_tb_avail_flags( -+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); -+ -+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, HEVCRpiMvField * const mv); -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int nPbW, const unsigned int nPbH, -+ const unsigned int avail, -+ HEVCRpiMvField * const mv, -+ const unsigned int mvp_lx_flag, const unsigned int LX); -+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); -+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int log2_trafo_size, const int is_coded_block); -+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot); -+ -+extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; -+extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; -+extern const uint8_t ff_hevc_rpi_qpel_extra[4]; -+ -+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); -+ -+// arm/hevc_misc_neon.S -+// Neon coeff zap fn -+#if HAVE_NEON -+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); -+#endif -+ -+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int val, const int field); -+ -+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); -+ -+// All of these expect that s->threads_type == FF_THREAD_FRAME -+ -+static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int y) -+{ -+ if (s->threads_type != 0) -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); -+} -+ -+static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) -+{ -+ if (s->used_for_ref && s->threads_type != 0) -+ ff_hevc_rpi_progress_signal_field(s, y, 1); -+} -+ -+static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCRpiFrame * const ref, const int y) -+{ -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); -+} -+ -+static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) -+{ -+ if (s->used_for_ref && s->threads_type != 0) -+ { -+ ff_hevc_rpi_progress_signal_field(s, y, 0); -+ } -+} -+ -+static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) -+{ -+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); -+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); -+} -+ -+ -+// Set all done - signal nothing (used in missing refs) -+// Works for both rpi & non-rpi -+static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) -+{ -+ if (ref->tf.progress != NULL) -+ { -+ int * const p = (int 
*)ref->tf.progress->data; -+ p[0] = INT_MAX; -+ p[1] = INT_MAX; -+ } -+} -+ -+#define HEVC_RPI_420_ONLY 1 -+#define HEVC_RPI_SAND128_ONLY 1 -+ -+static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) -+{ -+#if HEVC_RPI_420_ONLY -+ return cidx == 0 ? 0 : 1; -+#else -+ return s->ps.sps->hshift[cidx]; -+#endif -+} -+ -+static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) -+{ -+#if HEVC_RPI_420_ONLY -+ return cidx == 0 ? 0 : 1; -+#else -+ return s->ps.sps->vshift[cidx]; -+#endif -+} -+ -+static inline int ctx_cfmt(const HEVCRpiContext * const s) -+{ -+#if HEVC_RPI_420_ONLY -+ return 1; -+#else -+ return s->ps.sps->chroma_format_idc; -+#endif -+} -+ -+static inline int frame_stride1(const AVFrame * const frame, const int c_idx) -+{ -+#if HEVC_RPI_SAND128_ONLY -+ return 128; -+#else -+ return frame->linesize[c_idx]; -+#endif -+} -+ -+#if HEVC_RPI_SAND128_ONLY -+// Propagate this decision to later zc includes -+#define RPI_ZC_SAND128_ONLY 1 -+#endif -+ -+#ifndef ff_hevc_rpi_copy_vert -+static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ int i; -+ switch (pixel_shift) -+ { -+ case 2: -+ for (i = 0; i < height; i++) { -+ *(uint32_t *)dst = *(uint32_t *)src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ case 1: -+ for (i = 0; i < height; i++) { -+ *(uint16_t *)dst = *(uint16_t *)src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ default: -+ for (i = 0; i < height; i++) { -+ *dst = *src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ } -+} -+#endif -+ -+ -+#if MVF_STASH_WIDTH == 64 -+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)); -+} -+ -+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ const unsigned int y0_ctb = y0 & mask_cs_hi; -+ -+ return (HEVCRpiMvField *)((y < y0_ctb) ? -+ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) : -+ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) : -+ lc->mvf_stash + -+ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + -+ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE))); -+} -+ -+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, -+ const unsigned int x0, -+ const unsigned int x) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ return x < x0_ctb ? 
1 : MVF_STASH_WIDTH_PU; -+} -+ -+#else -+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))); -+} -+ -+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, -+ const unsigned int x0, const unsigned int y0, -+ const unsigned int x, const unsigned int y) -+{ -+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); -+ -+ const unsigned int x0_ctb = x0 & mask_cs_hi; -+ const unsigned int y0_ctb = y0 & mask_cs_hi; -+ -+ // If not in the same CTB for Y assume up -+ if (y < y0_ctb) { -+ // If not in the same CTB for X too assume up-left -+ return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)); -+ } -+ return mvf_stash_ptr(s, lc, x, y); -+} -+ -+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, -+ const unsigned int x0, -+ const unsigned int x) -+{ -+ return MVF_STASH_WIDTH_PU; -+} -+#endif -+ -+#endif /* AVCODEC_RPI_HEVCDEC_H */ -diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c -new file mode 100644 -index 0000000000..87f3cc9d14 ---- /dev/null -+++ b/libavcodec/rpi_hevcdsp.c -@@ -0,0 +1,450 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere -+ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcdsp.h" -+#include "rpi_hevc_mv.h" -+ -+static const int8_t transform[32][32] = { -+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, -+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, -+ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, -+ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, -+ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, -+ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, -+ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, -+ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, -+ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, -+ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, -+ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -+ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, -+ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -+ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, -+ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, -+ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, -+ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, -+ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, -+ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, -+ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, -+ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -+ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, -+ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, -+ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, -+ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, -+ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, -+ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -+ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, -+ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -+ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, -+ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, -+ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, -+ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, -+ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, -+ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -+ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, -+ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, -+ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, -+ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, -+ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, -+ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, -+ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, -+ { 46, -90, 38, 54, -90, 31, 61, -88, 
22, 67, -85, 13, 73, -82, 4, 78, -+ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, -+ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -+ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, -+ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, -+ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, -+ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, -+ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, -+ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -+ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, -+ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -+ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, -+ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, -+ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, -+ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, -+ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, -+ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, -+ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, -+ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -+ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, -+ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, -+ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }, -+}; -+ -+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { -+ { -2, 58, 10, -2}, -+ { -4, 54, 16, -2}, -+ { -6, 46, 28, -4}, -+ { -4, 36, 36, -4}, -+ { -4, 28, 46, -6}, -+ { -2, 16, 54, -4}, -+ { -2, 10, 58, -2}, -+}; -+ -+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { -+ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0}, -+ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1}, -+ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1} -+}; -+ -+#define BIT_DEPTH 8 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcdsp_template.c" -+#undef BIT_DEPTH -+ -+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, -+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc0, int in_inc1) -+{ -+ int shift = 32; -+ uint32_t bs = 0; -+ for (; pus > 0; pus--) { -+ int strength, out; -+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; -+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; -+ int nr_idx0 = neigh->ref_idx[0]; -+ int nr_idx1 = neigh->ref_idx[1]; -+ int neigh_refL0 = neigh_rpl0[nr_idx0]; -+ int neigh_refL1 = neigh_rpl1[nr_idx1]; -+ -+ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31); -+ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31); -+ -+#if 1 // This more directly matches the original implementation -+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { -+ // same L0 and L1 -+ if (curr_refL0 == neigh_refL0 && -+ curr_refL0 == curr_refL1 && -+ neigh_refL0 == neigh_refL1) { -+ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || -+ 
FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) && -+ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)) -+ strength = 1; -+ else -+ strength = 0; -+ } else if (neigh_refL0 == curr_refL0 && -+ neigh_refL1 == curr_refL1) { -+ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else if (neigh_refL1 == curr_refL0 && -+ neigh_refL0 == curr_refL1) { -+ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || -+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else { -+ strength = 1; -+ } -+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV -+ MvXY curr_mv0, neigh_mv0; -+ -+ if (curr->pred_flag & 1) { -+ curr_mv0 = curr->xy[0]; -+ } else { -+ curr_mv0 = curr->xy[1]; -+ curr_refL0 = curr_refL1; -+ } -+ -+ if (neigh->pred_flag & 1) { -+ neigh_mv0 = neigh->xy[0]; -+ } else { -+ neigh_mv0 = neigh->xy[1]; -+ neigh_refL0 = neigh_refL1; -+ } -+ -+ if (curr_refL0 == neigh_refL0) { -+ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else -+ strength = 1; -+ } else -+ strength = 1; -+#else // This has exactly the same effect, but is more suitable for vectorisation -+ MvXY curr_mv[2]; -+ MvXY neigh_mv[2]; -+ memcpy(curr_mv, curr->xy, sizeof curr_mv); -+ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv); -+ -+ if (!(curr->pred_flag & 2)) { -+ curr_mv[1] = curr_mv[0]; -+ curr_refL1 = curr_refL0; -+ } -+ if (!(neigh->pred_flag & 2)) { -+ neigh_mv[1] = neigh_mv[0]; -+ neigh_refL1 = neigh_refL0; -+ } -+ if (!(curr->pred_flag & 1)) { -+ curr_mv[0] = curr_mv[1]; -+ curr_refL0 = curr_refL1; -+ } -+ if (!(neigh->pred_flag & 1)) { -+ neigh_mv[0] = neigh_mv[1]; -+ neigh_refL0 = neigh_refL1; -+ } -+ -+ strength = 1; -+ -+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | -+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) | -+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4); -+ -+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | -+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) | -+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4); -+ -+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); -+#endif -+ -+ curr += in_inc0 / sizeof (HEVCRpiMvField); -+ neigh += in_inc1 / sizeof (HEVCRpiMvField); -+ -+ for (out = dup; out > 0; out--) -+ { -+ bs = (bs >> 2) | (strength << 30); -+ shift -= 2; -+ } -+ } -+ return bs >> shift; -+} -+ -+ -+static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) -+{ -+ unsigned int i, j; -+ -+ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; 
j+=8) -+ AV_COPY64U(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } else { -+ for (i = 0; i < height; i++) { -+ for (j = 0; j < width; j+=16) -+ AV_COPY128(dst+j, src+j); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } -+} -+ -+ -+ -+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) -+{ -+#undef FUNC -+#define FUNC(a, depth) a ## _ ## depth -+ -+#undef PEL_FUNC -+#define PEL_FUNC(dst1, idx1, idx2, a, depth) \ -+ for(i = 0 ; i < 10 ; i++) \ -+{ \ -+ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \ -+} -+ -+#undef EPEL_FUNCS -+#define EPEL_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \ -+ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \ -+ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth) -+ -+#undef EPEL_UNI_FUNCS -+#define EPEL_UNI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \ -+ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \ -+ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth) -+ -+#undef EPEL_BI_FUNCS -+#define EPEL_BI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \ -+ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \ -+ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth) -+ -+#undef QPEL_FUNCS -+#define QPEL_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \ -+ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \ -+ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth) -+ -+#undef QPEL_UNI_FUNCS -+#define QPEL_UNI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth) -+ -+#undef QPEL_BI_FUNCS -+#define QPEL_BI_FUNCS(depth) \ -+ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \ -+ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ -+ 
PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) -+ -+#define SLICED_ADD_RESIDUAL(depth)\ -+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ -+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ -+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ -+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ -+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ -+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ -+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ -+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ -+ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ -+ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ -+ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ -+ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ -+ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) -+#define SLICED_LOOP_FILTERS(depth)\ -+ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ -+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#define SLICED_SAO(depth)\ -+ for (i = 0; i != SAO_FILTER_N; ++i) { \ -+ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ -+ } \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) -+ -+#define HEVC_DSP(depth) \ -+ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ -+ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ -+ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ -+ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ -+ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ -+ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ -+ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ -+ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ -+ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ -+ SLICED_ADD_RESIDUAL(depth); \ -+ hevcdsp->dequant = FUNC(dequant, depth); \ -+ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ -+ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ -+ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ -+ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ -+ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ -+ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \ -+ \ -+ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \ -+ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \ -+ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ -+ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ -+ \ -+ for (i = 0; i != SAO_FILTER_N; ++i) { \ -+ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ -+ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ -+ } \ -+ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ -+ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ -+ SLICED_SAO(depth); \ -+ \ -+ QPEL_FUNCS(depth); 
\ -+ QPEL_UNI_FUNCS(depth); \ -+ QPEL_BI_FUNCS(depth); \ -+ EPEL_FUNCS(depth); \ -+ EPEL_UNI_FUNCS(depth); \ -+ EPEL_BI_FUNCS(depth); \ -+ \ -+ SLICED_LOOP_FILTERS(depth); \ -+ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ -+ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \ -+ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \ -+ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \ -+ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth) -+int i = 0; -+ -+ switch (bit_depth) { -+ case 9: -+ HEVC_DSP(9); -+ break; -+ case 10: -+ HEVC_DSP(10); -+ break; -+ case 12: -+ HEVC_DSP(12); -+ break; -+ default: -+ HEVC_DSP(8); -+ break; -+ } -+ -+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; -+ hevcdsp->cpy_blk = cpy_blk; -+ -+ if (ARCH_PPC) -+ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); -+ if (ARCH_X86) -+ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth); -+ if (ARCH_ARM) -+ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth); -+ if (ARCH_MIPS) -+ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth); -+} -diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h -new file mode 100644 -index 0000000000..5a7cdeeb66 ---- /dev/null -+++ b/libavcodec/rpi_hevcdsp.h -@@ -0,0 +1,177 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere -+ * -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCDSP_H
-+#define AVCODEC_RPI_HEVCDSP_H
-+
-+#include "hevc.h"
-+#include "get_bits.h"
-+
-+struct HEVCRpiMvField;
-+
-+#define MAX_PB_SIZE 64
-+
-+#define RPI_HEVC_SAO_BUF_STRIDE 160
-+
-+
-+typedef struct RpiSAOParams {
-+ uint8_t band_position[3]; ///< sao_band_position (Y,U,V)
-+ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V)
-+ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V)
-+
-+ int16_t offset_val[3][5]; ///< SaoOffsetVal
[...]
-+static void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc, unsigned int size)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+ const int dc_v = dc >> 16;
-+ const int dc_u = (dc << 16) >> 16;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size * 2; x += 2) {
-+ dst[x] = av_clip_pixel(dst[x] + dc_u);
-+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
-+ }
-+ dst += stride;
-+ }
-+}
-+
-+
-+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual)(_dst, res, stride, 4);
-+}
-+
-+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual)(_dst, res, stride, 8);
-+}
-+
-+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual)(_dst, res, stride, 16);
-+}
-+
-+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual)(_dst, res, stride, 32);
-+}
-+
-+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 4);
-+}
-+
-+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 8);
-+}
-+
-+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 16);
-+}
-+
-+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 32);
-+}
-+
-+// -- U -- (plaited)
-+
-+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
-+}
-+
-+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
-+}
-+
-+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
-+}
-+
-+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ // Should never occur for 420, which is all that sand supports
-+ av_assert0(0);
-+}
-+
-+// -- V -- (plaited)
-+
-+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
-+}
-+
-+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
-+}
-+
-+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
-+}
-+
-+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ // Should never occur for 420, which is all that sand supports
-+ av_assert0(0);
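// Unreachable for sand 4:2:0: the largest luma transform is 32x32, so the
// co-sited chroma block is at most 16x16 and no 32x32 chroma add_residual
// variant can ever be requested -- hence the unconditional assert above.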
-+} -+ -+// -- C -- (plaited - both U & V) -+ -+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 4); -+} -+ -+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 8); -+} -+ -+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 16); -+} -+ -+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); -+} -+ -+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); -+} -+ -+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); -+} -+ -+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+ -+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) -+{ -+ int16_t *coeffs = (int16_t *) _coeffs; -+ int x, y; -+ int size = 1 << log2_size; -+ -+ if (mode) { -+ coeffs += size; -+ for (y = 0; y < size - 1; y++) { -+ for (x = 0; x < size; x++) -+ coeffs[x] += coeffs[x - size]; -+ coeffs += size; -+ } -+ } else { -+ for (y = 0; y < size; y++) { -+ for (x = 1; x < size; x++) -+ coeffs[x] += coeffs[x - 1]; -+ coeffs += size; -+ } -+ } -+} -+ -+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) -+{ -+ int shift = 15 - BIT_DEPTH - log2_size; -+ int x, y; -+ int size = 1 << log2_size; -+ -+ if (shift > 0) { -+ int offset = 1 << (shift - 1); -+ for (y = 0; y < size; y++) { -+ for (x = 0; x < size; x++) { -+ *coeffs = (*coeffs + offset) >> shift; -+ coeffs++; -+ } -+ } -+ } else { -+ for (y = 0; y < size; y++) { -+ for (x = 0; x < size; x++) { -+ *coeffs = *coeffs << -shift; -+ coeffs++; -+ } -+ } -+ } -+} -+ -+#define SET(dst, x) (dst) = (x) -+#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift) -+ -+#define TR_4x4_LUMA(dst, src, step, assign) \ -+ do { \ -+ int c0 = src[0 * step] + src[2 * step]; \ -+ int c1 = src[2 * step] + src[3 * step]; \ -+ int c2 = src[0 * step] - src[3 * step]; \ -+ int c3 = 74 * src[1 * step]; \ -+ \ -+ assign(dst[2 * step], 74 * (src[0 * step] - \ -+ src[2 * step] + \ -+ src[3 * step])); \ -+ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \ -+ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \ -+ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ -+ } while (0) -+ -+static void FUNC(transform_4x4_luma)(int16_t *coeffs) -+{ -+ int i; -+ int shift = 7; -+ int add = 1 << (shift - 1); -+ int16_t *src = coeffs; -+ -+ for (i = 0; i < 4; i++) { -+ TR_4x4_LUMA(src, src, 4, SCALE); -+ src++; -+ } -+ -+ shift = 20 - BIT_DEPTH; -+ add = 1 << (shift - 1); -+ for (i = 0; i < 4; i++) { -+ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); -+ coeffs += 4; -+ } -+} -+ -+#undef TR_4x4_LUMA -+ -+#define TR_4(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ -+ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ -+ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ -+ 
const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ -+ \ -+ assign(dst[0 * dstep], e0 + o0); \ -+ assign(dst[1 * dstep], e1 + o1); \ -+ assign(dst[2 * dstep], e1 - o1); \ -+ assign(dst[3 * dstep], e0 - o0); \ -+ } while (0) -+ -+#define TR_8(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_8[4]; \ -+ int o_8[4] = { 0 }; \ -+ for (i = 0; i < 4; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_8[i] += transform[4 * j][i] * src[j * sstep]; \ -+ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ -+ \ -+ for (i = 0; i < 4; i++) { \ -+ assign(dst[i * dstep], e_8[i] + o_8[i]); \ -+ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \ -+ } \ -+ } while (0) -+ -+#define TR_16(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_16[8]; \ -+ int o_16[8] = { 0 }; \ -+ for (i = 0; i < 8; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_16[i] += transform[2 * j][i] * src[j * sstep]; \ -+ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ -+ \ -+ for (i = 0; i < 8; i++) { \ -+ assign(dst[i * dstep], e_16[i] + o_16[i]); \ -+ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ -+ } \ -+ } while (0) -+ -+#define TR_32(dst, src, dstep, sstep, assign, end) \ -+ do { \ -+ int i, j; \ -+ int e_32[16]; \ -+ int o_32[16] = { 0 }; \ -+ for (i = 0; i < 16; i++) \ -+ for (j = 1; j < end; j += 2) \ -+ o_32[i] += transform[j][i] * src[j * sstep]; \ -+ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ -+ \ -+ for (i = 0; i < 16; i++) { \ -+ assign(dst[i * dstep], e_32[i] + o_32[i]); \ -+ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ -+ } \ -+ } while (0) -+ -+#define IDCT_VAR4(H) \ -+ int limit2 = FFMIN(col_limit + 4, H) -+#define IDCT_VAR8(H) \ -+ int limit = FFMIN(col_limit, H); \ -+ int limit2 = FFMIN(col_limit + 4, H) -+#define IDCT_VAR16(H) IDCT_VAR8(H) -+#define IDCT_VAR32(H) IDCT_VAR8(H) -+ -+#define IDCT(H) \ -+static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ -+ int col_limit) \ -+{ \ -+ int i; \ -+ int shift = 7; \ -+ int add = 1 << (shift - 1); \ -+ int16_t *src = coeffs; \ -+ IDCT_VAR ## H(H); \ -+ \ -+ for (i = 0; i < H; i++) { \ -+ TR_ ## H(src, src, H, H, SCALE, limit2); \ -+ if (limit2 < H && i%4 == 0 && !!i) \ -+ limit2 -= 4; \ -+ src++; \ -+ } \ -+ \ -+ shift = 20 - BIT_DEPTH; \ -+ add = 1 << (shift - 1); \ -+ for (i = 0; i < H; i++) { \ -+ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ -+ coeffs += H; \ -+ } \ -+} -+ -+#define IDCT_DC(H) \ -+static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \ -+{ \ -+ int i, j; \ -+ int shift = 14 - BIT_DEPTH; \ -+ int add = 1 << (shift - 1); \ -+ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \ -+ \ -+ for (j = 0; j < H; j++) { \ -+ for (i = 0; i < H; i++) { \ -+ coeffs[i + j * H] = coeff; \ -+ } \ -+ } \ -+} -+ -+IDCT( 4) -+IDCT( 8) -+IDCT(16) -+IDCT(32) -+ -+IDCT_DC( 4) -+IDCT_DC( 8) -+IDCT_DC(16) -+IDCT_DC(32) -+ -+#undef TR_4 -+#undef TR_8 -+#undef TR_16 -+#undef TR_32 -+ -+#undef SET -+#undef SCALE -+ -+static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, -+ int width, int height) -+{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int offset_table[32] = { 0 }; -+ int k, y, x; -+ int shift = BIT_DEPTH - 5; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] 
>> shift]); -+ dst += stride_dst; -+ src += stride_src; -+ } -+} -+ -+#define CMP(a, b) (((a) > (b)) - ((a) < (b))) -+ -+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, -+ int eo, int width, int height) { -+ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ int diff0 = CMP(src[x], src[x + a_stride]); -+ int diff1 = CMP(src[x], src[x + b_stride]); -+ int offset_val = edge_idx[2 + diff0 + diff1]; -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+} -+ -+ -+#if BIT_DEPTH == 10 -+// We need a 32 bit variation for the _c restores so hijack bit depth 10 -+#undef pixel -+#undef BIT_DEPTH -+#define pixel uint32_t -+#define BIT_DEPTH 32 -+// All 16 bit variations are the same -+#define sao_edge_restore_0_10 sao_edge_restore_0_9 -+#define sao_edge_restore_1_10 sao_edge_restore_1_9 -+#define sao_edge_restore_0_11 sao_edge_restore_0_9 -+#define sao_edge_restore_1_11 sao_edge_restore_1_9 -+#define sao_edge_restore_0_12 sao_edge_restore_0_9 -+#define sao_edge_restore_1_12 sao_edge_restore_1_9 -+#define sao_edge_restore_0_13 sao_edge_restore_0_9 -+#define sao_edge_restore_1_13 sao_edge_restore_1_9 -+#define sao_edge_restore_0_14 sao_edge_restore_0_9 -+#define sao_edge_restore_1_14 sao_edge_restore_1_9 -+#define sao_edge_restore_0_15 sao_edge_restore_0_9 -+#define sao_edge_restore_1_15 sao_edge_restore_1_9 -+#define sao_edge_restore_0_16 sao_edge_restore_0_9 -+#define sao_edge_restore_1_16 sao_edge_restore_1_9 -+#endif -+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 -+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ int x, y; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int sao_eo_class = sao->eo_class[c_idx]; -+ int init_x = 0, width = _width, height = _height; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ if (sao_eo_class != SAO_EO_VERT) { -+ if (borders[0]) { -+ for (y = 0; y < height; y++) { -+ dst[y * stride_dst] = src[y * stride_src]; -+ } -+ init_x = 1; -+ } -+ if (borders[2]) { -+ int offset = width - 1; -+ for (x = 0; x < height; x++) { -+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; -+ } -+ width--; -+ } -+ } -+ if (sao_eo_class != SAO_EO_HORIZ) { -+ if (borders[1]) { -+ for (x = init_x; x < width; x++) -+ dst[x] = src[x]; -+ } -+ if (borders[3]) { -+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); -+ ptrdiff_t y_stride_src = stride_src * (height - 1); -+ for (x = init_x; x < width; x++) -+ dst[x + y_stride_dst] = src[x + y_stride_src]; -+ height--; -+ } -+ } -+} -+ -+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, 
-+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ int x, y; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int sao_eo_class = sao->eo_class[c_idx]; -+ int init_x = 0, init_y = 0, width = _width, height = _height; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ -+ if (sao_eo_class != SAO_EO_VERT) { -+ if (borders[0]) { -+ for (y = 0; y < height; y++) { -+ dst[y * stride_dst] = src[y * stride_src]; -+ } -+ init_x = 1; -+ } -+ if (borders[2]) { -+ int offset = width - 1; -+ for (x = 0; x < height; x++) { -+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; -+ } -+ width--; -+ } -+ } -+ if (sao_eo_class != SAO_EO_HORIZ) { -+ if (borders[1]) { -+ for (x = init_x; x < width; x++) -+ dst[x] = src[x]; -+ init_y = 1; -+ } -+ if (borders[3]) { -+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); -+ ptrdiff_t y_stride_src = stride_src * (height - 1); -+ for (x = init_x; x < width; x++) -+ dst[x + y_stride_dst] = src[x + y_stride_src]; -+ height--; -+ } -+ } -+ -+ { -+ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1]; -+ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2]; -+ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3]; -+ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3]; -+ -+ // Restore pixels that can't be modified -+ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) { -+ for(y = init_y+save_upper_left; y< height-save_lower_left; y++) -+ dst[y*stride_dst] = src[y*stride_src]; -+ } -+ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) { -+ for(y = init_y+save_upper_right; y< height-save_lower_right; y++) -+ dst[y*stride_dst+width-1] = src[y*stride_src+width-1]; -+ } -+ -+ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) { -+ for(x = init_x+save_upper_left; x < width-save_upper_right; x++) -+ dst[x] = src[x]; -+ } -+ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) { -+ for(x = init_x+save_lower_left; x < width-save_lower_right; x++) -+ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x]; -+ } -+ if(diag_edge[0] && sao_eo_class == SAO_EO_135D) -+ dst[0] = src[0]; -+ if(diag_edge[1] && sao_eo_class == SAO_EO_45D) -+ dst[width-1] = src[width-1]; -+ if(diag_edge[2] && sao_eo_class == SAO_EO_135D) -+ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1]; -+ if(diag_edge[3] && sao_eo_class == SAO_EO_45D) -+ dst[stride_dst*(height-1)] = src[stride_src*(height-1)]; -+ -+ } -+} -+#endif -+#if BIT_DEPTH == 32 -+#undef BIT_DEPTH -+#undef pixel -+#define BIT_DEPTH 10 -+#define pixel uint16_t -+#endif -+ -+// --- Plaited chroma versions -+ -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int offset_table_u[32] = { 0 }; -+ int offset_table_v[32] = { 0 }; -+ int k, y, x; -+ int shift = BIT_DEPTH - 5; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ width *= 2; -+ -+ for (k = 0; k < 4; k++) -+ { -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; -+ } -+ for (y = 0; y < height; y++) 
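-+    // Annotation (not in the original patch): the chroma is plaited, i.e.
-+    // U samples sit at even offsets and V at odd ones, so a single pass of
-+    // the loop below applies both band-offset tables to the interleaved row.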
{ -+ for (x = 0; x < width; x += 2) -+ { -+// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); -+// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); -+ // *** & 31 shouldn't be wanted but just now we generate broken input that -+ // crashes us in 10-bit world -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ } -+} -+ -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); -+ -+ stride_dst /= sizeof(pixel); -+ width *= 2; -+ -+ av_assert0(width <= 64); -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int offset_valu = edge_idx[2 + diff0u + diff1u]; -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ int offset_valv = edge_idx[2 + diff0v + diff1v]; -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+} -+ -+// Do once -+#if BIT_DEPTH == 8 -+// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 -+// We need 32 bit for 9 bit+ -+#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 -+#endif -+ -+#undef CMP -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+static void FUNC(put_hevc_pel_pixels)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ -+ for (y = 0; y < height; y++) { -+ 
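-+        // Annotation (not in the original patch): widen to the 14-bit
-+        // intermediate precision used throughout HEVC inter prediction;
-+        // for 8-bit input this is a shift by 6, e.g. 200 << 6 = 12800.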
for (x = 0; x < width; x++) -+ dst[x] = src[x] << (14 - BIT_DEPTH); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ for (y = 0; y < height; y++) { -+ memcpy(dst, src, width * sizeof(pixel)); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); -+ } -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+#define QPEL_FILTER(src, stride) \ -+ (filter[0] * src[x - 3 * stride] + \ -+ filter[1] * src[x - 2 * stride] + \ -+ filter[2] * src[x - stride] + \ -+ filter[3] * src[x ] + \ -+ filter[4] * src[x + stride] + \ -+ filter[5] * src[x + 2 * stride] + \ -+ filter[6] * src[x + 3 * stride] + \ -+ filter[7] * src[x + 4 * stride]) -+ -+static void FUNC(put_hevc_qpel_h)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t 
_srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_v)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_hv)(int16_t *dst, -+ uint8_t *_src, -+ ptrdiff_t _srcstride, -+ int height, intptr_t mx, -+ intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; -+ tmp += MAX_PB_SIZE; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = 
(pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+ -+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 - BIT_DEPTH; -+ -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ for (y = 0; y < 
height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ -+ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = 
av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, -+ intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ const int8_t *filter; -+ pixel *src = (pixel*)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ src -= QPEL_EXTRA_BEFORE * srcstride; -+ filter = ff_hevc_rpi_qpel_filters[mx - 1]; -+ for (y = 0; y < height + QPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_qpel_filters[my - 1]; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+//////////////////////////////////////////////////////////////////////////////// -+// -+//////////////////////////////////////////////////////////////////////////////// -+#define EPEL_FILTER(src, stride) \ -+ (filter[0] * src[x - stride] + \ -+ filter[1] * src[x] + \ -+ filter[2] * src[x + stride] + \ -+ filter[3] * src[x + 2 * stride]) -+ -+static void FUNC(put_hevc_epel_h)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = 
EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_v)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_hv)(int16_t *dst, -+ uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; -+ tmp += MAX_PB_SIZE; -+ dst += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ } -+ dst += dststride; -+ src += srcstride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 
0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); -+ src += srcstride; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); -+ dst += dststride; -+ src += srcstride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, 
int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ } -+ dst += dststride; -+ src += srcstride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); -+ } -+ dst += dststride; -+ src += srcstride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); -+ src += srcstride; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int height, int denom, int wx, int ox, 
intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = denom + 14 - BIT_DEPTH; -+#if BIT_DEPTH < 14 -+ int offset = 1 << (shift - 1); -+#else -+ int offset = 0; -+#endif -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ ox = ox * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ } -+} -+ -+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, -+ int16_t *src2, -+ int height, int denom, int wx0, int wx1, -+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); -+ pixel *dst = (pixel *)_dst; -+ ptrdiff_t dststride = _dststride / sizeof(pixel); -+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; -+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; -+ int16_t *tmp = tmp_array; -+ int shift = 14 + 1 - BIT_DEPTH; -+ int log2Wd = denom + shift - 1; -+ -+ src -= EPEL_EXTRA_BEFORE * srcstride; -+ -+ for (y = 0; y < height + EPEL_EXTRA; y++) { -+ for (x = 0; x < width; x++) -+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); -+ src += srcstride; -+ tmp += MAX_PB_SIZE; -+ } -+ -+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; -+ filter = ff_hevc_rpi_epel_filters[my - 1]; -+ -+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); -+ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + -+ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); -+ tmp += MAX_PB_SIZE; -+ dst += dststride; -+ src2 += MAX_PB_SIZE; -+ } -+} -+ -+// line zero -+#define P3 pix[-4 * xstride] -+#define P2 pix[-3 * xstride] -+#define P1 pix[-2 * xstride] -+#define P0 pix[-1 * xstride] -+#define Q0 pix[0 * xstride] -+#define Q1 pix[1 * xstride] -+#define Q2 pix[2 * xstride] -+#define Q3 pix[3 * xstride] -+ -+// line three. 
used only for deblocking decision -+#define TP3 pix[-4 * xstride + 3 * ystride] -+#define TP2 pix[-3 * xstride + 3 * ystride] -+#define TP1 pix[-2 * xstride + 3 * ystride] -+#define TP0 pix[-1 * xstride + 3 * ystride] -+#define TQ0 pix[0 * xstride + 3 * ystride] -+#define TQ1 pix[1 * xstride + 3 * ystride] -+#define TQ2 pix[2 * xstride + 3 * ystride] -+#define TQ3 pix[3 * xstride + 3 * ystride] -+ -+static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, -+ ptrdiff_t _xstride, ptrdiff_t _ystride, -+ int beta, int *_tc, -+ uint8_t *_no_p, uint8_t *_no_q) -+{ -+ int d, j; -+ pixel *pix = (pixel *)_pix; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / sizeof(pixel); -+ -+ beta <<= BIT_DEPTH - 8; -+ -+ for (j = 0; j < 2; j++) { -+ const int dp0 = abs(P2 - 2 * P1 + P0); -+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); -+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); -+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); -+ const int d0 = dp0 + dq0; -+ const int d3 = dp3 + dq3; -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ const int no_p = _no_p[j]; -+ const int no_q = _no_q[j]; -+ -+ if (d0 + d3 >= beta) { -+ pix += 4 * ystride; -+ continue; -+ } else { -+ const int beta_3 = beta >> 3; -+ const int beta_2 = beta >> 2; -+ const int tc25 = ((tc * 5 + 1) >> 1); -+ -+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && -+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && -+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { -+ // strong filtering -+ const int tc2 = tc << 1; -+ for (d = 0; d < 4; d++) { -+ const int p3 = P3; -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ const int q3 = Q3; -+ if (!no_p) { -+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); -+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); -+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); -+ } -+ if (!no_q) { -+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); -+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); -+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); -+ } -+ pix += ystride; -+ } -+ } else { // normal filtering -+ int nd_p = 1; -+ int nd_q = 1; -+ const int tc_2 = tc >> 1; -+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) -+ nd_p = 2; -+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) -+ nd_q = 2; -+ -+ for (d = 0; d < 4; d++) { -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; -+ if (abs(delta0) < 10 * tc) { -+ delta0 = av_clip(delta0, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ if (!no_p && nd_p > 1) { -+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); -+ P1 = av_clip_pixel(p1 + deltap1); -+ } -+ if (!no_q && nd_q > 1) { -+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); -+ Q1 = av_clip_pixel(q1 + deltaq1); -+ } -+ } -+ pix += ystride; -+ } -+ } -+ } -+ } -+} -+ -+static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride, -+ ptrdiff_t _ystride, int *_tc, -+ uint8_t *_no_p, uint8_t *_no_q) -+{ -+ int d, j, no_p, no_q; -+ pixel *pix = (pixel *)_pix; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / 
sizeof(pixel); -+ -+ for (j = 0; j < 2; j++) { -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ if (tc <= 0) { -+ pix += 4 * ystride; -+ continue; -+ } -+ no_p = _no_p[j]; -+ no_q = _no_q[j]; -+ -+ for (d = 0; d < 4; d++) { -+ int delta0; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ pix += ystride; -+ } -+ } -+} -+ -+static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, -+ int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, -+ int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, -+ int beta, int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), -+ beta, tc, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, -+ int beta, int32_t *tc, uint8_t *no_p, -+ uint8_t *no_q) -+{ -+ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, -+ beta, tc, no_p, no_q); -+} -+ -+#undef P3 -+#undef P2 -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+#undef Q2 -+#undef Q3 -+ -+#undef TP3 -+#undef TP2 -+#undef TP1 -+#undef TP0 -+#undef TQ0 -+#undef TQ1 -+#undef TQ2 -+#undef TQ3 -+ -+// line zero -+#define P3 pix_l[0 * xstride] -+#define P2 pix_l[1 * xstride] -+#define P1 pix_l[2 * xstride] -+#define P0 pix_l[3 * xstride] -+#define Q0 pix_r[0 * xstride] -+#define Q1 pix_r[1 * xstride] -+#define Q2 pix_r[2 * xstride] -+#define Q3 pix_r[3 * xstride] -+ -+// line three. 
used only for deblocking decision -+#define TP3 pix_l[0 * xstride + 3 * ystride] -+#define TP2 pix_l[1 * xstride + 3 * ystride] -+#define TP1 pix_l[2 * xstride + 3 * ystride] -+#define TP0 pix_l[3 * xstride + 3 * ystride] -+#define TQ0 pix_r[0 * xstride + 3 * ystride] -+#define TQ1 pix_r[1 * xstride + 3 * ystride] -+#define TQ2 pix_r[2 * xstride + 3 * ystride] -+#define TQ3 pix_r[3 * xstride + 3 * ystride] -+ -+// This is identical to hevc_loop_filter_luma except that the P/Q -+// components are on separate pointers -+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, -+ uint8_t * _pix_l) -+{ -+ int d, j; -+ pixel *pix_l = (pixel *)_pix_l; -+ pixel *pix_r = (pixel *)_pix_r; -+ const ptrdiff_t xstride = 1; -+ const ptrdiff_t ystride = _stride / sizeof(pixel); -+ -+ beta <<= BIT_DEPTH - 8; -+ -+ for (j = 0; j < 2; j++) { -+ const int dp0 = abs(P2 - 2 * P1 + P0); -+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); -+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); -+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); -+ const int d0 = dp0 + dq0; -+ const int d3 = dp3 + dq3; -+ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); -+ const int no_p = no_f & 1; -+ const int no_q = no_f & 2; -+ -+ if (d0 + d3 >= beta) { -+ pix_l += 4 * ystride; -+ pix_r += 4 * ystride; -+ continue; -+ } else { -+ const int beta_3 = beta >> 3; -+ const int beta_2 = beta >> 2; -+ const int tc25 = ((tc * 5 + 1) >> 1); -+ -+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && -+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && -+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { -+ // strong filtering -+ const int tc2 = tc << 1; -+ for (d = 0; d < 4; d++) { -+ const int p3 = P3; -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ const int q3 = Q3; -+ if (!no_p) { -+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); -+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); -+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); -+ } -+ if (!no_q) { -+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); -+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); -+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); -+ } -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } else { // normal filtering -+ int nd_p = 1; -+ int nd_q = 1; -+ const int tc_2 = tc >> 1; -+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) -+ nd_p = 2; -+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) -+ nd_q = 2; -+ -+ for (d = 0; d < 4; d++) { -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; -+ if (abs(delta0) < 10 * tc) { -+ delta0 = av_clip(delta0, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ if (!no_p && nd_p > 1) { -+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); -+ P1 = av_clip_pixel(p1 + deltap1); -+ } -+ if (!no_q && nd_q > 1) { -+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); -+ Q1 = av_clip_pixel(q1 + deltaq1); -+ } -+ } -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } -+ } -+ } -+} -+ -+static void 
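-+// Annotation (not in the original patch): as in the vertical _luma2 variant
-+// above, the two tc values arrive packed into the 16-bit halves of tc2 and
-+// the p/q disable flags in bits 0 and 1 of no_f.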
FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f) -+{ -+ // Just call the non-2 function having massaged the parameters -+ int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16}; -+ uint8_t no_p[2] = {no_f & 1, no_f & 1}; -+ uint8_t no_q[2] = {no_f & 2, no_f & 2}; -+ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q); -+} -+ -+#undef TP3 -+#undef TP2 -+#undef TP1 -+#undef TP0 -+#undef TQ0 -+#undef TQ1 -+#undef TQ2 -+#undef TQ3 -+ -+#undef P3 -+#undef P2 -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+#undef Q2 -+#undef Q3 -+ -+#define P1 pix_l[0 * xstride] -+#define P0 pix_l[1 * xstride] -+#define Q0 pix_r[0 * xstride] -+#define Q1 pix_r[1 * xstride] -+ -+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, -+ ptrdiff_t _ystride, const int32_t *_tc, -+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) -+{ -+ int d, j, no_p, no_q; -+ pixel *pix_l = (pixel *)_pix_l; -+ pixel *pix_r = (pixel *)_pix_r; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / sizeof(pixel); -+ -+ for (j = 0; j < 2; j++) { -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ if (tc <= 0) { -+ pix_l += 4 * ystride; -+ pix_r += 4 * ystride; -+ continue; -+ } -+ no_p = _no_p[j]; -+ no_q = _no_q[j]; -+ -+ for (d = 0; d < 4; d++) { -+ int delta0; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } -+} -+ -+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, -+ unsigned int no_f) -+{ -+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; -+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; -+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; -+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); -+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f) -+{ -+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; -+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; -+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; -+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); -+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); -+} -+ -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+ -diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c -new file mode 100644 -index 0000000000..0aa8809a4b ---- /dev/null -+++ b/libavcodec/rpi_hevcpred.c -@@ -0,0 +1,161 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. 
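[editor's aside] For readers skimming the deleted template above: the weak chroma (UV) deblocking it implements reduces to a single clipped delta applied to the pixel pair either side of the edge. A minimal standalone sketch of that arithmetic for 8-bit pixels (helper names here are illustrative, not part of the patch):

    #include <stdint.h>

    static int clip(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

    /* Weak chroma filter for one pixel pair across an edge, as in the
     * deleted hevc_loop_filter_chroma()/hevc_loop_filter_uv2(): the delta
     * is derived from the two pixels each side of the edge and clamped
     * to +/-tc before being applied. */
    static void chroma_weak_filter(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
    {
        int delta0 = clip((((*q0 - *p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
        *p0 = (uint8_t)clip(*p0 + delta0, 0, 255);
        *q0 = (uint8_t)clip(*q0 - delta0, 0, 255);
    }

The strong/normal luma paths seen earlier follow the same shape with more taps and a per-4-line strong/normal decision based on beta.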
-+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "rpi_hevcdec.h" -+ -+#include "rpi_hevcpred.h" -+#if (ARCH_ARM) -+#include "arm/rpi_hevcpred_arm.h" -+#endif -+ -+#define PRED_C 0 -+#define BIT_DEPTH 8 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+#undef PRED_C -+ -+#define PRED_C 1 -+#define BIT_DEPTH 8 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "rpi_hevcpred_template.c" -+#undef BIT_DEPTH -+#undef PRED_C -+ -+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) -+{ -+#undef FUNC -+#define FUNC(a, depth) a ## _ ## depth -+ -+#undef FUNCC -+#define FUNCC(a, depth) a ## _ ## depth ## _c -+ -+#define HEVC_PRED_Y(depth) \ -+ hpc->intra_pred = FUNC(intra_pred, depth); \ -+ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \ -+ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \ -+ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \ -+ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \ -+ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ -+ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ -+ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ -+ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ -+ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \ -+ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \ -+ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \ -+ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \ -+ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ -+ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ -+ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ -+ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ -+ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \ -+ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \ -+ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \ -+ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth); -+ -+#define HEVC_PRED_C(depth) \ -+ hpc->intra_pred_c = FUNCC(intra_pred, depth); \ -+ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \ -+ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \ -+ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \ -+ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \ -+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ -+ hpc->pred_planar_c[1] = 
FUNCC(pred_planar_1, depth); \ -+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ -+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ -+ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \ -+ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \ -+ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \ -+ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \ -+ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \ -+ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \ -+ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ -+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ -+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ -+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \ -+ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \ -+ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \ -+ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \ -+ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth); -+ -+#define HEVC_PRED(depth) \ -+ HEVC_PRED_Y(depth); \ -+ HEVC_PRED_C(depth); -+ -+ switch (bit_depth) { -+ case 9: -+ HEVC_PRED(9); -+ break; -+ case 10: -+ HEVC_PRED(10); -+ break; -+ case 12: -+ HEVC_PRED(12); -+ break; -+ default: -+ HEVC_PRED(8); -+ break; -+ } -+ -+#if (ARCH_ARM) -+ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); -+#elif (ARCH_MIPS) -+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); -+#endif -+} -diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h -new file mode 100644 -index 0000000000..9f0edb8798 ---- /dev/null -+++ b/libavcodec/rpi_hevcpred.h -@@ -0,0 +1,123 @@ -+/* -+ * HEVC video Decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
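[editor's aside] The init function above relies on FFmpeg's bit-depth templating: the same template .c file is re-included once per BIT_DEPTH, with FUNC() pasting the depth into every symbol name, and ff_hevc_rpi_pred_init() then fills a function-pointer table (HEVCRpiPredContext) with the instances for the stream's depth. A self-contained sketch of the trick; it uses __FILE__ self-inclusion (GCC/Clang/MSVC) so it fits in one file, whereas the real code splits template and driver into separate files:

    #ifndef TEMPLATE_PASS
    #define TEMPLATE_PASS

    #include <stdio.h>

    /* Name-mangling macros, same shape as FFmpeg's FUNC(): the extra
     * indirection forces BIT_DEPTH to expand before token pasting. */
    #define NAME3(name, depth) name ## _ ## depth
    #define NAME2(name, depth) NAME3(name, depth)
    #define FUNC(name) NAME2(name, BIT_DEPTH)

    /* Stamp out one copy of the template per depth, as rpi_hevcpred.c
     * does by re-including rpi_hevcpred_template.c. */
    #define BIT_DEPTH 8
    #include __FILE__
    #undef BIT_DEPTH

    #define BIT_DEPTH 10
    #include __FILE__
    #undef BIT_DEPTH

    int main(void)
    {
        /* prints "128 512": the 8- and 10-bit instances coexist */
        printf("%d %d\n", midgrey_8(), midgrey_10());
        return 0;
    }

    #else /* template pass: compiled once per BIT_DEPTH */

    static int FUNC(midgrey)(void)
    {
        return 1 << (BIT_DEPTH - 1);
    }

    #endif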
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#ifndef AVCODEC_RPI_HEVCPRED_H -+#define AVCODEC_RPI_HEVCPRED_H -+ -+#include -+#include -+#include "config.h" -+ -+struct HEVCRpiContext; -+struct HEVCRpiLocalContext; -+ -+enum IntraPredMode { -+ INTRA_PLANAR = 0, -+ INTRA_DC, -+ INTRA_ANGULAR_2, -+ INTRA_ANGULAR_3, -+ INTRA_ANGULAR_4, -+ INTRA_ANGULAR_5, -+ INTRA_ANGULAR_6, -+ INTRA_ANGULAR_7, -+ INTRA_ANGULAR_8, -+ INTRA_ANGULAR_9, -+ INTRA_ANGULAR_10, -+ INTRA_ANGULAR_11, -+ INTRA_ANGULAR_12, -+ INTRA_ANGULAR_13, -+ INTRA_ANGULAR_14, -+ INTRA_ANGULAR_15, -+ INTRA_ANGULAR_16, -+ INTRA_ANGULAR_17, -+ INTRA_ANGULAR_18, -+ INTRA_ANGULAR_19, -+ INTRA_ANGULAR_20, -+ INTRA_ANGULAR_21, -+ INTRA_ANGULAR_22, -+ INTRA_ANGULAR_23, -+ INTRA_ANGULAR_24, -+ INTRA_ANGULAR_25, -+ INTRA_ANGULAR_26, -+ INTRA_ANGULAR_27, -+ INTRA_ANGULAR_28, -+ INTRA_ANGULAR_29, -+ INTRA_ANGULAR_30, -+ INTRA_ANGULAR_31, -+ INTRA_ANGULAR_32, -+ INTRA_ANGULAR_33, -+ INTRA_ANGULAR_34, -+}; -+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 -+#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 -+ -+typedef void intra_filter_fn_t( -+ uint8_t * const left, uint8_t * const top, -+ const unsigned int req, const unsigned int avail, -+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, -+ const unsigned int stride, -+ const unsigned int top_right_size, const unsigned int down_left_size); -+ -+typedef struct HEVCRpiPredContext { -+ void (*intra_pred)(const struct HEVCRpiContext * const s, -+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, -+ const unsigned int avail, const unsigned int log2_size); -+ -+ intra_filter_fn_t *intra_filter[4]; -+ void (*pred_planar[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride); -+ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, -+ ptrdiff_t stride); -+ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); -+ -+ void (*intra_pred_c)(const struct HEVCRpiContext * const s, -+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, -+ const unsigned int avail, const unsigned int log2_size); -+ intra_filter_fn_t *intra_filter_c[4]; -+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride); -+ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, -+ ptrdiff_t stride); -+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, ptrdiff_t stride, -+ int mode); -+ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride); -+} HEVCRpiPredContext; -+ -+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); -+ -+#endif /* AVCODEC_RPI_HEVCPRED_H */ -diff --git a/libavcodec/rpi_hevcpred_template.c 
b/libavcodec/rpi_hevcpred_template.c -new file mode 100644 -index 0000000000..f2ebcad332 ---- /dev/null -+++ b/libavcodec/rpi_hevcpred_template.c -@@ -0,0 +1,1407 @@ -+/* -+ * HEVC video decoder -+ * -+ * Copyright (C) 2012 - 2013 Guillaume Martres -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "config.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "bit_depth_template.c" -+ -+#include "rpi_hevcdec.h" -+#include "rpi_hevcpred.h" -+ -+#define DUMP_PRED 0 -+ -+#define POS(x, y) src[(x) + stride * (y)] -+ -+// INCLUDED_ONCE defined at EOF -+#ifndef INCLUDED_ONCE -+typedef uint8_t (* c8_dst_ptr_t)[2]; -+typedef const uint8_t (* c8_src_ptr_t)[2]; -+typedef uint16_t (* c16_dst_ptr_t)[2]; -+typedef const uint16_t (* c16_src_ptr_t)[2]; -+ -+// *** On ARM make these NEON registers -+typedef struct pixel4_16 { -+ uint16_t x[4]; -+} pixel4_16; -+typedef struct pixel4_32 { -+ uint32_t x[4]; -+} pixel4_32; -+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) -+{ -+ pixel4_16 t = {{x, x, x, x}}; -+ return t; -+} -+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) -+{ -+ pixel4_32 t = {{x, x, x, x}}; -+ return t; -+} -+#endif -+ -+#if PRED_C -+// For chroma we double pixel size so we copy pairs -+#undef pixel -+#undef pixel2 -+#undef pixel4 -+#undef dctcoef -+#undef INIT_CLIP -+#undef no_rnd_avg_pixel4 -+#undef rnd_avg_pixel4 -+#undef AV_RN2P -+#undef AV_RN4P -+#undef AV_RN4PA -+#undef AV_WN2P -+#undef AV_WN4P -+#undef AV_WN4PA -+#undef CLIP -+#undef FUNC -+#undef FUNCC -+#undef av_clip_pixel -+#undef PIXEL_SPLAT_X4 -+ -+#if BIT_DEPTH == 8 -+#define pixel uint16_t -+#define pixel4 pixel4_16 -+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 -+#define cpel uint8_t -+#define c_src_ptr_t c8_src_ptr_t -+#define c_dst_ptr_t c8_dst_ptr_t -+#else -+#define pixel uint32_t -+#define pixel4 pixel4_32 -+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 -+#define cpel uint16_t -+#define c_src_ptr_t c16_dst_ptr_t -+#define c_dst_ptr_t c16_dst_ptr_t -+#endif -+#define AV_RN4P(p) (*(pixel4*)(p)) -+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) -+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) -+#endif -+ -+ -+// Get PW prior to horrid PRED_C trickery -+#if BIT_DEPTH == 8 -+#define PW 1 -+#else -+#define PW 2 -+#endif -+ -+ -+#if DUMP_PRED && !defined(INCLUDED_ONCE) -+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) -+{ -+ for (unsigned int y = 0; y != size; y++, data += stride * 2) { -+ for (unsigned int x = 0; x != size; x++) { -+ printf("%4d", data[x * 2]); -+ } -+ printf("\n"); -+ } -+ printf("\n"); -+} -+#endif -+ -+#ifndef INCLUDED_ONCE -+static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t v4 = v | (v << 8); -+ uint32_t * p 
= (uint32_t *)ptr; -+ v4 = v4 | (v4 << 16); -+ do { -+ *p++ = v4; -+ } while (--n != 0); -+ } -+} -+ -+static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t v2 = v | (v << 16); -+ uint32_t * p = (uint32_t *)ptr; -+ do { -+ *p++ = v2; -+ *p++ = v2; -+ } while (--n != 0); -+ } -+} -+ -+static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) -+{ -+ if ((n >>= 2) != 0) { -+ uint32_t * p = (uint32_t *)ptr; -+ do { -+ *p++ = v; -+ *p++ = v; -+ *p++ = v; -+ *p++ = v; -+ } while (--n != 0); -+ } -+} -+ -+// Beware that this inverts the avail ordering -+// For CIP it seems easier this way round -+static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask, -+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, -+ unsigned int s0, unsigned int odd_s) -+{ -+ const unsigned int n = 1 << log2_intra_bits; -+ unsigned int fa = 0; -+ unsigned int i; -+ -+ size >>= 2; // Now in 4-pel units -+ s0 >>= 2; -+ -+ if ((avail & AVAIL_DL) != 0) -+ fa |= ((1 << s0) - 1) << (size - s0); -+ if ((avail & AVAIL_L) != 0) -+ fa |= ((1 << size) - 1) << size; -+ if ((avail & AVAIL_UL) != 0) -+ fa |= 1 << (size << 1); -+ -+ if (odd_s) { -+ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) -+ fa &= ~1; -+ is_intra += i_stride; -+ } -+ -+ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { -+ const unsigned int m = ((1 << n) - 1) << i; -+ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) -+ fa &= ~m; -+ } -+ -+ return fa; -+} -+ -+static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, -+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, -+ unsigned int s1, unsigned int odd_s) -+{ -+ if ((avail & (AVAIL_U | AVAIL_UR)) == 0) -+ { -+ return 0; -+ } -+ else -+ { -+ const unsigned int n = 1 << log2_intra_bits; -+ unsigned int fa = 0; -+ unsigned int i; -+ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; -+ -+ size >>= 2; // Now in 4-pel units -+ s1 >>= 2; -+ -+ if ((avail & AVAIL_U) != 0) -+ fa |= ((1 << size) - 1); -+ if ((avail & AVAIL_UR) != 0) -+ fa |= ((1 << s1) - 1) << size; -+ -+ if (odd_s) { -+ fa &= im | ~1; -+ im >>= 1; -+ } -+ -+ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { -+ const unsigned int m = ((1 << n) - 1) << i; -+ if ((im & 1) == 0) -+ fa &= ~m; -+ } -+ return fa; -+ } -+} -+ -+ -+ -+static inline unsigned int rmbd(unsigned int x) -+{ -+#if 1 -+ return __builtin_ctz(x); -+#else -+ unsigned int n = 0; -+ if ((x & 0xffff) == 0) { -+ x >>= 16; -+ n += 16; -+ } -+ if ((x & 0xff) == 0) { -+ x >>= 8; -+ n += 8; -+ } -+ if ((x & 0xf) == 0) { -+ x >>= 4; -+ n += 4; -+ } -+ if ((x & 0x3) == 0) { -+ x >>= 2; -+ n += 2; -+ } -+ -+ return (x & 1) == 0 ? n + 1 : n; -+#endif -+} -+#endif -+ -+ -+static void FUNC(cip_fill)(pixel * const left, pixel * const top, -+ const unsigned int avail_l, const unsigned int avail_u, -+ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, -+ const unsigned int stride, -+ const unsigned int size) -+{ -+ pixel a; -+ unsigned int i; -+ -+ // 1st find DL value -+ if ((avail_l & 1) == 0) { -+ if (avail_l != 0) -+ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; -+ else -+ { -+ // (avail_l | avail_u) != 0 so this must be good -+ const unsigned int n = rmbd(avail_u)*4; -+ a = (n >= size) ? 
src_ur[n - size] : src_u[n]; -+ } -+ } -+ -+ // L -+ { -+ pixel * d = left + size * 2 - 1; -+ const pixel * s = src_l + (size * 2 - 1) * stride; -+ unsigned int x = avail_l; -+ for (i = 0; i < size * 2; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d-- = *s; -+ s -= stride; -+ *d-- = *s; -+ s -= stride; -+ *d-- = *s; -+ s -= stride; -+ *d-- = a = *s; -+ s -= stride; -+ } -+ else -+ { -+ *d-- = a; -+ *d-- = a; -+ *d-- = a; -+ *d-- = a; -+ s -= stride * 4; -+ } -+ } -+ // UL -+ *d = a = (x & 1) != 0 ? *s : a; -+ } -+ -+ // U -+ { -+ pixel * d = top; -+ const pixel * s = src_u; -+ unsigned int x = avail_u; -+ -+ for (i = 0; i < size; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = a = *s++; -+ } -+ else -+ { -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ s += 4; -+ } -+ } -+ -+ // UR -+ s = src_ur; -+ for (i = 0; i < size; i += 4, x >>= 1) -+ { -+ if ((x & 1) != 0) { -+ // Avail -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = *s++; -+ *d++ = a = *s++; -+ } -+ else -+ { -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ *d++ = a; -+ s += 4; -+ } -+ } -+ } -+} -+ -+ -+#if !PRED_C && PW == 1 -+#define EXTEND(ptr, val, len) extend_8(ptr, val, len) -+#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) -+#define EXTEND(ptr, val, len) extend_16(ptr, val, len) -+#else -+#define EXTEND(ptr, val, len) extend_32(ptr, val, len) -+#endif -+ -+// Reqs: -+// -+// Planar: DL[0], L, ul, U, UR[0] -+// DC: dl, L, ul, U, ur -+// A2-9: DL, L, ul, u, ur -+// A10: dl, L, ul, u, ur -+// A11-17 dl, L, UL, U, ur -+// A18-25 dl, L, Ul, U, ur -+// A26 dl, l, ul, U, ur -+// A27-34 dl, l, ul, U, UR -+ -+#ifndef INCLUDED_ONCE -+ -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; -+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; -+ -+static const uint8_t req_avail_c[35] = -+{ -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L, // 2 -+ AVAIL_DL | AVAIL_L, // 3 -+ AVAIL_DL | AVAIL_L, // 4 -+ AVAIL_DL | AVAIL_L, // 5 -+ AVAIL_DL | AVAIL_L, // 6 -+ AVAIL_DL | AVAIL_L, // 7 -+ AVAIL_DL | AVAIL_L, // 8 -+ AVAIL_DL | AVAIL_L, // 9 -+ AVAIL_L, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 -+ AVAIL_U, // 26 (V) -+ AVAIL_U | AVAIL_UR, // 27 -+ AVAIL_U | AVAIL_UR, // 28 -+ AVAIL_U | AVAIL_UR, // 29 -+ AVAIL_U | AVAIL_UR, // 30 -+ AVAIL_U | AVAIL_UR, // 31 -+ AVAIL_U | AVAIL_UR, // 32 -+ AVAIL_U | AVAIL_UR, // 33 -+ AVAIL_U | AVAIL_UR // 34 -+}; -+ -+static const uint8_t req_avail[4][35] = { -+{ -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L, // 2 -+ AVAIL_DL | AVAIL_L, // 3 -+ AVAIL_DL | AVAIL_L, // 4 -+ AVAIL_DL | AVAIL_L, // 5 -+ AVAIL_DL | AVAIL_L, // 6 -+ AVAIL_DL | AVAIL_L, // 7 -+ AVAIL_DL | AVAIL_L, // 8 -+ AVAIL_DL | AVAIL_L, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) 
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 -+ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) -+ AVAIL_U | AVAIL_UR, // 27 -+ AVAIL_U | AVAIL_UR, // 28 -+ AVAIL_U | AVAIL_UR, // 29 -+ AVAIL_U | AVAIL_UR, // 30 -+ AVAIL_U | AVAIL_UR, // 31 -+ AVAIL_U | AVAIL_UR, // 32 -+ AVAIL_U | AVAIL_UR, // 33 -+ AVAIL_U | AVAIL_UR // 34 -+}, -+{ // 3 -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 -+ AVAIL_DL | AVAIL_L | 0, // 3 -+ AVAIL_DL | AVAIL_L | 0, // 4 -+ AVAIL_DL | AVAIL_L | 0, // 5 -+ AVAIL_DL | AVAIL_L | 0, // 6 -+ AVAIL_DL | AVAIL_L | 0, // 7 -+ AVAIL_DL | AVAIL_L | 0, // 8 -+ AVAIL_DL | AVAIL_L | 0, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) -+ AVAIL_U | AVAIL_UR | 0, // 27 -+ AVAIL_U | AVAIL_UR | 0, // 28 -+ AVAIL_U | AVAIL_UR | 0, // 29 -+ AVAIL_U | AVAIL_UR | 0, // 30 -+ AVAIL_U | AVAIL_UR | 0, // 31 -+ AVAIL_U | AVAIL_UR | 0, // 32 -+ AVAIL_U | AVAIL_UR | 0, // 33 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 -+}, -+{ // 4 -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 -+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 -+ AVAIL_DL | AVAIL_L | 0, // 9 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 
FILTER_LIGHT, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 -+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) -+ AVAIL_U | AVAIL_UR | 0, // 27 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33 -+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 -+}, -+{ // 5 -+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed) -+ AVAIL_L | 0 | AVAIL_U, // DC -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8 -+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9 -+ AVAIL_L | 0, // 10 (H) -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24 -+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25 -+ AVAIL_U | 0, // 26 (V) -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33 -+ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34 -+} -+}; -+ -+ -+#endif -+ -+#define filter_light1 FUNC(filter_light1) -+static inline pixel filter_light1(pixel a, pixel b, pixel c) -+{ -+ return (a + b*2 + c + 2) >> 2; -+} -+ -+#define filter_light FUNC(filter_light) -+static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) -+{ -+ pixel p0; -+ pixel p2 = *src; -+ // Allow for final pel - it is just clearer to to have the call take the actual number of output pels -+ unsigned int n_minus_1 = n - 1; -+ -+ do -+ { -+ src += sstride; -+ p0 = p1; -+ p1 = p2; -+ p2 = *src; -+ *dst++ = filter_light1(p0, p1, p2); -+ } while (--n_minus_1 != 0); -+ *dst = filter_light1(p1, p2, pn); -+} -+ -+#define filter_strong FUNC(filter_strong) -+static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) -+{ -+ unsigned int a = 64 * p0 + 32; -+ const int v = p1 - p0; -+ -+ do -+ { -+ *dst++ = (a += v) >> 6; -+ } while (--n != 0); -+} -+ -+#define intra_filter FUNC(intra_filter) -+static av_always_inline void intra_filter( -+ pixel * const left, pixel * const top, -+ const unsigned int req, const unsigned int avail, -+ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, -+ const unsigned int stride, -+ const unsigned int top_right_size, const unsigned int 
down_left_size, -+ const unsigned int log2_size) -+{ -+ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); -+ const unsigned int size = 1 << log2_size; -+ -+ // a_ is the first pel in a section working round dl -> ur -+ // b_ is the last -+ // Beware that top & left work out from UL so usage of a_ & b_ may -+ // swap between them. It is a bad naming scheme but I have found no -+ // better -+ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; -+ const pixel * b_dl = src_l + size * stride; -+ const pixel * a_l = src_l + (size - 1) * stride; -+ const pixel * b_l = src_l; -+ const pixel * ab_ul = src_l - stride; -+ const pixel * a_u = src_u; -+ const pixel * b_u = src_u + size - 1; -+ const pixel * a_ur = src_ur; -+ const pixel * b_ur = src_ur + top_right_size - 1; -+ -+ const unsigned int want = req & ~avail; -+ const unsigned int have = req & avail; -+ unsigned int i; -+ -+ if ((avail & AVAIL_DL) == 0) -+ { -+ a_dl = a_ur; -+ if ((avail & AVAIL_U) != 0) -+ a_dl = a_u; -+ if ((avail & AVAIL_UL) != 0) -+ a_dl = ab_ul; -+ if ((avail & AVAIL_L) != 0) -+ a_dl = a_l; -+ b_dl = a_dl; -+ } -+ -+ if ((avail & AVAIL_L) == 0) -+ { -+ a_l = b_dl; -+ b_l = b_dl; -+ } -+ if ((avail & AVAIL_UL) == 0) -+ { -+ ab_ul = b_l; -+ } -+ if ((avail & AVAIL_U) == 0) -+ { -+ a_u = ab_ul; -+ b_u = ab_ul; -+ } -+ if ((avail & AVAIL_UR) == 0) -+ { -+ a_ur = b_u; -+ b_ur = b_u; -+ } -+ -+ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints -+ { -+ if ((req & AVAIL_UL) != 0) -+ left[-1] = *ab_ul; -+ -+ if ((want & AVAIL_L) != 0) -+ EXTEND(left, *a_l, size); -+ if ((want & AVAIL_DL) != 0) -+ EXTEND(left + size, *a_dl, size); -+ if ((want & AVAIL_U) != 0) -+ EXTEND(top, *a_u, size); -+ if ((want & AVAIL_UR) != 0) -+ EXTEND(top + size, *a_ur, size); -+ -+ if ((have & AVAIL_U) != 0) -+ // Always good - even with sand -+ memcpy(top, a_u, size * sizeof(pixel)); -+ if ((have & AVAIL_UR) != 0) -+ { -+ memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); -+ EXTEND(top + size + top_right_size, *b_ur, -+ size - top_right_size); -+ } -+ if ((have & AVAIL_L) != 0) -+ { -+ for (i = 0; i < size; i++) -+ left[i] = b_l[stride * i]; -+ } -+ if ((have & AVAIL_DL) != 0) -+ { -+ for (i = 0; i < down_left_size; i++) -+ left[i + size] = b_dl[stride * i]; -+ EXTEND(left + size + down_left_size, *a_dl, -+ size - down_left_size); -+ } -+ } -+ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint -+ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && -+ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) -+ { -+ if ((req & (AVAIL_U | AVAIL_UR)) != 0) -+ filter_strong(top, *ab_ul, *b_ur, size * 2); -+ left[-1] = *ab_ul; -+ if ((req & (AVAIL_L | AVAIL_DL)) != 0) -+ filter_strong(left, *ab_ul, *a_dl, size*2); -+ } -+ else -+ { -+ // Same code for both have & want for UL -+ if ((req & AVAIL_UL) != 0) -+ { -+ left[-1] = filter_light1(*b_l, *ab_ul, *a_u); -+ } -+ -+ if ((want & AVAIL_L) != 0) -+ { -+ EXTEND(left, *a_l, size); -+ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; -+ } -+ if ((want & AVAIL_DL) != 0) -+ { -+ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding -+ EXTEND(left + size, *a_l, size); -+ } -+ if ((want & AVAIL_U) != 0) -+ { -+ EXTEND(top, *a_u, size); -+ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; -+ } -+ if ((want & AVAIL_UR) != 0) -+ { -+ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding -+ EXTEND(top + size, *a_ur, size); -+ } -+ -+ if ((have & AVAIL_U) != 0) -+ { 
-+ filter_light(top, *ab_ul, a_u, *a_ur, 1, size); -+ } -+ if ((have & AVAIL_UR) != 0) { -+ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size); -+ top[size*2 - 1] = *b_ur; -+ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size); -+ } -+ if ((have & AVAIL_L) != 0) -+ { -+ filter_light(left, *ab_ul, b_l, *b_dl, stride, size); -+ } -+ if ((have & AVAIL_DL) != 0) -+ { -+ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size); -+ left[size*2 - 1] = *a_dl; -+ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size); -+ } -+ } -+} -+ -+#define INTRA_FILTER(log2_size) \ -+static void FUNC(intra_filter_ ## log2_size)( \ -+ uint8_t * const left, uint8_t * const top, \ -+ const unsigned int req, const unsigned int avail, \ -+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \ -+ const unsigned int stride, \ -+ const unsigned int top_right_size, const unsigned int down_left_size) \ -+{ \ -+ intra_filter((pixel *)left, (pixel *)top, req, avail, \ -+ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \ -+} -+ -+INTRA_FILTER(2) -+INTRA_FILTER(3) -+INTRA_FILTER(4) -+INTRA_FILTER(5) -+ -+#undef intra_filter -+#undef INTRA_FILTER -+ -+static void FUNC(intra_pred)(const HEVCRpiContext * const s, -+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, -+ const unsigned int log2_size) -+{ -+ // c_idx will alaways be 1 for _c versions and 0 for y -+ const unsigned int c_idx = PRED_C; -+ const unsigned int hshift = ctx_hshift(s, c_idx); -+ const unsigned int vshift = ctx_vshift(s, c_idx); -+ const unsigned int size = (1 << log2_size); -+ const unsigned int x = x0 >> hshift; -+ const unsigned int y = y0 >> vshift; -+ -+ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); -+ pixel *const src = c_idx == 0 ? -+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : -+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); -+ -+ // Align so we can do multiple loads in the asm -+ // Padded to 16 byte boundary so as not to confuse anything -+ DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]); -+ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); -+ -+ pixel * const left = left_array + 16 / sizeof(pixel); -+ const pixel * top_pred = top; -+ -+ const pixel * src_l = src - 1; -+ const pixel * src_u = src - stride; -+ const pixel * src_ur = src_u + size; -+#if !PRED_C -+ const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable; -+#else -+ const unsigned int req = req_avail_c[mode]; -+#endif -+ -+ // If we have nothing to pred from then fill with grey -+ // This isn't a common case but dealing with it here means we don't have to -+ // test for it later -+ if (avail == 0) -+ { -+dc_only: -+#if !PRED_C -+ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride); -+#else -+ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride); -+#endif -+ return; -+ } -+ -+ { -+ // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs -+ const AVFrame * const frame = s->frame; -+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; -+ if ((x & mask) == 0) -+ src_l -= stripe_adj; -+ if (((x + size) & mask) == 0) -+ src_ur += stripe_adj; -+ } -+ -+ // Can deal with I-slices in 'normal' code even if CIP -+ // This also means that we don't need to generate (elsewhere) is_intra -+ // for IRAP frames -+ if (s->ps.pps->constrained_intra_pred_flag == 1 && -+ s->sh.slice_type != HEVC_SLICE_I) -+ { -+ // * If we ever actually care about CIP performance then we should -+ // special case out size 4 stuff (can be done by 'normal') and -+ // have 8-pel avail masks -+ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)), -+ -(int)(s->ps.sps->pcm_width), -+ 1 << (((x - 1) >> (3 - hshift)) & 7), -+ 1 - hshift, -+ avail, -+ size, -+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), -+ vshift != 0 ? 0 : (y >> 2) & 1); -+ -+ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)), -+ (x >> (3 - hshift)) & 7, -+ 1 - hshift, -+ avail, -+ size, -+ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size), -+ hshift != 0 ? 0 : (x >> 2) & 1); -+ -+ // Anything left? -+ if ((avail_l | avail_u) == 0) -+ goto dc_only; -+ -+ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); -+ -+#if !PRED_C -+ if ((req & FILTER_LIGHT) != 0) -+ { -+ const unsigned threshold = 1 << (BIT_DEPTH - 5); -+ if ((req & FILTER_STRONG) != 0 && -+ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && -+ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) -+ { -+ filter_strong(top, left[-1], top[63], 64); -+ filter_strong(left, left[-1], left[63], 64); -+ } else -+ { -+ // LHS writes UL too so copy for top -+ const pixel p_ul = left[-1]; -+ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); -+ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); -+ } -+ } -+#endif -+ } -+ else -+ { -+ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); -+ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && -+ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) -+ { -+ top_pred = src_u; -+ } -+ else -+ { -+#if !PRED_C -+ s->hpc.intra_filter[log2_size - 2] -+#else -+ s->hpc.intra_filter_c[log2_size - 2] -+#endif -+ ((uint8_t *)left, (uint8_t *)top, req, avail, -+ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), -+ ur_size, -+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); -+ } -+ } -+ -+ -+#if !PRED_C -+ switch (mode) { -+ case INTRA_PLANAR: -+ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_DC: -+ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ default: -+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ 
(uint8_t *)left, stride, -+ mode); -+ break; -+ } -+#else -+ switch (mode) { -+ case INTRA_PLANAR: -+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_DC: -+ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_ANGULAR_HORIZONTAL: -+ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ case INTRA_ANGULAR_VERTICAL: -+ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ default: -+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, -+ (uint8_t *)left, stride, -+ mode); -+ break; -+ } -+ -+#if DUMP_PRED -+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); -+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); -+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); -+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); -+#endif -+#endif -+} -+ -+#if !PRED_C -+static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, ptrdiff_t stride, -+ int trafo_size) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ int size = 1 << trafo_size; -+ for (y = 0; y < size; y++) -+ for (x = 0; x < size; x++) -+ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + -+ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); -+} -+#else -+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, -+ const uint8_t * _left, ptrdiff_t stride, -+ int trafo_size) -+{ -+ int x, y; -+ int size = 1 << trafo_size; -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const c_src_ptr_t top = (c_src_ptr_t)_top; -+ const c_src_ptr_t left = (c_src_ptr_t)_left; -+ -+ for (y = 0; y < size; y++, src += stride) -+ { -+ for (x = 0; x < size; x++) -+ { -+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + -+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); -+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + -+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); -+ } -+ } -+} -+#endif -+ -+#define PRED_PLANAR(size)\ -+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_planar)(src, top, left, stride, size + 2); \ -+} -+ -+PRED_PLANAR(0) -+PRED_PLANAR(1) -+PRED_PLANAR(2) -+PRED_PLANAR(3) -+ -+#undef PRED_PLANAR -+ -+#if !PRED_C -+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, int log2_size) -+{ -+ int i, j, x, y; -+ int size = (1 << log2_size); -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ int dc = size; -+ pixel4 a; -+ for (i = 0; i < size; i++) -+ dc += left[i] + top[i]; -+ -+ dc >>= log2_size + 1; -+ -+ a = PIXEL_SPLAT_X4(dc); -+ -+ for (i = 0; i < size; i++) -+ for (j = 0; j < size; j+=4) -+ AV_WN4P(&POS(j, i), a); -+ -+// if (c_idx == 0 && size < 32) -+// As we now have separate fns for y & c - no need to test that -+ if (size < 32) -+ { -+ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; -+ for (x = 1; x < size; x++) -+ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; -+ for (y = 1; y < size; y++) -+ POS(0, y) = (left[y] + 3 * 
dc + 2) >> 2; -+ } -+} -+#else -+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, int log2_size) -+{ -+ unsigned int i, j; -+ const unsigned int size = (1 << log2_size); -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const c_src_ptr_t top = (c_src_ptr_t)_top; -+ const c_src_ptr_t left = (c_src_ptr_t)_left; -+ unsigned int dc0 = size; -+ unsigned int dc1 = size; -+ -+ for (i = 0; i < size; i++) -+ { -+ dc0 += left[i][0] + top[i][0]; -+ dc1 += left[i][1] + top[i][1]; -+ } -+ -+ dc0 >>= log2_size + 1; -+ dc1 >>= log2_size + 1; -+ -+ for (i = 0; i < size; i++, src += stride) -+ { -+ for (j = 0; j < size; ++j) -+ { -+ src[j][0] = dc0; -+ src[j][1] = dc1; -+ -+ } -+ } -+} -+#endif -+ -+#define PRED_DC(size)\ -+static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_dc)(src, top, left, stride, size + 2); \ -+} -+ -+PRED_DC(0) -+PRED_DC(1) -+PRED_DC(2) -+PRED_DC(3) -+ -+#undef PRED_DC -+ -+ -+ -+ -+#if !PRED_C -+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) -+{ -+ int i, j; -+ int size = (1 << log2_size); -+ pixel *src = (pixel *)_src; -+ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); -+ -+ for (i = 0; i < size; i++) -+ for (j = 0; j < size; j+=4) -+ AV_WN4P(&POS(j, i), a); -+} -+#else -+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) -+{ -+ unsigned int i, j; -+ const unsigned int size = (1 << log2_size); -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const pixel a = (1 << (BIT_DEPTH - 1)); -+ -+ for (i = 0; i < size; i++, src += stride) -+ { -+ for (j = 0; j < size; ++j) -+ { -+ src[j][0] = a; -+ src[j][1] = a; -+ } -+ } -+} -+#endif -+ -+#define PRED_DC0(size)\ -+static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ -+{ \ -+ FUNC(pred_dc0)(src, stride, size + 2); \ -+} -+ -+PRED_DC0(0) -+PRED_DC0(1) -+PRED_DC0(2) -+PRED_DC0(3) -+ -+#undef PRED_DC0 -+ -+ -+ -+ -+#ifndef ANGLE_CONSTS -+#define ANGLE_CONSTS -+static const int intra_pred_angle[] = { -+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, -+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 -+}; -+static const int inv_angle[] = { -+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, -+ -630, -910, -1638, -4096 -+}; -+#endif -+ -+#if !PRED_C -+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, -+ const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, -+ int mode, int size) -+{ -+ int x, y; -+ pixel *src = (pixel *)_src; -+ const pixel *top = (const pixel *)_top; -+ const pixel *left = (const pixel *)_left; -+ -+ int angle = intra_pred_angle[mode - 2]; -+ pixel ref_array[3 * MAX_TB_SIZE + 4]; -+ pixel *ref_tmp = ref_array + size; -+ const pixel *ref; -+ int last = (size * angle) >> 5; -+ -+ if (mode >= 18) { -+ ref = top - 1; -+ -+ if (angle < 0) -+ { -+ memcpy(ref_tmp + 1, top, size * PW); -+ ref_tmp[0] = left[-1]; -+ -+ for (x = last; x <= -1; x++) -+ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; -+ ref = ref_tmp; -+ } -+ -+ for (y = 0; y < size; y++) { -+ int idx = ((y + 1) * angle) >> 5; -+ int fact = ((y + 1) * angle) & 31; -+ if (fact) { -+ for (x = 0; x < size; x += 4) { -+ POS(x , y) = ((32 - fact) * ref[x + idx + 1] + -+ fact * ref[x + idx + 2] + 16) >> 5; -+ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] + -+ fact * ref[x + 1 + idx + 2] + 16) >> 5; -+ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] + -+ fact * ref[x + 2 + idx + 
2] + 16) >> 5; -+ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] + -+ fact * ref[x + 3 + idx + 2] + 16) >> 5; -+ } -+ } else { -+ for (x = 0; x < size; x += 4) -+ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); -+ } -+ } -+ if (mode == 26 && size < 32) { -+ for (y = 0; y < size; y++) -+ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); -+ } -+ -+ } else { -+ ref = left - 1; -+ if (angle < 0 && last < -1) { -+ for (x = 0; x <= size; x += 4) -+ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1])); -+ // Inv angle <= -256 so top offset >= 0 -+ for (x = last; x <= -1; x++) -+ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; -+ ref = ref_tmp; -+ } -+ -+ for (x = 0; x < size; x++) { -+ int idx = ((x + 1) * angle) >> 5; -+ int fact = ((x + 1) * angle) & 31; -+ if (fact) { -+ for (y = 0; y < size; y++) { -+ POS(x, y) = ((32 - fact) * ref[y + idx + 1] + -+ fact * ref[y + idx + 2] + 16) >> 5; -+ } -+ } else { -+ for (y = 0; y < size; y++) -+ POS(x, y) = ref[y + idx + 1]; -+ } -+ } -+ if (mode == 10 && size < 32) { -+ for (x = 0; x < size; x += 4) { -+ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1)); -+ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1)); -+ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1)); -+ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1)); -+ } -+ } -+ } -+} -+#else -+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, -+ const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, -+ int mode, int size) -+{ -+ int x, y; -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ c_src_ptr_t top = (c_src_ptr_t)_top; -+ c_src_ptr_t left = (c_src_ptr_t)_left; -+ -+ const int angle = intra_pred_angle[mode - 2]; -+ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; -+ c_dst_ptr_t ref_tmp = ref_array + size; -+ c_src_ptr_t ref; -+ const int last = (size * angle) >> 5; -+ -+ if (mode >= 18) { -+ ref = top - 1; -+ if (angle < 0) { -+ memcpy(ref_tmp + 1, top, size * 2 * PW); -+ ref_tmp[0][0] = left[-1][0]; -+ ref_tmp[0][1] = left[-1][1]; -+ for (x = last; x <= -1; x++) -+ { -+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; -+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; -+ } -+ ref = (c_src_ptr_t)ref_tmp; -+ } -+ -+ for (y = 0; y < size; y++, src += stride) { -+ const int idx = ((y + 1) * angle) >> 5; -+ const int fact = ((y + 1) * angle) & 31; -+ if (fact) { -+ for (x = 0; x < size; ++x) { -+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + -+ fact * ref[x + idx + 2][0] + 16) >> 5; -+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + -+ fact * ref[x + idx + 2][1] + 16) >> 5; -+ } -+ } else { -+ memcpy(src, ref + idx + 1, size * 2 * PW); -+ } -+ } -+ } else { -+ ref = left - 1; -+ if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); -+ for (x = last; x <= -1; x++) -+ { -+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; -+ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; -+ } -+ ref = (c_src_ptr_t)ref_tmp; -+ } -+ -+ for (x = 0; x < size; x++, src++) { -+ const int idx = ((x + 1) * angle) >> 5; -+ const int fact = ((x + 1) * angle) & 31; -+ if (fact) { -+ for (y = 0; y < size; y++) { -+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + -+ fact * ref[y + idx + 2][0] + 16) >> 5; -+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + -+ fact * ref[y + idx + 2][1] + 16) >> 5; -+ } -+ } else { -+ for (y = 0; y < size; y++) -+ { -+ src[y * stride][0] = ref[y + idx + 
1][0]; -+ src[y * stride][1] = ref[y + idx + 1][1]; -+ } -+ } -+ } -+ } -+} -+#endif -+ -+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, -+ ptrdiff_t stride, int mode) -+{ -+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2); -+} -+ -+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, -+ ptrdiff_t stride, int mode) -+{ -+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3); -+} -+ -+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, -+ ptrdiff_t stride, int mode) -+{ -+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4); -+} -+ -+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, -+ const uint8_t *left, -+ ptrdiff_t stride, int mode) -+{ -+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5); -+} -+ -+#undef cpel -+#undef c_src_ptr_t -+#undef c_dst_ptr_t -+ -+#undef EXTEND -+#undef POS -+#undef PW -+ -+#undef filter_light1 -+#undef filter_light -+#undef filter_strong -+#undef ref_gen -+ -+#ifndef INCLUDED_ONCE -+#define INCLUDED_ONCE -+#endif -+ -diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c -new file mode 100644 -index 0000000000..98a0b104b7 ---- /dev/null -+++ b/libavcodec/rpi_mailbox.c -@@ -0,0 +1,155 @@ -+/* -+Copyright (c) 2012, Broadcom Europe Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
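[editor's aside] The file introduced here talks to the VideoCore firmware through the /dev/vcio character device: a property buffer of 32-bit words is handed to the firmware via the IOCTL_MBOX_PROPERTY ioctl defined below. A minimal sketch of one such transaction, assuming the documented firmware property tag 0x00000001 (get firmware revision); it must run on a Pi with permission to open /dev/vcio:

    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #define IOCTL_MBOX_PROPERTY _IOWR(100, 0, char *) /* MAJOR_NUM 100, as in rpi_mailbox.c */

    int main(void)
    {
        /* Property message layout: total size, request code, one tag, end tag. */
        uint32_t msg[8] = {
            8 * sizeof(uint32_t), /* total buffer size in bytes            */
            0,                    /* 0 = process request                   */
            0x00000001,           /* tag: GET_FIRMWARE_REVISION            */
            4,                    /* tag value buffer size in bytes        */
            0,                    /* tag request code                      */
            0,                    /* value buffer (filled in by firmware)  */
            0,                    /* end tag                               */
            0
        };
        int fd = open("/dev/vcio", O_RDONLY);
        if (fd < 0 || ioctl(fd, IOCTL_MBOX_PROPERTY, msg) < 0) {
            perror("mbox");
            return 1;
        }
        /* On success msg[1] is 0x80000000 and the revision is in msg[5]. */
        printf("firmware revision: 0x%08x (status 0x%08x)\n", msg[5], msg[1]);
        close(fd);
        return 0;
    }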
-+*/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#define MAJOR_NUM 100 -+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *) -+#define DEVICE_FILE_NAME "/dev/vcio" -+ -+#include "rpi_mailbox.h" -+//#include -+ -+/* -+ * use ioctl to send mbox property message -+ */ -+ -+static int mbox_property(int file_desc, void *buf) -+{ -+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf); -+ -+ if (ret_val < 0) { -+ printf("ioctl_set_msg failed:%d\n", ret_val); -+ } -+ -+#ifdef DEBUG -+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf; -+ for (i=0; i -+#include -+#include -+#include -+ -+#include "config.h" -+ -+#include "libavutil/avassert.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" -+#include -+#include -+#include -+#pragma GCC diagnostic pop -+ -+#include "rpi_mem.h" -+#include "rpi_zc_frames.h" -+ -+ -+#define OPT_PREFER_CMA 0 -+ -+struct rpi_cache_flush_env_s { -+ struct vcsm_user_clean_invalid2_s v; -+}; -+ -+ -+// GPU memory alloc fns (internal) -+ -+static void gpu_free_internal(GPU_MEM_PTR_T * const p) -+{ -+ if (p->arm != NULL) -+ vcsm_unlock_ptr(p->arm); -+ if (p->vcsm_handle != 0) -+ vcsm_free(p->vcsm_handle); -+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again -+} -+ -+ -+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, -+ const int numbytes, const unsigned int cache_type, const char * const name) -+{ -+ memset(p, 0, sizeof(*p)); -+ p->numbytes = (numbytes + 255) & ~255; // Round up -+ -+ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name); -+ goto fail; -+ } -+ if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to VC handle from VCSM for %s\n", name); -+ goto fail; -+ } -+ if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name); -+ goto fail; -+ } -+ if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name); -+ goto fail; -+ } -+ -+ return 0; -+ -+fail: -+ gpu_free_internal(p); -+ return AVERROR(ENOMEM); -+} -+ -+// Public gpu fns -+ -+// Allocate memory on GPU -+// Fills in structure

containing ARM pointer, videocore handle, videocore memory address, numbytes -+// Returns 0 on success. -+// This allocates memory that will not be cached in ARM's data cache. -+// Therefore safe to use without data cache flushing. -+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) -+{ -+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached"); -+} -+ -+// This allocates data that will be -+// Cached in ARM L2 -+// Uncached in VPU L2 -+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) -+{ -+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached"); -+} -+ -+void gpu_free(GPU_MEM_PTR_T * const p) { -+ gpu_free_internal(p); -+} -+ -+void rpi_mem_gpu_uninit(void) -+{ -+ vcsm_exit(); -+ bcm_host_deinit(); -+} -+ -+int rpi_mem_gpu_init(const unsigned int flags) -+{ -+ const int wants_cma = bcm_host_is_fkms_active(); -+ int use_cma; -+ -+ (void)flags; -+ -+ if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0) -+ use_cma = 1; -+ else if (vcsm_init_ex(wants_cma ? 0 : 1, -1) == 0) -+ use_cma = 0; -+ else -+ return AVERROR(EINVAL); -+ -+ bcm_host_init(); -+ -+ return use_cma + 1; -+} -+ -+// ---------------------------------------------------------------------------- -+// -+// Cache flush functions -+ -+#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s)) -+ -+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf) -+{ -+ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf; -+ *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}}; -+ return rfe; -+} -+ -+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) -+{ -+ // Nothing needed -+} -+ -+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe) -+{ -+ int rc = 0; -+ if (rfe->v.op_count != 0) { -+ if (vcsm_clean_invalid2(&rfe->v) != 0) -+ { -+ const int err = errno; -+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err); -+ rc = AVERROR(err); -+ } -+ rfe->v.op_count = 0; -+ } -+ return rc; -+} -+ -+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) -+{ -+ int rc = rpi_cache_flush_execute(rfe);; -+ -+ return rc; -+} -+ -+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) -+{ -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ -+ av_assert1(rfe->v.op_count <= CACHE_EL_MAX); -+ -+ b->invalidate_mode = mode; -+ b->block_count = blocks; -+ b->start_address = gm->arm + offset0; -+ b->block_size = block_size; -+ b->inter_block_stride = block_stride; -+} -+ -+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset, const unsigned int size) -+{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || size == 0) -+ return; -+ -+ av_assert1(offset <= gm->numbytes); -+ av_assert1(size <= gm->numbytes); -+ av_assert1(offset + size <= gm->numbytes); -+ -+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); -+} -+ -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) -+{ -+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); -+} -+ -+ -+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const 
AVFrame * const frame, const unsigned int mode) -+{ -+#if !RPI_ONE_BUF -+#error Fixme! (NIF) -+#endif -+ if (gpu_is_buf1(frame)) { -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); -+ } -+ else -+ { -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); -+ } -+} -+ -+// Flush an area of a frame -+// Width, height, x0, y0 in luma pels -+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, -+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, -+ const unsigned int uv_shift, const int do_luma, const int do_chroma) -+{ -+ const unsigned int y_offset = frame->linesize[0] * y0; -+ const unsigned int y_size = frame->linesize[0] * height; -+ // Round UV up/down to get everything -+ const unsigned int uv_rnd = (1U << uv_shift) >> 1; -+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); -+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; -+ -+#if 0 -+ // *** frame->height is cropped height so not good -+ // As all unsigned they will also reject -ve -+ // Test individually as well as added to reject overflow -+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped -+ av_assert0(n <= (unsigned int)frame->height); -+ av_assert0(start_line + n <= (unsigned int)frame->height); -+#endif -+ -+ if (!gpu_is_buf1(frame)) -+ { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); -+ } -+ } -+ else if (!av_rpi_is_sand_frame(frame)) -+ { -+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); -+ } -+ } -+ else -+ { -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); -+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); -+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C -+ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); -+ -+ if (do_chroma) -+ { -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ b->invalidate_mode = mode; -+ b->block_count = block_count; -+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); -+ b->block_size = uv_size; -+ b->inter_block_stride = stride1 * stride2; -+ } -+ if (do_luma) -+ { -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ b->invalidate_mode = mode; -+ b->block_count = block_count; -+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); -+ b->block_size = y_size; -+ b->inter_block_stride = stride1 * stride2; -+ } -+ } -+} -+ -+// Call this to clean and invalidate a region of memory -+void 
rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) -+{ -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_gm_ptr(rfe, p, mode); -+ rpi_cache_flush_finish(rfe); -+} -+ -diff --git a/libavcodec/rpi_mem.h b/libavcodec/rpi_mem.h -new file mode 100644 -index 0000000000..a451079806 ---- /dev/null -+++ b/libavcodec/rpi_mem.h -@@ -0,0 +1,88 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#ifndef RPI_MEM_H -+#define RPI_MEM_H -+ -+typedef struct gpu_mem_ptr_s { -+ unsigned char *arm; // Pointer to memory mapped on ARM side -+ int vc_handle; // Videocore handle of relocatable memory -+ int vcsm_handle; // Handle for use by VCSM -+ int vc; // Address for use in GPU code -+ int numbytes; // Size of memory block -+} GPU_MEM_PTR_T; -+ -+// General GPU functions -+ -+#define GPU_INIT_GPU 1 -+#define GPU_INIT_CMA 2 -+ -+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); -+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); -+extern void gpu_free(GPU_MEM_PTR_T * const p); -+int rpi_mem_gpu_init(const unsigned int flags); -+void rpi_mem_gpu_uninit(void); -+ -+// Cache flush stuff -+ -+struct rpi_cache_flush_env_s; -+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; -+ -+typedef struct {uint32_t t[33];} rpi_cache_buf_t; -+ -+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf); -+// Free env without flushing -+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); -+// Do the accumulated flush & clear but do not free the env -+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe); -+// Do the accumulated flush & free the env -+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); -+ -+typedef enum -+{ -+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, -+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, -+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 -+} rpi_cache_flush_mode_t; -+ -+struct AVFrame; -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, -+ const unsigned int offset, const unsigned int size); -+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); -+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, -+ const unsigned int uv_shift, const int do_luma, const int do_chroma); -+ -+// init, add, finish for one gm ptr -+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); -+ -+#endif -diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c -new file mode 100644 -index 0000000000..cb7b96119e ---- /dev/null -+++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,776 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. 
-+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+ -+#include -+#include -+#include -+#include -+#include -+#include "libavutil/avassert.h" -+ -+#include "config.h" -+ -+#include -+#include -+ -+#include -+ -+#include "rpi_mailbox.h" -+#include "rpi_mem.h" -+#include "rpi_qpu.h" -+#include "rpi_hevc_shader.h" -+#include "rpi_hevc_transform8.h" -+#include "rpi_hevc_transform10.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) -+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 -+ -+// Add profile flags to all QPU requests - generates output in "vcdbg log msg" -+// Beware this is expensive and will probably throw off all other timing by >10% -+#define RPI_TRACE_QPU_PROFILE_ALL 0 -+ -+// QPU "noflush" flags -+// a mixture of flushing & profiling -+ -+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed -+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers -+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results -+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling -+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) -+ -+#define vcos_verify_ge0(x) ((x)>=0) -+ -+// Size in 32bit words -+#define QPU_CODE_SIZE 4098 -+#define VPU_CODE_SIZE 16384 -+ -+static const short rpi_transMatrix2even[32][16] = { // Even rows first -+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, -+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, -+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, -+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87}, -+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83}, -+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80}, -+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75}, -+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70}, -+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64}, -+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57}, -+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50}, -+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43}, -+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36}, -+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25}, -+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18}, -+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, 
-70, 57, -43, 25, -9}, -+// Odd rows -+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4}, -+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13}, -+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22}, -+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31}, -+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38}, -+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46}, -+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54}, -+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61}, -+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67}, -+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73}, -+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78}, -+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82}, -+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85}, -+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88}, -+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90}, -+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} -+}; -+ -+// Code/constants on GPU -+struct GPU -+{ -+// unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code8[VPU_CODE_SIZE]; -+ unsigned int vpu_code10[VPU_CODE_SIZE]; -+ short transMatrix2even[16*16*2]; -+}; -+ -+#define WAIT_COUNT_MAX 16 -+ -+typedef struct trace_time_one_s -+{ -+ int count; -+ int64_t start[WAIT_COUNT_MAX]; -+ int64_t total[WAIT_COUNT_MAX]; -+} trace_time_one_t; -+ -+typedef struct trace_time_wait_s -+{ -+ unsigned int jcount; -+ int64_t start0; -+ int64_t last_update; -+ trace_time_one_t active; -+ trace_time_one_t wait; -+} trace_time_wait_t; -+ -+typedef struct vq_wait_s -+{ -+ sem_t sem; -+ struct vq_wait_s * next; -+} vq_wait_t; -+ -+#define VQ_WAIT_POOL_SIZE 16 -+typedef struct vq_wait_pool_s -+{ -+ vq_wait_t * head; -+ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; -+} vq_wait_pool_t; -+ -+static void vq_wait_pool_init(vq_wait_pool_t * const pool); -+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); -+ -+typedef struct gpu_env_s -+{ -+ int open_count; -+ int init_count; -+ int vpu_i_cache_flushed; -+ GPU_MEM_PTR_T qpu_code_gm_ptr; -+ GPU_MEM_PTR_T code_gm_ptr; -+ GPU_MEM_PTR_T dummy_gm_ptr; -+ vq_wait_pool_t wait_pool; -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ trace_time_wait_t ttw; -+#endif -+} gpu_env_t; -+ -+// Stop more than one thread trying to allocate memory or use the processing resources at once -+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; -+static gpu_env_t * gpu = NULL; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ -+static int64_t ns_time(void) -+{ -+ struct timespec ts; -+ clock_gettime(CLOCK_MONOTONIC, &ts); -+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; -+} -+ -+ -+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 -+ -+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) -+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) -+#define T_ARG(t) T_SEC(t), T_MS(t) -+#define T_FMT "%u.%03u" -+ -+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) -+{ -+ // Update totals for levels that are still pending -+ for (int i = 0; i < tto->count; ++i) { -+ tto->total[i] += now - tto->start[i]; -+ tto->start[i] = now; -+ } -+ -+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", -+ prefix, -+ T_ARG(now - start0 
- tto->total[0]), -+ T_ARG(tto->total[0]), -+ T_ARG(tto->total[1]), -+ T_ARG(tto->total[2]), -+ T_ARG(tto->total[3])); -+} -+ -+ -+static void tto_start(trace_time_one_t * const tto, const int64_t now) -+{ -+ av_assert0(tto->count < WAIT_COUNT_MAX); -+ tto->start[tto->count++] = now; -+} -+ -+static void tto_end(trace_time_one_t * const tto, const int64_t now) -+{ -+ const int n = --tto->count; -+ av_assert0(n >= 0); -+ tto->total[n] += now - tto->start[n]; -+} -+ -+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now) -+{ -+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); -+ tto_print(&ttw->active, now, ttw->start0, "Active"); -+ tto_print(&ttw->wait, now, ttw->start0, " Wait"); -+} -+ -+#endif -+ -+// GPU memory alloc fns (internal) -+ -+static void gpu_free_internal(GPU_MEM_PTR_T * const p) -+{ -+ if (p->arm != NULL) -+ vcsm_unlock_ptr(p->arm); -+ if (p->vcsm_handle != 0) -+ vcsm_free(p->vcsm_handle); -+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again -+} -+ -+ -+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, -+ const int numbytes, const unsigned int cache_type, const char * const name) -+{ -+ memset(p, 0, sizeof(*p)); -+ p->numbytes = (numbytes + 255) & ~255; // Round up -+ -+ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 || -+ (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 || -+ (p->arm = vcsm_lock(p->vcsm_handle)) == NULL || -+ (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) -+ { -+ gpu_free_internal(p); -+ return AVERROR(ENOMEM); -+ } -+ return 0; -+} -+ -+ -+// GPU init, free, lock, unlock -+ -+static void gpu_term(void) -+{ -+ gpu_env_t * const ge = gpu; -+ -+ // We have to hope that everything has terminated... -+ gpu = NULL; -+ -+ vc_gpuserv_deinit(); -+ -+ gpu_free_internal(&ge->code_gm_ptr); -+ gpu_free_internal(&ge->qpu_code_gm_ptr); -+ gpu_free_internal(&ge->dummy_gm_ptr); -+ -+ vcsm_exit(); -+ -+ vq_wait_pool_deinit(&ge->wait_pool); -+ -+ free(ge); -+} -+ -+ -+// Connect to QPU, returns 0 on success.
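gpu_init() below is only reached through gpu_lock_ref() on first use, so from outside this file the environment is driven entirely through the public wrappers. A minimal lifetime sketch, assuming only the declarations in rpi_qpu.h (with_vpu is just an illustrative caller; error handling trimmed):

    #include "rpi_qpu.h"

    static int with_vpu(void)
    {
        if (vpu_qpu_init() != 0)                      // First call builds the env via gpu_lock_ref()
            return -1;
        const unsigned int vpu_code = vpu_get_fn(8);  // VC address of the 8-bit VPU transform code
        // ... queue work with the vpu_qpu_job_* calls, passing vpu_code ...
        (void)vpu_code;
        vpu_qpu_term();                               // Dropping the last ref ends in gpu_term()
        return 0;
    }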
-+static int gpu_init(gpu_env_t ** const gpu) { -+ volatile struct GPU* ptr; -+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t)); -+ int rv; -+ *gpu = NULL; -+ -+ if (ge == NULL) -+ return -1; -+ -+ vq_wait_pool_init(&ge->wait_pool); -+ -+ vcsm_init(); -+ -+ // Now copy over the QPU code into GPU memory -+ if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0) -+ return rv; -+ -+ { -+ int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader; -+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes); -+ memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes); -+ } -+ -+ // And the VPU code -+ if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0) -+ return rv; -+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm; -+ -+ // Zero everything so we have zeros between the code bits -+ memset((void *)ptr, 0, sizeof(*ptr)); -+ { -+ int num_bytes = sizeof(rpi_hevc_transform8); -+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); -+ } -+ { -+ int num_bytes = sizeof(rpi_hevc_transform10); -+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); -+ } -+ // And the transform coefficients -+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); -+ -+ // Generate a dummy "frame" & fill with 0x80 -+ // * Could reset to 1 << bit_depth? -+ if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0) -+ return rv; -+ memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000); -+ -+ *gpu = ge; -+ return 0; -+} -+ -+ -+ -+static void gpu_unlock(void) { -+ pthread_mutex_unlock(&gpu_mutex); -+} -+ -+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
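// In this implementation that reduces to taking gpu_mutex: the env must
// already exist (hence the assert below); creating it on first use is the
// job of gpu_lock_ref().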
-+static gpu_env_t * gpu_lock(void) { -+ pthread_mutex_lock(&gpu_mutex); -+ -+ av_assert1(gpu != NULL); -+ return gpu; -+} -+ -+static gpu_env_t * gpu_lock_ref(void) -+{ -+ pthread_mutex_lock(&gpu_mutex); -+ -+ if (gpu == NULL) { -+ int rv = gpu_init(&gpu); -+ if (rv != 0) { -+ gpu_unlock(); -+ return NULL; -+ } -+ } -+ -+ ++gpu->open_count; -+ return gpu; -+} -+ -+static void gpu_unlock_unref(gpu_env_t * const ge) -+{ -+ if (--ge->open_count == 0) -+ gpu_term(); -+ -+ gpu_unlock(); -+} -+ -+static inline gpu_env_t * gpu_ptr(void) -+{ -+ av_assert1(gpu != NULL); -+ return gpu; -+} -+ -+unsigned int vpu_get_fn(const unsigned int bit_depth) { -+ uint32_t a = 0; -+ -+ // Make sure that the gpu is initialized -+ av_assert1(gpu != NULL); -+ switch (bit_depth){ -+ case 8: -+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); -+ break; -+ case 10: -+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); -+ break; -+ default: -+ av_assert0(0); -+ } -+ return a; -+} -+ -+unsigned int vpu_get_constants(void) { -+ av_assert1(gpu != NULL); -+ return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even)); -+} -+ -+void gpu_ref(void) -+{ -+ gpu_lock_ref(); -+ gpu_unlock(); -+} -+ -+void gpu_unref(void) -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ gpu_unlock_unref(ge); -+} -+ -+// ---------------------------------------------------------------------------- -+ -+ -+// Wait abstractions - mostly so we can easily add profile code -+static void vq_wait_pool_init(vq_wait_pool_t * const wp) -+{ -+ unsigned int i; -+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { -+ sem_init(&wp->pool[i].sem, 0, 0); -+ wp->pool[i].next = wp->pool + i + 1; -+ } -+ wp->head = wp->pool + 0; -+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL; -+} -+ -+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) -+{ -+ unsigned int i; -+ wp->head = NULL; -+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { -+ sem_destroy(&wp->pool[i].sem); -+ wp->pool[i].next = NULL; -+ } -+} -+ -+ -+// If sem_init actually takes time then maybe we want a pool... 
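// (There is one: vq_wait_new() / vq_wait_delete() below only pop and push
// entries of wait_pool under the gpu lock, so sem_init() / sem_destroy() run
// once in vq_wait_pool_init() / _deinit() rather than per job.)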
-+static vq_wait_t * vq_wait_new(void) -+{ -+ gpu_env_t * const ge = gpu_lock_ref(); -+ vq_wait_t * const wait = ge->wait_pool.head; -+ ge->wait_pool.head = wait->next; -+ wait->next = NULL; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ tto_start(&ge->ttw.active, ns_time()); -+#endif -+ -+ gpu_unlock(); -+ return wait; -+} -+ -+static void vq_wait_delete(vq_wait_t * const wait) -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ wait->next = ge->wait_pool.head; -+ ge->wait_pool.head = wait; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ trace_time_wait_t * const ttw = &ge->ttw; -+ const int64_t now = ns_time(); -+ ++ttw->jcount; -+ tto_end(&ttw->wait, now); -+ -+ if (ttw->start0 == 0) -+ { -+ ttw->start0 = ttw->active.start[0]; -+ ttw->last_update = ttw->start0; -+ } -+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) -+ { -+ ttw->last_update += WAIT_TIME_PRINT_PERIOD; -+ ttw_print(ttw, now); -+ } -+ } -+#endif -+ gpu_unlock_unref(ge); -+} -+ -+static void vq_wait_wait(vq_wait_t * const wait) -+{ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ const int64_t now = ns_time(); -+ gpu_env_t * const ge = gpu_lock(); -+ tto_start(&ge->ttw.wait, now); -+ gpu_unlock(); -+ } -+#endif -+ -+ while (sem_wait(&wait->sem) == -1 && errno == EINTR) -+ /* loop */; -+} -+ -+static void vq_wait_post(vq_wait_t * const wait) -+{ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ gpu_env_t *const ge = gpu_lock(); -+ tto_end(&ge->ttw.active, ns_time()); -+ gpu_unlock(); -+ } -+#endif -+ -+ sem_post(&wait->sem); -+} -+ -+ -+ -+// Header comments were wrong for these two -+#define VPU_QPU_MASK_QPU 1 -+#define VPU_QPU_MASK_VPU 2 -+ -+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; -+ -+vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf) -+{ -+// vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); -+ vpu_qpu_job_env_t * vqj = buf; -+// memset(vqj, 0, sizeof(*vqj)); -+ vqj->n = 0; -+ vqj->mask = 0; -+ return vqj; -+} -+ -+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) -+{ -+// memset(vqj, 0, sizeof(*vqj)); -+// free(vqj); -+} -+ -+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj) -+{ -+ struct gpu_job_s * const j = vqj->j + vqj->n++; -+ av_assert1(vqj->n <= VPU_QPU_JOB_MAX); -+ return j; -+} -+ -+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code, -+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5) -+{ -+ if (vpu_code != 0) { -+ struct gpu_job_s *const j = new_job(vqj); -+ vqj->mask |= VPU_QPU_MASK_VPU; -+ -+ j->command = EXECUTE_VPU; -+ j->callback.func = 0; -+ j->callback.cookie = NULL; -+ // The bottom two bits of the execute address contain no-flush flags -+ // b0 will flush the VPU I-cache if unset so we nearly always want that set -+ // as we never reload code -+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; -+ j->u.v.q[1] = r0; -+ j->u.v.q[2] = r1; -+ j->u.v.q[3] = r2; -+ j->u.v.q[4] = r3; -+ j->u.v.q[5] = r4; -+ j->u.v.q[6] = r5; -+ gpu->vpu_i_cache_flushed = 1; -+ } -+} -+ -+// flags are QPU_FLAGS_xxx -+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) -+{ -+ if (n != 0) { -+ struct gpu_job_s *const j = new_job(vqj); -+ vqj->mask |= VPU_QPU_MASK_QPU; -+ -+ j->command = EXECUTE_QPU; -+ j->callback.func = 0; -+ j->callback.cookie = NULL; -+ -+ j->u.q.jobs = n; -+#if RPI_TRACE_QPU_PROFILE_ALL -+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; -+#else -+ 
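// Profiling off: still skip the VPU data-cache flush, but leave QPU cache &
// TMU flushing enabled (QPU_FLAGS_NO_FLUSH_QPU left unset).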
j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; -+#endif -+ j->u.q.timeout = 5000; -+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); -+ } -+} -+ -+// Convert callback to sem post -+static void vpu_qpu_job_callback_wait(void * v) -+{ -+ vq_wait_post(v); -+} -+ -+// Poke a user-supplied sem -+static void vpu_qpu_job_callback_sem(void * v) -+{ -+ sem_post((sem_t *)v); -+} -+ -+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) -+{ -+ vq_wait_t * wait; -+ -+ if (vqj->mask == 0) { -+ *wait_h = NULL; -+ return; -+ } -+ -+ // We are going to want a sync object -+ wait = vq_wait_new(); -+ -+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync -+ // If we only posted one thing or only QPU jobs -+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) -+ { -+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); -+ av_assert1(j->callback.func == 0); -+ -+ j->callback.func = vpu_qpu_job_callback_wait; -+ j->callback.cookie = wait; -+ } -+ else -+ { -+ struct gpu_job_s *const j = new_job(vqj); -+ -+ j->command = EXECUTE_SYNC; -+ j->u.s.mask = vqj->mask; -+ j->callback.func = vpu_qpu_job_callback_wait; -+ j->callback.cookie = wait; -+ } -+ -+ vqj->mask = 0; -+ *wait_h = wait; -+} -+ -+// Returns 0 if no sync added ('cos Q empty), 1 if sync added -+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem) -+{ -+ // If nothing on q then just return -+ if (vqj->mask == 0) -+ return 0; -+ -+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync -+ // If we only posted one thing or only QPU jobs -+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) -+ { -+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); -+ av_assert1(j->callback.func == 0); -+ -+ j->callback.func = vpu_qpu_job_callback_sem; -+ j->callback.cookie = sem; -+ } -+ else -+ { -+ struct gpu_job_s *const j = new_job(vqj); -+ -+ j->command = EXECUTE_SYNC; -+ j->u.s.mask = vqj->mask; -+ j->callback.func = vpu_qpu_job_callback_sem; -+ j->callback.cookie = sem; -+ } -+ -+ vqj->mask = 0; -+ return 1; -+} -+ -+ -+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) -+{ -+ if (vqj->n == 0) -+ return 0; -+ -+ return vc_gpuserv_execute_code(vqj->n, vqj->j); -+} -+ -+// Simple wrapper of start + delete -+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj) -+{ -+ int rv; -+ rv = vpu_qpu_job_start(vqj); -+ vpu_qpu_job_delete(vqj); -+ return rv; -+} -+ -+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) -+{ -+ if (wait_h != NULL) -+ { -+ vq_wait_t * const wait = *wait_h; -+ if (wait != NULL) { -+ *wait_h = NULL; -+ vq_wait_wait(wait); -+ vq_wait_delete(wait); -+ } -+ } -+} -+ -+int vpu_qpu_init() -+{ -+ gpu_env_t * const ge = gpu_lock_ref(); -+ if (ge == NULL) -+ return -1; -+ -+ if (ge->init_count++ == 0) -+ { -+ vc_gpuserv_init(); -+ } -+ -+ gpu_unlock(); -+ return 0; -+} -+ -+void vpu_qpu_term() -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ -+ if (--ge->init_count == 0) { -+ vc_gpuserv_deinit(); -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ ttw_print(&ge->ttw, ns_time()); -+#endif -+ } -+ -+ gpu_unlock_unref(ge); -+} -+ -+uint32_t qpu_fn(const int * const mc_fn) -+{ -+ return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader); -+} -+ -+uint32_t qpu_dummy(void) -+{ -+ return gpu->dummy_gm_ptr.vc; -+} -+ -+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) -+{ -+ // Dummy values we can catch with emulation -+ qf->y_pxx = ~1U; -+ qf->y_bxx = ~2U; -+ qf->y_p00 = ~3U; -+ qf->y_b00 = ~4U; -+ qf->c_pxx = ~5U; -+ qf->c_bxx = ~6U; -+ 
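// Fill in the real entry points for the bit depths we support; qpu_fn()
// translates an offset into the shader code array into the VC address the
// QPU executes from.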
-+ switch (bit_depth) { -+ case 8: -+ qf->y_pxx = qpu_fn(mc_filter_y_pxx); -+ qf->y_bxx = qpu_fn(mc_filter_y_bxx); -+ qf->y_p00 = qpu_fn(mc_filter_y_p00); -+ qf->y_b00 = qpu_fn(mc_filter_y_b00); -+ qf->c_pxx = qpu_fn(mc_filter_c_p); -+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); -+ qf->c_bxx = qpu_fn(mc_filter_c_b); -+ break; -+ case 10: -+ qf->c_pxx = qpu_fn(mc_filter_c10_p); -+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); -+ qf->c_bxx = qpu_fn(mc_filter_c10_b); -+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); -+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); -+ qf->y_p00 = qpu_fn(mc_filter_y10_p00); -+ qf->y_b00 = qpu_fn(mc_filter_y10_b00); -+ break; -+ default: -+ return -1; -+ } -+ return 0; -+} -+ -diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h -new file mode 100644 -index 0000000000..8777687021 ---- /dev/null -+++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,103 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ -+Authors: John Cox, Ben Avison -+*/ -+ -+#ifndef RPI_QPU_H -+#define RPI_QPU_H -+ -+#include "rpi_mem.h" -+#include "rpi_zc_frames.h" -+ -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" -+#pragma GCC diagnostic ignored "-Wstrict-prototypes" -+#include "interface/vmcs_host/vc_vchi_gpuserv.h" // for gpu_job_s -+#pragma GCC diagnostic pop -+ -+// QPU specific functions -+ -+typedef struct HEVCRpiQpu { -+ uint32_t c_pxx; -+ uint32_t c_pxx_l1; -+ uint32_t c_bxx; -+ uint32_t y_pxx; -+ uint32_t y_bxx; -+ uint32_t y_p00; -+ uint32_t y_b00; -+} HEVCRpiQpu; -+ -+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); -+ -+uint32_t qpu_fn(const int * const mc_fn); -+uint32_t qpu_dummy(void); -+ -+#define QPU_N_GRP 4 -+#define QPU_N_MAX 12 -+ -+#define QPU_MAIL_EL_VALS 2 -+ -+struct vpu_qpu_wait_s; -+typedef struct vq_wait_s * vpu_qpu_wait_h; -+ -+// VPU specific functions -+ -+struct vpu_qpu_job_env_s; -+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; -+ -+#define VPU_QPU_JOB_MAX 4 -+struct vpu_qpu_job_env_s -+{ -+ unsigned int n; -+ unsigned int mask; -+ struct gpu_job_s j[VPU_QPU_JOB_MAX]; -+}; -+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; -+ -+vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf); -+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); -+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, -+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); -+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); -+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem); -+int vpu_qpu_job_start(const vpu_qpu_job_h vqj); -+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); -+ -+extern unsigned int vpu_get_fn(const unsigned int bit_depth); -+extern unsigned int vpu_get_constants(void); -+ -+// Waits for the previously posted job to complete and will null out *wait_h after use -+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+int vpu_qpu_init(void); -+void vpu_qpu_term(void); -+ -+void gpu_ref(void); -+void gpu_unref(void); -+ -+#endif -diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c -new file mode 100644 -index 0000000000..edd0412aa0 ---- /dev/null -+++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,1214 @@ -+#include "config.h" -+ -+#include "libavcodec/avcodec.h" -+#include "rpi_mem.h" -+#include "rpi_mailbox.h" -+#include "rpi_zc.h" -+#include "libavutil/avassert.h" -+#include -+ -+#include "libavutil/buffer_internal.h" -+ -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" -+#include -+#include -+#pragma GCC diagnostic pop -+ -+#define TRACE_ALLOC 0 -+#define DEBUG_ALWAYS_KEEP_LOCKED 0 -+ -+struct ZcPoolEnt; -+ -+typedef struct ZcPool -+{ -+ size_t numbytes; -+ struct ZcPoolEnt * head; -+ pthread_mutex_t lock; -+} ZcPool; -+ -+typedef struct ZcPoolEnt -+{ -+ size_t numbytes; -+ -+ unsigned int vcsm_handle; -+ unsigned int vc_handle; -+ void * map_arm; -+ unsigned int map_vc; -+ -+ struct ZcPoolEnt * next; -+ struct ZcPool * pool; -+} ZcPoolEnt; -+ -+typedef struct ZcOldCtxVals -+{ -+ int thread_safe_callbacks; -+ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); -+ void * opaque; -+} ZcOldCtxVals; -+ -+typedef struct AVZcEnv -+{ -+
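// Count of outstanding users of this env: incremented (atomically) for every
// buffer created in av_rpi_zc_get_buffer() and dropped again when
// zc_buf_env_free_cb() calls av_rpi_zc_env_release()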
unsigned int refcount; -+ ZcOldCtxVals old; -+ -+ void * pool_env; -+ av_rpi_zc_alloc_buf_fn_t * alloc_buf; -+ av_rpi_zc_free_pool_fn_t * free_pool; -+ -+ unsigned int pool_size; -+} ZcEnv; -+ -+typedef struct ZcUserBufEnv { -+ void * v; -+ const av_rpi_zc_buf_fn_tab_t * fn; -+ size_t numbytes; -+ int offset; -+} ZcUserBufEnv; -+ -+#define ZC_BUF_INVALID 0 -+#define ZC_BUF_VALID 1 -+#define ZC_BUF_NEVER 2 -+ -+typedef struct ZcBufEnv { -+ GPU_MEM_PTR_T gmem; -+ AVZcEnvPtr zc; -+ int is_valid; -+ AVBufferRef * user; -+ AVRpiZcFrameGeometry geo; -+ size_t size_y; -+ size_t size_c; -+ size_t size_pic; -+ ssize_t offset; -+ pthread_mutex_t lock; -+ pthread_cond_t cond; -+} ZcBufEnv; -+ -+ -+ -+ -+ -+ -+#define ALLOC_PAD 0 -+#define ALLOC_ROUND 0x1000 -+#define STRIDE_ROUND 64 -+#define STRIDE_OR 0 -+ -+#define DEBUG_ZAP0_BUFFERS 0 -+ -+static inline int av_rpi_is_sand_format(const int format) -+{ -+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) || -+ (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10); -+} -+ -+static inline int av_rpi_is_sand_frame(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand_format(frame->format); -+} -+ -+//---------------------------------------------------------------------------- -+// -+// Internal pool stuff -+ -+// Pool entry functions -+ -+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size) -+{ -+ ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt)); -+ -+ // Round up to 4k & add 4k -+ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); -+ -+ if (zp == NULL) { -+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); -+ goto fail0; -+ } -+ -+ // The 0x80 here maps all pages here rather than waiting for lazy mapping -+ // BEWARE that in GPU land a later unlock/lock pair will put us back into -+ // lazy mode - which will also break cache invalidate calls. 
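// Presumably for that reason the ARM mapping, once taken via vcsm_lock() in
// the fn table below, is kept until zc_pool_ent_free() finally calls
// vcsm_unlock_hdl().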
-+ if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); -+ goto fail1; -+ } -+ -+#if TRACE_ALLOC -+ printf("%s: Alloc %#x bytes @ h=%d\n", __func__, alloc_size, zp->vcsm_handle); -+#endif -+ -+ zp->numbytes = alloc_size; -+ zp->pool = pool; -+ return zp; -+ -+fail1: -+ av_free(zp); -+fail0: -+ return NULL; -+} -+ -+static void zc_pool_ent_free(ZcPoolEnt * const zp) -+{ -+#if TRACE_ALLOC -+ printf("%s: Free %#x bytes @ h=%d\n", __func__, zp->numbytes, zp->vcsm_handle); -+#endif -+ -+ if (zp->vcsm_handle != 0) -+ { -+ // VC addr & handle need no dealloc -+ if (zp->map_arm != NULL) -+ vcsm_unlock_hdl(zp->vcsm_handle); -+ vcsm_free(zp->vcsm_handle); -+ } -+ av_free(zp); -+} -+ -+//---------------------------------------------------------------------------- -+// -+// Pool functions -+ -+static void zc_pool_free_ent_list(ZcPoolEnt * p) -+{ -+ while (p != NULL) -+ { -+ ZcPoolEnt * const zp = p; -+ p = p->next; -+ zc_pool_ent_free(zp); -+ } -+} -+ -+static void zc_pool_flush(ZcPool * const pool) -+{ -+ ZcPoolEnt * p = pool->head; -+ pool->head = NULL; -+ pool->numbytes = ~0U; -+ zc_pool_free_ent_list(p); -+} -+ -+static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes) -+{ -+ ZcPoolEnt * zp = NULL; -+ ZcPoolEnt * flush_list = NULL; -+ size_t numbytes; -+ -+ pthread_mutex_lock(&pool->lock); -+ -+ numbytes = pool->numbytes; -+ -+ // If size isn't close then dump the pool -+ // Close in this context means within 128k -+ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) -+ { -+ flush_list = pool->head; -+ pool->head = NULL; -+ pool->numbytes = numbytes = req_bytes; -+ } -+ else if (pool->head != NULL) -+ { -+ zp = pool->head; -+ pool->head = zp->next; -+ } -+ -+ pthread_mutex_unlock(&pool->lock); -+ -+ zc_pool_free_ent_list(flush_list); -+ -+ if (zp == NULL) -+ zp = zc_pool_ent_alloc(pool, numbytes); -+ -+ return zp; -+} -+ -+static void zc_pool_put_ent(ZcPoolEnt * const zp) -+{ -+ ZcPool * const pool = zp == NULL ? NULL : zp->pool; -+ if (zp != NULL) -+ { -+ pthread_mutex_lock(&pool->lock); -+#if TRACE_ALLOC -+ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->numbytes); -+#endif -+ -+ if (pool->numbytes == zp->numbytes) -+ { -+ zp->next = pool->head; -+ pool->head = zp; -+ pthread_mutex_unlock(&pool->lock); -+ } -+ else -+ { -+ pthread_mutex_unlock(&pool->lock); -+ zc_pool_ent_free(zp); -+ } -+ } -+} -+ -+static ZcPool * -+zc_pool_new(void) -+{ -+ ZcPool * const pool = av_mallocz(sizeof(*pool)); -+ if (pool == NULL) -+ return NULL; -+ -+ pool->numbytes = -1; -+ pool->head = NULL; -+ pthread_mutex_init(&pool->lock, NULL); -+ return pool; -+} -+ -+static void -+zc_pool_delete(ZcPool * const pool) -+{ -+ if (pool != NULL) -+ { -+ pool->numbytes = -1; -+ zc_pool_flush(pool); -+ pthread_mutex_destroy(&pool->lock); -+ av_free(pool); -+ } -+} -+ -+//============================================================================ -+// -+// ZC implementation using above pool implementation -+// -+// Fn table fns... 
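The adapters below expose a ZcPoolEnt through the generic av_rpi_zc_buf_fn_tab_t interface consumed by av_rpi_zc_buf(); VC handles and mappings are resolved lazily on first use and cached in the entry. Any external allocator can be plugged in the same way - a minimal sketch, where the my_* names are hypothetical and only illustrate the signatures used by the pool implementation:

    // Hypothetical user fn table - my_buf_* are stand-ins, not part of this patch.
    static const av_rpi_zc_buf_fn_tab_t my_buf_fns = {
        .free        = my_buf_free,        // void         my_buf_free(void * v)
        .vcsm_handle = my_buf_vcsm_handle, // unsigned int my_buf_vcsm_handle(void * v)
        .vc_handle   = my_buf_vc_handle,   // unsigned int my_buf_vc_handle(void * v)
        .map_arm     = my_buf_map_arm,     // void *       my_buf_map_arm(void * v)
        .map_vc      = my_buf_map_vc,      // unsigned int my_buf_map_vc(void * v)
    };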
-+ -+static void zc_pool_free_v(void * v) -+{ -+ zc_pool_put_ent(v); -+} -+ -+static unsigned int zc_pool_ent_vcsm_handle_v(void * v) -+{ -+ ZcPoolEnt * zp = v; -+ return zp->vcsm_handle; -+} -+ -+static unsigned int zc_pool_ent_vc_handle_v(void * v) -+{ -+ ZcPoolEnt * zp = v; -+ if (zp->vc_handle == 0) -+ { -+ if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0) -+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n", -+ __func__, zp->vcsm_handle); -+ } -+ return zp->vc_handle; -+} -+ -+static void * zc_pool_ent_map_arm_v(void * v) -+{ -+ ZcPoolEnt * zp = v; -+ if (zp->map_arm == NULL) -+ { -+ if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL) -+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n", -+ __func__, zp->vcsm_handle); -+ } -+ return zp->map_arm; -+} -+ -+static unsigned int zc_pool_ent_map_vc_v(void * v) -+{ -+ ZcPoolEnt * zp = v; -+ if (zp->map_vc == 0) -+ { -+ if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0) -+ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n", -+ __func__, zp->vcsm_handle); -+ } -+ return zp->map_vc; -+} -+ -+static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = { -+ .free = zc_pool_free_v, -+ .vcsm_handle = zc_pool_ent_vcsm_handle_v, -+ .vc_handle = zc_pool_ent_vc_handle_v, -+ .map_arm = zc_pool_ent_map_arm_v, -+ .map_vc = zc_pool_ent_map_vc_v, -+}; -+ -+// ZC Env fns -+ -+// Delete pool -+// All buffers guaranteed freed by now -+static void -+zc_pool_delete_v(void * v) -+{ -+ zc_pool_delete((ZcPool *)v); -+ rpi_mem_gpu_uninit(); -+} -+ -+// Allocate a new ZC buffer -+static AVBufferRef * -+zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo) -+{ -+ ZcPool * const pool = v; -+ ZcPoolEnt *const zp = zc_pool_get_ent(pool, size); -+ AVBufferRef * buf; -+ -+ (void)geo; // geo ignored here -+ -+ if (zp == NULL) { -+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); -+ goto fail0; -+ } -+ -+ if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n"); -+ goto fail2; -+ } -+ -+ return buf; -+ -+fail2: -+ zc_pool_put_ent(zp); -+fail0: -+ return NULL; -+} -+ -+// Init wrappers - the public fns -+ -+AVZcEnvPtr -+av_rpi_zc_int_env_alloc(void * logctx) -+{ -+ ZcEnv * zc; -+ ZcPool * pool_env; -+ -+ if (rpi_mem_gpu_init(0) < 0) -+ return NULL; -+ -+ if ((pool_env = zc_pool_new()) == NULL) -+ goto fail1; -+ -+ if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL) -+ goto fail2; -+ -+ return zc; -+ -+fail2: -+ zc_pool_delete(pool_env); -+fail1: -+ rpi_mem_gpu_uninit(); -+ return NULL; -+} -+ -+void -+av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp) -+{ -+ const AVZcEnvPtr zc = *zcp; -+ *zcp = NULL; -+ if (zc != NULL) -+ av_rpi_zc_env_release(zc); -+} -+ -+//============================================================================ -+// -+// Geometry -+// -+// This is a separate chunck to the rest -+ -+// Get mailbox fd - should be in a lock when called -+// Rely on process close to close it -+static int mbox_fd(void) -+{ -+ static int fd = -1; -+ if (fd != -1) -+ return fd; -+ return (fd = mbox_open()); -+} -+ -+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const int format, const unsigned int video_width, const unsigned int video_height) -+{ -+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; -+ -+ AVRpiZcFrameGeometry geo = { -+ .format = format, -+ .video_width = video_width, -+ .video_height = 
video_height -+ }; -+ -+ switch (format) -+ { -+ case AV_PIX_FMT_YUV420P: -+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ geo.stride_c = geo.stride_y / 2; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; -+ geo.planes_c = 2; -+ geo.stripes = 1; -+ geo.bytes_per_pel = 1; -+ geo.stripe_is_yc = 1; -+ break; -+ -+ case AV_PIX_FMT_YUV420P10: -+ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ geo.stride_c = geo.stride_y / 2; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; -+ geo.planes_c = 2; -+ geo.stripes = 1; -+ geo.bytes_per_pel = 2; -+ geo.stripe_is_yc = 1; -+ break; -+ -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ { -+ const unsigned int stripe_w = 128; -+ -+ static VC_IMAGE_T img = {0}; -+ -+ // Given the overhead of calling the mailbox keep a stashed -+ // copy as we will almost certainly just want the same numbers again -+ // but that means we need a lock -+ pthread_mutex_lock(&sand_lock); -+ -+ if (img.width != video_width || img.height != video_height) -+ { -+ VC_IMAGE_T new_img = { -+ .type = VC_IMAGE_YUV_UV, -+ .width = video_width, -+ .height = video_height -+ }; -+ -+ mbox_get_image_params(mbox_fd(), &new_img); -+ img = new_img; -+ } -+ -+ geo.stride_y = stripe_w; -+ geo.stride_c = stripe_w; -+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; -+ geo.height_c = img.pitch / stripe_w - geo.height_y; -+ geo.stripe_is_yc = 1; -+ if (geo.height_y * stripe_w > img.pitch) -+ { -+ // "tall" sand - all C blocks now follow Y -+ geo.height_y = img.pitch / stripe_w; -+ geo.height_c = geo.height_y; -+ geo.stripe_is_yc = 0; -+ } -+ geo.planes_c = 1; -+ geo.stripes = (video_width + stripe_w - 1) / stripe_w; -+ geo.bytes_per_pel = 1; -+ -+ pthread_mutex_unlock(&sand_lock); -+#if 1 -+ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", -+ video_width, video_height, -+ geo.stride_y, geo.stride_c, -+ geo.height_y, geo.height_c, -+ geo.stripes, img.pitch); -+#endif -+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); -+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); -+ break; -+ } -+ -+ case AV_PIX_FMT_RPI4_10: -+ { -+ const unsigned int stripe_w = 128; // bytes -+ -+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; -+ static VC_IMAGE_T img = {0}; -+ -+ // Given the overhead of calling the mailbox keep a stashed -+ // copy as we will almost certainly just want the same numbers again -+ // but that means we need a lock -+ pthread_mutex_lock(&sand_lock); -+ -+ if (img.width != video_width || img.height != video_height) -+ { -+ VC_IMAGE_T new_img = { -+ .type = VC_IMAGE_YUV10COL, -+ .width = video_width, -+ .height = video_height -+ }; -+ -+ mbox_get_image_params(mbox_fd(), &new_img); -+ img = new_img; -+ } -+ -+ geo.stride_y = stripe_w; -+ geo.stride_c = stripe_w; -+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; -+ geo.height_c = img.pitch / stripe_w - geo.height_y; -+ geo.planes_c = 1; -+ geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w; -+ geo.bytes_per_pel = 1; -+ geo.stripe_is_yc = 1; -+ -+ pthread_mutex_unlock(&sand_lock); -+ -+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); -+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); -+ break; -+ } -+ -+ case AV_PIX_FMT_SAND64_16: -+ case AV_PIX_FMT_SAND64_10: -+ { -+ 
const unsigned int stripe_w = 128; // bytes -+ -+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; -+ static VC_IMAGE_T img = {0}; -+ -+ // Given the overhead of calling the mailbox keep a stashed -+ // copy as we will almost certainly just want the same numbers again -+ // but that means we need a lock -+ pthread_mutex_lock(&sand_lock); -+ -+ if (img.width != video_width || img.height != video_height) -+ { -+ VC_IMAGE_T new_img = { -+ .type = VC_IMAGE_YUV_UV_16, -+ .width = video_width, -+ .height = video_height -+ }; -+ -+ mbox_get_image_params(mbox_fd(), &new_img); -+ img = new_img; -+ } -+ -+ geo.stride_y = stripe_w; -+ geo.stride_c = stripe_w; -+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; -+ geo.height_c = img.pitch / stripe_w - geo.height_y; -+ geo.planes_c = 1; -+ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; -+ geo.bytes_per_pel = 2; -+ geo.stripe_is_yc = 1; -+ -+ pthread_mutex_unlock(&sand_lock); -+ break; -+ } -+ -+ default: -+ break; -+ } -+ return geo; -+} -+ -+//============================================================================ -+// -+// ZC Env fns -+// -+// Frame copy fns -+ -+static AVBufferRef * zc_copy(const AVZcEnvPtr zc, -+ const AVFrame * const src) -+{ -+ AVFrame dest_frame; -+ AVFrame * const dest = &dest_frame; -+ unsigned int i; -+ uint8_t * psrc, * pdest; -+ -+ dest->format = src->format; -+ dest->width = src->width; -+ dest->height = src->height; -+ -+ if (av_rpi_zc_get_buffer(zc, dest) != 0 || -+ av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0) -+ { -+ return NULL; -+ } -+ -+ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; -+ i != dest->height; -+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) -+ { -+ memcpy(pdest, psrc, dest->width); -+ } -+ for (i = 0, psrc = src->data[1], pdest = dest->data[1]; -+ i != dest->height / 2; -+ ++i, psrc += src->linesize[1], pdest += dest->linesize[1]) -+ { -+ memcpy(pdest, psrc, dest->width / 2); -+ } -+ for (i = 0, psrc = src->data[2], pdest = dest->data[2]; -+ i != dest->height / 2; -+ ++i, psrc += src->linesize[2], pdest += dest->linesize[2]) -+ { -+ memcpy(pdest, psrc, dest->width / 2); -+ } -+ -+ return dest->buf[0]; -+} -+ -+ -+static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc, -+ const AVFrame * const src) -+{ -+ assert(0); -+ return NULL; -+} -+ -+ -+static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc, -+ const AVFrame * const src, const unsigned int src_bits) -+{ -+ assert(0); -+ return NULL; -+} -+ -+//---------------------------------------------------------------------------- -+// -+// Public info extraction calls -+ -+static void zc_buf_env_free_cb(void * opaque, uint8_t * data); -+ -+static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf) -+{ -+ // Kludge where we check the free fn to check this is really -+ // one of our buffers - can't think of a better way -+ return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL : -+ av_buffer_get_opaque(buf); -+} -+ -+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf) -+{ -+ // As gmem is the first el NULL should be preserved -+ return &pic_zbe_ptr(buf)->gmem; -+} -+ -+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref) -+{ -+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); -+ return p == NULL ? -1 : p->vc_handle; -+} -+ -+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) -+{ -+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); -+ return zbe == NULL ? 
0 : zbe->offset; -+} -+ -+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) -+{ -+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); -+ return zbe == NULL ? 0 : zbe->size_pic; -+} -+ -+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) -+{ -+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); -+ return p == NULL ? 0 : p->numbytes; -+} -+ -+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref) -+{ -+ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); -+ return zbe == NULL ? NULL : &zbe->geo; -+} -+ -+AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc, -+ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) -+{ -+ av_assert0(!maycopy || zc != NULL); -+ -+ if (frame->format != AV_PIX_FMT_YUV420P && -+ frame->format != AV_PIX_FMT_YUV420P10 && -+ !av_rpi_is_sand_frame(frame)) -+ { -+ av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); -+ return NULL; -+ } -+ -+ if (frame->buf[1] != NULL || frame->format != expected_format) -+ { -+#if RPI_ZC_SAND_8_IN_10_BUF -+ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) -+ { -+// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); -+ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); -+ } -+#endif -+ -+ if (maycopy) -+ { -+ if (frame->buf[1] != NULL) -+ av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); -+ else -+ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); -+ -+ switch (frame->format) -+ { -+ case AV_PIX_FMT_YUV420P10: -+ return zc_420p10_to_sand128(zc, frame); -+ -+ case AV_PIX_FMT_SAND64_10: -+ return zc_sand64_16_to_sand128(zc, frame, 10); -+ -+ default: -+ return zc_copy(zc, frame); -+ } -+ } -+ else -+ { -+ if (frame->buf[1] != NULL) -+ av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); -+ else -+ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); -+ return NULL; -+ } -+ } -+ -+ if (pic_gm_ptr(frame->buf[0]) == NULL) -+ { -+ if (maycopy) -+ { -+ av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__); -+ return zc_copy(zc, frame); -+ } -+ else -+ { -+ av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__); -+ return NULL; -+ } -+ } -+ -+ return av_buffer_ref(frame->buf[0]); -+} -+ -+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref) -+{ -+ if (fr_ref != NULL) -+ { -+ av_buffer_unref(&fr_ref); -+ } -+} -+ -+//---------------------------------------------------------------------------- -+ -+// Extract user environment from an AVBufferRef -+void * av_rpi_zc_buf_v(AVBufferRef * const buf) -+{ -+ ZcBufEnv * const zbe = pic_zbe_ptr(buf); -+ if (zbe != NULL && zbe->user != NULL) -+ { -+ const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data; -+ return zub == NULL ? 
NULL : zub->v; -+ } -+ return NULL; -+} -+ -+// AV buffer pre-free callback -+static void zc_user_buf_free_cb(void * opaque, uint8_t * data) -+{ -+ if (opaque != NULL) -+ { -+ ZcUserBufEnv * const zub = opaque; -+ -+ if (zub->fn->free) -+ zub->fn->free(zub->v); -+ -+ av_free(zub); -+ } -+} -+ -+static void zc_buf_env_free_cb(void * opaque, uint8_t * data) -+{ -+ if (opaque != NULL) -+ { -+ ZcBufEnv * const zbe = opaque; -+ -+ av_buffer_unref(&zbe->user); -+ -+ if (zbe->zc != NULL) -+ av_rpi_zc_env_release(zbe->zc); -+ -+ pthread_cond_destroy(&zbe->cond); -+ pthread_mutex_destroy(&zbe->lock); -+ av_free(zbe); -+ } -+} -+ -+ -+// Wrap the various ZC bits in an AV Buffer and resolve those things we want -+// resolved now. -+// Currently we resolve everything, but in future we might not -+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab) -+{ -+ AVBufferRef *buf; -+ ZcUserBufEnv * zub; -+ -+ if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL) -+ return NULL; -+ -+ zub->fn = fn_tab; -+ zub->v = v; -+ zub->numbytes = numbytes; -+ zub->offset = addr_offset; -+ -+ if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n"); -+ av_free(zub); -+ return NULL; -+ } -+ -+ return buf; -+} -+ -+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode) -+{ -+ ZcBufEnv * const zbe = pic_zbe_ptr(buf); -+ -+ if (zbe == NULL) -+ return AVERROR(EINVAL); -+ -+ if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid) -+ return AVERROR(EAGAIN); -+ -+ if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid) -+ { -+ pthread_mutex_lock(&zbe->lock); -+ while (!zbe->is_valid) -+ pthread_cond_wait(&zbe->cond, &zbe->lock); -+ pthread_mutex_unlock(&zbe->lock); -+ } -+ -+ if (zbe->is_valid == ZC_BUF_NEVER) -+ return AVERROR(EINVAL); -+ -+ // Do alloc if we need it -+ if (zbe->user == NULL) -+ { -+ ZcEnv * const zc = zbe->zc; -+ const ZcUserBufEnv * zub; -+ -+ av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID); -+ -+ if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); -+ goto fail; -+ } -+ zub = (const ZcUserBufEnv *)zbe->user->data; -+ -+ // Track -+ -+ zbe->offset = zub->offset; -+ zbe->gmem.numbytes = zub->numbytes; -+ if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle); -+ goto fail; -+ } -+ -+ if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n"); -+ goto fail; -+ } -+ -+ if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle); -+ goto fail; -+ } -+ if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle); -+ goto fail; -+ } -+ -+ buf->buffer->data = zbe->gmem.arm + zbe->offset; -+ buf->buffer->size = zbe->size_pic; -+ -+ // In this mode we shouldn't have anyone waiting for us -+ // so no need to signal -+ if (alloc_mode == ZC_RESOLVE_ALLOC_VALID) -+ zbe->is_valid = 1; -+ } -+ -+ // Just overwrite - no point in testing -+ buf->data = zbe->gmem.arm + zbe->offset; -+ buf->size = zbe->size_pic; 
-+ return 0; -+ -+fail: -+ av_buffer_unref(&zbe->user); -+ return AVERROR(ENOMEM); -+} -+ -+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc) -+{ -+ int rv; -+ -+ // Do alloc if we need it -+ if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0) -+ return rv; -+ -+ // If we are a framebuf copy then the alloc can be done but we haven't -+ // imported its results yet -+ if (frame->data[0] == NULL) -+ { -+ const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); -+ -+ frame->linesize[0] = zbe->geo.stride_y; -+ frame->linesize[1] = zbe->geo.stride_c; -+ frame->linesize[2] = zbe->geo.stride_c; -+ // abuse: linesize[3] = "stripe stride" -+ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). -+ // In a general case this makes the calculation an xor and multiply rather -+ // than a divide and multiply -+ if (zbe->geo.stripes > 1) -+ frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y; -+ -+ frame->data[0] = frame->buf[0]->data; -+ frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? zbe->size_y : zbe->size_y * zbe->geo.stripes); -+ if (zbe->geo.planes_c > 1) -+ frame->data[2] = frame->data[1] + zbe->size_c; -+ -+ frame->extended_data = frame->data; -+ // Leave extended buf alone -+ } -+ -+ return 0; -+} -+ -+int av_rpi_zc_set_valid_frame(AVFrame * const frame) -+{ -+ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); -+ -+ if (zbe == NULL) -+ return AVERROR(EINVAL); -+ -+ zbe->is_valid = ZC_BUF_VALID; -+ pthread_cond_broadcast(&zbe->cond); -+ -+ return 0; -+} -+ -+int av_rpi_zc_set_broken_frame(AVFrame * const frame) -+{ -+ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); -+ -+ if (zbe == NULL) -+ return AVERROR(EINVAL); -+ -+ zbe->is_valid = ZC_BUF_NEVER; -+ pthread_cond_broadcast(&zbe->cond); -+ -+ return 0; -+} -+ -+void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size) -+{ -+ zc->pool_size = pool_size; -+} -+ -+unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc) -+{ -+ return zc->pool_size; -+} -+ -+int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame) -+{ -+#if 1 -+ ZcBufEnv * zbe = av_mallocz(sizeof(*zbe)); -+ -+ for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) { -+ frame->buf[i] = NULL; -+ frame->data[i] = NULL; -+ frame->linesize[i] = 0; -+ } -+ -+ if (zbe == NULL) -+ return AVERROR(ENOMEM); -+ -+ if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL) -+ { -+ av_free(zbe); -+ return AVERROR(ENOMEM); -+ } -+ -+ pthread_mutex_init(&zbe->lock, NULL); -+ pthread_cond_init(&zbe->cond, NULL); -+ zbe->zc = zc; -+ atomic_fetch_add(&zc->refcount, 1); -+ -+ zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); // Note geometry for later use -+ zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y; -+ zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c; -+ zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes; -+ -+#else -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); -+ const unsigned int size_y = geo.stride_y * geo.height_y; -+ const unsigned int size_c = geo.stride_c * geo.height_c; -+ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; -+ AVBufferRef * buf; -+ unsigned int i; -+ -+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic); -+ -+ if ((buf = zc->alloc_buf(zc->pool_env, 
size_pic, &geo)) == NULL) -+ { -+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); -+ return AVERROR(ENOMEM); -+ } -+ -+ // Track -+ atomic_fetch_add(&zc->refcount, 1); -+ pic_zbe_ptr(buf)->zc = zc; -+ -+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) { -+ frame->buf[i] = NULL; -+ frame->data[i] = NULL; -+ frame->linesize[i] = 0; -+ } -+ -+ frame->buf[0] = buf; -+ -+ frame->linesize[0] = geo.stride_y; -+ frame->linesize[1] = geo.stride_c; -+ frame->linesize[2] = geo.stride_c; -+ // abuse: linesize[3] = "stripe stride" -+ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). -+ // In a general case this makes the calculation an xor and multiply rather -+ // than a divide and multiply -+ if (geo.stripes > 1) -+ frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y; -+ -+ frame->data[0] = buf->data; -+ frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes); -+ if (geo.planes_c > 1) -+ frame->data[2] = frame->data[1] + size_c; -+ -+ frame->extended_data = frame->data; -+ // Leave extended buf alone -+ -+#if RPI_ZC_SAND_8_IN_10_BUF != 0 -+ // *** If we intend to use this for real we will want a 2nd buffer pool -+ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge -+#endif -+#endif -+ -+ return 0; -+} -+ -+void av_rpi_zc_env_release(const AVZcEnvPtr zc) -+{ -+ const int n = atomic_fetch_add(&zc->refcount, -1); -+ if (n == 1) // was 1, now 0 -+ { -+ zc->free_pool(zc->pool_env); -+ av_free(zc); -+ } -+} -+ -+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, -+ void * pool_env, -+ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, -+ av_rpi_zc_free_pool_fn_t * free_pool_fn) -+{ -+ ZcEnv * zc; -+ -+ if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL) -+ { -+ av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n"); -+ return NULL; -+ } -+ -+ *zc = (ZcEnv){ -+ .refcount = ATOMIC_VAR_INIT(1), -+ .pool_env = pool_env, -+ .alloc_buf = alloc_buf_fn, -+ .free_pool = free_pool_fn, -+ .pool_size = 0 -+ }; -+ -+ return zc; -+} -+ -+//============================================================================ -+// -+// External ZC initialisation -+ -+#define RPI_GET_BUFFER2 1 -+ -+ -+static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) -+{ -+#if !RPI_GET_BUFFER2 -+ return avcodec_default_get_buffer2(s, frame, flags); -+#else -+ int rv; -+ -+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) -+ { -+// printf("Do default alloc: format=%#x\n", frame->format); -+ rv = avcodec_default_get_buffer2(s, frame, flags); -+ } -+ else if (frame->format == AV_PIX_FMT_YUV420P || -+ av_rpi_is_sand_frame(frame)) -+ { -+ if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0) -+ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); -+ } -+ else -+ { -+ rv = avcodec_default_get_buffer2(s, frame, flags); -+ } -+ -+#if 0 -+ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, -+ frame->format, frame->width, frame->height, -+ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], -+ frame->data[0], frame->data[1], frame->data[2], -+ frame->buf[0], frame->buf[1], frame->buf[2], -+ av_buffer_get_opaque(frame->buf[0])); -+#endif -+ return rv; -+#endif -+} -+ -+int av_rpi_zc_in_use(const struct AVCodecContext * const s) -+{ -+ return s->get_buffer2 == zc_get_buffer2; -+} -+ -+int av_rpi_zc_init2(struct AVCodecContext * const s, -+ void * pool_env, 
-+                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
-+                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
-+{
-+    ZcEnv * zc;
-+
-+    av_assert0(!av_rpi_zc_in_use(s));
-+
-+    if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    zc->old = (ZcOldCtxVals){
-+        .opaque = s->opaque,
-+        .get_buffer2 = s->get_buffer2,
-+        .thread_safe_callbacks = s->thread_safe_callbacks
-+    };
-+
-+    s->opaque = zc;
-+    s->get_buffer2 = zc_get_buffer2;
-+    s->thread_safe_callbacks = 1;
-+    return 0;
-+}
-+
-+void av_rpi_zc_uninit2(struct AVCodecContext * const s)
-+{
-+    ZcEnv * const zc = s->opaque;
-+
-+    av_assert0(av_rpi_zc_in_use(s));
-+
-+    s->get_buffer2 = zc->old.get_buffer2;
-+    s->opaque = zc->old.opaque;
-+    s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
-+
-+    av_rpi_zc_env_release(zc);
-+}
-+
-diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
-new file mode 100644
-index 0000000000..efc8ad4160
---- /dev/null
-+++ b/libavcodec/rpi_zc.h
-@@ -0,0 +1,180 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#ifndef LIBAVCODEC_RPI_ZC_H
-+#define LIBAVCODEC_RPI_ZC_H
-+
-+// Zero-Copy frame code for RPi
-+// RPi needs Y/U/V planes to be contiguous for display. By default
-+// ffmpeg will allocate separated planes so a memcpy is needed before
-+// display. This code provides a method of making ffmpeg allocate a single
-+// bit of memory for the frame which can then be reference counted until
-+// display has finished with it.
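As an illustration of how this is intended to be wired up, here is a minimal sketch of a caller installing the zero-copy allocator on a codec context via av_rpi_zc_init2() (declared further down this header). The my_pool_t type and its helpers are hypothetical placeholders for whatever GPU-contiguous pool the caller owns; only the av_rpi_zc_* calls and the callback typedefs come from this patch.

    // Hypothetical pool callbacks matching av_rpi_zc_alloc_buf_fn_t /
    // av_rpi_zc_free_pool_fn_t. my_pool_t, my_pool_get_buf() and
    // my_pool_delete() are assumed helpers, not part of this patch.
    static AVBufferRef * my_pool_alloc(void * pool_env, size_t size,
                                       const AVRpiZcFrameGeometry * geo)
    {
        my_pool_t * const pool = pool_env;
        // Expected to return GPU-contiguous memory wrapped via av_rpi_zc_buf()
        return my_pool_get_buf(pool, size, geo);
    }

    static void my_pool_free(void * pool_env)
    {
        my_pool_delete((my_pool_t *)pool_env);
    }

    // At codec setup time:
    //   if (av_rpi_zc_init2(avctx, pool, my_pool_alloc, my_pool_free) != 0)
    //       goto fail;
    //   ...decode; each frame now arrives in one contiguous refcounted buffer...
    //   av_rpi_zc_uninit2(avctx);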
-+
-+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
-+// 0 disables
-+// *** This option still in development
-+//     Only works if SAO active
-+//     Allocates buffers that are twice the required size
-+#define RPI_ZC_SAND_8_IN_10_BUF 0
-+
-+struct AVBufferRef;
-+struct AVFrame;
-+struct AVCodecContext;
-+enum AVPixelFormat;
-+
-+// "Opaque" pointer to whatever we are using as a buffer reference
-+typedef struct AVBufferRef * AVRpiZcRefPtr;
-+
-+struct AVZcEnv;
-+typedef struct AVZcEnv * AVZcEnvPtr;
-+
-+typedef struct AVRpiZcFrameGeometry
-+{
-+    unsigned int stride_y;  // Luma stride (bytes)
-+    unsigned int height_y;  // Luma height (lines)
-+    unsigned int stride_c;  // Chroma stride (bytes)
-+    unsigned int height_c;  // Chroma height (lines)
-+    unsigned int planes_c;  // Chroma plane count (U, V = 2, interleaved = 1)
-+    unsigned int stripes;   // Number of stripes (sand)
-+    unsigned int bytes_per_pel;
-+    int stripe_is_yc;       // A single stripe is Y then C (false for tall sand)
-+
-+    int format;                 // Requested format
-+    unsigned int video_width;   // Requested width
-+    unsigned int video_height;  // Requested height
-+} AVRpiZcFrameGeometry;
-+
-+
-+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
-+    const int format,
-+    const unsigned int video_width, const unsigned int video_height);
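The geometry alone determines how large an allocation the pool callback will be asked for. A small sketch of that arithmetic, mirroring what av_rpi_zc_get_buffer() computes in rpi_zc.c (the helper name is illustrative, not part of the patch):

    // Picture byte size implied by a geometry: per stripe, the luma block is
    // followed by planes_c chroma blocks, and there are 'stripes' stripes.
    static size_t zc_geometry_picsize(const AVRpiZcFrameGeometry * const geo)
    {
        const size_t size_y = (size_t)geo->stride_y * geo->height_y;
        const size_t size_c = (size_t)geo->stride_c * geo->height_c;
        return (size_y + size_c * geo->planes_c) * geo->stripes;
    }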
-+
-+// Replacement fn for avctx->get_buffer2
-+// Should be set before calling avcodec_open2
-+//
-+// N.B. in addition to setting avctx->get_buffer2, avctx->refcounted_frames
-+// must be set to 1 as otherwise the buffer info is killed before being returned
-+// by avcodec_decode_video2. Note also that this means that the AVFrame that is
-+// returned must be manually derefed with av_frame_unref. This should be done
-+// after av_rpi_zc_ref has been called.
-+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
-+
-+// Generate a ZC reference to the buffer(s) in this frame
-+// If the buffer doesn't appear to be one allocated by ZC
-+// then the behaviour depends on maycopy:
-+//   If maycopy=0 then return NULL
-+//   If maycopy=1 && the src frame is in a form where we can easily copy
-+//     the data, then allocate a new buffer and copy the data into it
-+//   Otherwise return NULL
-+// If maycopy == 0 then zc may be NULL
-+AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc,
-+    const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
-+
-+// Get the vc_handle from the frame ref
-+// Returns -1 if ref doesn't look valid
-+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
-+// Get offset from the start of the memory referenced
-+// by the vc_handle to valid data
-+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
-+// Length of buffer data
-+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
-+// Get the number of bytes allocated from the frame ref
-+// Returns 0 if ref doesn't look valid
-+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
-+// Geometry this frame was allocated with
-+const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref);
-+
-+// Unreference the buffer refed/allocated by _zc_ref
-+// If fr_ref is NULL then this will NOP
-+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
-+
-+// Test to see if the context is using zc (checks get_buffer2)
-+int av_rpi_zc_in_use(const struct AVCodecContext * const s);
-+
-+// Init ZC into a context
-+// There is nothing magic in this fn - it just packages setting
-+// get_buffer2 & get_buffer_context
-+
-+typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size,
-+                                               const AVRpiZcFrameGeometry * geo);
-+typedef void av_rpi_zc_free_pool_fn_t(void * pool_env);
-+
-+int av_rpi_zc_init2(struct AVCodecContext * const s,
-+                    void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
-+                    av_rpi_zc_free_pool_fn_t * free_pool_fn);
-+
-+// Free ZC from a context
-+// There is nothing magic in this fn - it just packages unsetting
-+// get_buffer2 & get_buffer_context
-+void av_rpi_zc_uninit2(struct AVCodecContext * const s);
-+
-+void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc);
-+AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx);
-+void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size);
-+unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc);
-+
-+// Get buffer generates placeholders for later alloc
-+int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame);
-+// Resolve actually does the alloc (noop if already alloced)
-+// Set data pointers on a buffer/frame that was copied before the alloc
-+// occurred
-+#define ZC_RESOLVE_FAIL        0  // return error on invalid
-+#define ZC_RESOLVE_ALLOC       1  // alloc as invalid
-+#define ZC_RESOLVE_WAIT_VALID  2  // wait for valid
-+#define ZC_RESOLVE_ALLOC_VALID 3  // alloc as valid
-+int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc);
-+int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc);
-+
-+int av_rpi_zc_set_valid_frame(AVFrame * const frame);
-+int av_rpi_zc_set_broken_frame(AVFrame * const frame);
-+
-+
-+typedef struct av_rpi_zc_buf_fn_tab_s {
-+    void (* free)(void * v);
-+
-+    unsigned int (* vcsm_handle)(void * v);
-+    unsigned int (* vc_handle)(void * v);
-+    void * (* map_arm)(void * v);
-+    unsigned int (* map_vc)(void * v);
-+}
av_rpi_zc_buf_fn_tab_t; -+ -+AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab); -+void * av_rpi_zc_buf_v(AVBufferRef * const buf); -+ -+ -+AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, -+ void * pool_env, -+ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, -+ av_rpi_zc_free_pool_fn_t * free_pool_fn); -+void av_rpi_zc_env_release(const AVZcEnvPtr zc); -+ -+ -+#endif -+ -diff --git a/libavcodec/rpi_zc_frames.h b/libavcodec/rpi_zc_frames.h -new file mode 100644 -index 0000000000..990cffa21a ---- /dev/null -+++ b/libavcodec/rpi_zc_frames.h -@@ -0,0 +1,142 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+
-+Authors: John Cox, Ben Avison
-+*/
-+
-+#ifndef RPI_ZC_FRAMES_H
-+#define RPI_ZC_FRAMES_H
-+
-+#define RPI_ONE_BUF 1
-+
-+#include "rpi_mem.h"  // for GPU_MEM_PTR_T
-+#include "libavutil/frame.h"
-+
-+#if !RPI_ONE_BUF
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
-+    return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+    return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+    GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
-+    return p->vc;
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+    return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
-+}
-+
-+#else
-+
-+static inline int gpu_is_buf1(const AVFrame * const frame)
-+{
-+    return frame->buf[1] == NULL;
-+}
-+
-+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
-+{
-+    return av_buffer_get_opaque(frame->buf[0]);
-+}
-+
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
-+{
-+    return av_buffer_pool_opaque(frame->buf[n]);
-+}
-+
-+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
-+{
-+    const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
-+    return gm->vc + (frame->data[n] - gm->arm);
-+}
-+
-+
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+    return get_vc_address3(frame, 0);
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+    return get_vc_address3(frame, 1);
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+    return get_vc_address3(frame, 2);
-+}
-+
-+#if 0
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+    if (gpu_is_buf1(frame))
-+    {
-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+        g.numbytes = frame->data[1] - frame->data[0];
-+        return g;
-+    }
-+    else
-+        return *gpu_buf3_gmem(frame, 0);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+    if (gpu_is_buf1(frame))
-+    {
-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+        g.arm += frame->data[1] - frame->data[0];
-+        g.vc += frame->data[1] - frame->data[0];
-+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
-+        return g;
-+    }
-+    else
-+        return *gpu_buf3_gmem(frame, 1);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+    if (gpu_is_buf1(frame))
-+    {
-+        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+        g.arm += frame->data[2] - frame->data[0];
-+        g.vc += frame->data[2] - frame->data[0];
-+        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
-+        return g;
-+    }
-+    else
-+        return *gpu_buf3_gmem(frame, 2);
-+}
-+#endif
-+#endif
-+
-+#endif
-diff --git a/libavcodec/rpivid_hevc.c b/libavcodec/rpivid_hevc.c
-new file mode 100644
-index 0000000000..1e33468dbc
---- /dev/null
-+++ b/libavcodec/rpivid_hevc.c
-@@ -0,0 +1,2032 @@
-+// FFMPEG HEVC decoder hardware accelerator
-+// Andrew Holme, Argon Design Ltd
-+// Copyright (c) June 2017 Raspberry Pi Ltd
-+
-+#include <stdio.h>
-+#include <fcntl.h>
-+#include <unistd.h>
-+#include <pthread.h>
-+#include <semaphore.h>
-+#include <sys/mman.h>
-+
-+#include "fftools/ffmpeg.h"
-+#include "libavutil/avassert.h"
-+#include "libavutil/imgutils.h"
-+#include "avcodec.h"
-+#include "hwaccel.h"
-+#include "decode.h"
-+
-+#include "hevc.h"
-+#include "hevcdec.h"
-+#include "rpi_zc.h"
-+#include "rpi_mem.h"
-+#include "rpi_zc_frames.h"
-+#include "rpi_mailbox.h"
-+
-+
-+#define OPT_PHASE_TIMING 0  // Generate stats for phase usage
-+
-+#define NUM_SCALING_FACTORS 4064
-+
-+#define AXI_BASE64 0
-+
-+#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0))
-+#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6))
-+
-+#define RPIVID_COL_PICS 17               // 16 ref & current
-+
-+#define RPIVID_BITBUFS 2                 // Bit + Cmd bufs (phase 0 & 1)
-+#define RPIVID_BITBUF_SIZE (4 << 20)     // Bit + Cmd buf size
-+
-+#define RPIVID_COEFFBUFS 3               // PU + Coeff bufs (phase 1 & 2)
-+#define RPIVID_COEFFBUF_SIZE (16 << 20)  // PU + Coeff buf size
-+
-+//////////////////////////////////////////////////////////////////////////////
-+//
-+// Register offsets
-+
-+#define RPI_SPS0         0
-+#define RPI_SPS1         4
-+#define RPI_PPS          8
-+#define RPI_SLICE        12
-+#define RPI_TILESTART    16
-+#define RPI_TILEEND      20
-+#define RPI_SLICESTART   24
-+#define RPI_MODE         28
-+#define RPI_LEFT0        32
-+#define RPI_LEFT1        36
-+#define RPI_LEFT2        40
-+#define RPI_LEFT3        44
-+#define RPI_QP           48
-+#define RPI_CONTROL      52
-+#define RPI_STATUS       56
-+#define RPI_VERSION      60
-+#define RPI_BFBASE       64
-+#define RPI_BFNUM        68
-+#define RPI_BFCONTROL    72
-+#define RPI_BFSTATUS     76
-+#define RPI_PUWBASE      80
-+#define RPI_PUWSTRIDE    84
-+#define RPI_COEFFWBASE   88
-+#define RPI_COEFFWSTRIDE 92
-+#define RPI_SLICECMDS    96
-+#define RPI_BEGINTILEEND 100
-+#define RPI_TRANSFER     104
-+#define RPI_CFBASE       108
-+#define RPI_CFNUM        112
-+#define RPI_CFSTATUS     116
-+
-+#define RPI_PURBASE      0x8000
-+#define RPI_PURSTRIDE    0x8004
-+#define RPI_COEFFRBASE   0x8008
-+#define RPI_COEFFRSTRIDE 0x800C
-+#define RPI_NUMROWS      0x8010
-+#define RPI_CONFIG2      0x8014
-+#define RPI_OUTYBASE     0x8018
-+#define RPI_OUTYSTRIDE   0x801C
-+#define RPI_OUTCBASE     0x8020
-+#define RPI_OUTCSTRIDE   0x8024
-+#define RPI_STATUS2      0x8028
-+#define RPI_FRAMESIZE    0x802C
-+#define RPI_MVBASE       0x8030
-+#define RPI_MVSTRIDE     0x8034
-+#define RPI_COLBASE      0x8038
-+#define RPI_COLSTRIDE    0x803C
-+#define RPI_CURRPOC      0x8040
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+// Unused but left here to illustrate the differences between FFmpeg's prob
-+// structure and the rpivid one
-+
-+struct FFM_PROB {
-+    uint8_t sao_merge_flag                  [ 1];
-+    uint8_t sao_type_idx                    [ 1];
-+    uint8_t split_coding_unit_flag          [ 3];
-+    uint8_t cu_transquant_bypass_flag       [ 1];
-+    uint8_t skip_flag                       [ 3];
-+    uint8_t cu_qp_delta                     [ 3];
-+    uint8_t pred_mode_flag                  [ 1];
-+    uint8_t part_mode                       [ 4];
-+    uint8_t prev_intra_luma_pred_flag       [ 1];
-+    uint8_t intra_chroma_pred_mode          [ 2];
-+    uint8_t merge_flag                      [ 1];
-+    uint8_t merge_idx                       [ 1];
-+    uint8_t inter_pred_idc                  [ 5];
-+    uint8_t ref_idx_l0                      [ 2];
-+    uint8_t ref_idx_l1                      [ 2];
-+    uint8_t abs_mvd_greater0_flag           [ 2];
-+    uint8_t abs_mvd_greater1_flag           [ 2];
-+    uint8_t mvp_lx_flag                     [ 1];
-+    uint8_t no_residual_data_flag           [ 1];
-+    uint8_t split_transform_flag            [ 3];
-+    uint8_t cbf_luma                        [ 2];
-+    uint8_t cbf_cb_cr                       [ 4];
-+    uint8_t transform_skip_flag/*[][]*/     [ 2];
-+    uint8_t explicit_rdpcm_flag/*[][]*/     [ 2];
-+    uint8_t explicit_rdpcm_dir_flag/*[][]*/ [ 2];
-+    uint8_t last_significant_coeff_x_prefix [18];
-+    uint8_t last_significant_coeff_y_prefix [18];
-+    uint8_t significant_coeff_group_flag    [ 4];
-+    uint8_t significant_coeff_flag          [44];
-+    uint8_t coeff_abs_level_greater1_flag   [24];
-+    uint8_t coeff_abs_level_greater2_flag   [ 6];
-+    uint8_t log2_res_scale_abs              [ 8];
-+    uint8_t res_scale_sign_flag             [ 2];
-+    uint8_t cu_chroma_qp_offset_flag        [ 1];
-+    uint8_t cu_chroma_qp_offset_idx         [ 1];
-+} __attribute__((packed));
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+struct RPI_PROB {
-+    uint8_t SAO_MERGE_FLAG            [ 1];
-+    uint8_t SAO_TYPE_IDX              [ 1];
-+    uint8_t SPLIT_FLAG                [ 3];
-+    uint8_t CU_SKIP_FLAG              [ 3];
-+    uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1];
-+    uint8_t PRED_MODE                 [ 1];
-+    uint8_t PART_SIZE                 [ 4];
-+    uint8_t INTRA_PRED_MODE           [ 1];
-+    uint8_t CHROMA_PRED_MODE          [ 1];
-+    uint8_t MERGE_FLAG_EXT            [ 1];
-+    uint8_t MERGE_IDX_EXT             [ 1];
-+    uint8_t INTER_DIR                 [ 5];
-+    uint8_t REF_PIC                   [ 2];
-+    uint8_t MVP_IDX                   [ 1];
-+    uint8_t MVD                       [ 2];
-+    uint8_t QT_ROOT_CBF               [ 1];
-+    uint8_t TRANS_SUBDIV_FLAG         [ 3];
-+    uint8_t QT_CBF                    [ 6];
-+    uint8_t DQP                       [ 2];
-+    uint8_t ONE_FLAG                  [24];
-+    uint8_t LASTX                     [18];
-+    uint8_t LASTY                     [18];
-+    uint8_t SIG_CG_FLAG               [ 4];
-+    uint8_t ABS_FLAG                  [ 6];
-+    uint8_t TRANSFORMSKIP_FLAG        [ 2];
-+    uint8_t SIG_FLAG                  [42];
-+    uint8_t SIG_FLAG_unused           [ 2];
-+} __attribute__((packed));
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+struct RPI_CMD {
-+    uint32_t addr;
-+    uint32_t data;
-+} __attribute__((packed));
-+
-+struct RPI_BIT {
-+    int cmd;
-+    const void *ptr;
-+    int len;
-+};
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+struct RPI_T;
-+
-+// Actual addressability is 38 bits but we can only alloc in the bottom 32
-+// currently - when passed to rpivid h/w the address is always >> 6 so will
-+// fit in 32 bits there
-+// At some point we may want to make this uint64_t
-+typedef uint32_t vid_vc_addr_t;
-+
-+typedef enum rpivid_decode_state_e {
-+    RPIVID_DECODE_NEW = 0,
-+    RPIVID_DECODE_START,
-+    RPIVID_DECODE_SLICE,
-+    RPIVID_DECODE_END,
-+} rpivid_decode_state_t;
-+
-+#define RPI_PROB_VALS 154U
-+#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
-+
-+typedef struct dec_env_s {
-+    const AVCodecContext * avctx;
-+
-+    rpivid_decode_state_t state;
-+    unsigned int decode_order;
-+
-+    int phase_no;  // Current phase (i.e. the last one we waited for)
-+    struct dec_env_s * phase_wait_q_next;
-+    sem_t phase_wait;
-+
-+    struct RPI_BIT *bit_fifo;
-+    struct RPI_CMD *cmd_fifo;
-+    unsigned int bit_len, bit_max;
-+    unsigned int cmd_len, cmd_max;
-+    unsigned int num_slice_msgs;
-+    unsigned int PicWidthInCtbsY;
-+    unsigned int PicHeightInCtbsY;
-+    unsigned int dpbno_col;
-+    uint32_t reg_slicestart;
-+    unsigned int wpp_entry_x;
-+    unsigned int wpp_entry_y;
-+    uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3];
-+    uint8_t scaling_factors[NUM_SCALING_FACTORS];
-+//    unsigned int RefPicList[2][HEVC_MAX_REFS];
-+} dec_env_t;
-+
-+#define RPIVID_PHASES 3
-+#define RPIVID_PHASE_NEW (RPIVID_PHASES)  // Phase before we have incremented decode order
-+#define RPIVID_PHASE_START (-1)           // Phase after we have incremented decode_order
-+
-+#if OPT_PHASE_TIMING
-+static const unsigned int time_thresholds[8] = {
-+    10, 15, 20, 30, 45, 60, 75, 90
-+};
-+#endif
-+
-+typedef struct phase_wait_env_s {
-+    unsigned int last_order;
-+    dec_env_t * q;
-+#if OPT_PHASE_TIMING
-+    uint64_t phase_time;
-+    uint64_t max_phase_time;
-+    uint64_t time_in_phase;
-+    uint64_t time_out_phase;
-+    unsigned int max_time_decode_order;
-+    unsigned int time_bins[9];
-+    unsigned int time_bins3[9];
-+    unsigned int time_bins5[9];
-+    uint64_t time_stash[16];
-+    unsigned int i3;
-+#endif
-+} phase_wait_env_t;  // Single linked list of threads waiting for this phase
-+
-+typedef struct RPI_T {
-+    atomic_int ref_count;
-+    sem_t ref_zero;
-+
-+    dec_env_t ** dec_envs;
-+    AVZcEnvPtr zc;
-+
-+    pthread_mutex_t phase_lock;
-+    phase_wait_env_t phase_reqs[RPIVID_PHASES];
-+
-+    volatile uint32_t * regs;
-+    volatile uint32_t * ints;
-+
-+    GPU_MEM_PTR_T gcolbuf;
-+    unsigned int col_stride;
-+    size_t col_picsize;
-+
-+    unsigned int bitbuf_no;
-+    sem_t bitbuf_sem;
-+    GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS];
-+
-+    unsigned int max_pu_msgs;
-+    unsigned int coeffbuf_no;
-+    sem_t coeffbuf_sem;
-+    GPU_MEM_PTR_T gcoeffbufs[RPIVID_COEFFBUFS];
-+
-+    unsigned int decode_order;
-+    int mbox_fd;
-+    int gpu_init_type;
-+} RPI_T;
-+
-+#if OPT_PHASE_TIMING
-+static uint64_t tus64(void)
-+{
-+    struct timespec ts;
-+    clock_gettime(CLOCK_MONOTONIC, &ts);
-+    return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
-+}
-+#endif
-+
-+static inline unsigned int rnd64(unsigned int x)
-+{
-+    return (x + 63) & ~63;
-+}
-+
-+static inline int rpi_sem_wait(sem_t * const sem)
-+{
-+    int rv;
-+    while ((rv = sem_wait(sem)) != 0 && errno == EINTR)
-+        /* Loop */;
-+    return rv;
-+}
-+
-+//============================================================================
-+
-+#define TRACE_DEV 0
-+#define TRACE_ENTRY 0
-+
-+#define REGS_NAME "/dev/rpivid-hevcmem"
-+#define REGS_SIZE 0x10000
-+#define INTS_NAME "/dev/rpivid-intcmem"
-+#define INTS_SIZE 0x10000  // 4 is probably enough but we are going to alloc a page anyway
-+
-+static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size)
-+{
-+    void *gpio_map;
-+    int mem_fd;
-+
-+    /* open the register device */
-+    if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) {
-+        av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name);
-+        return NULL;
-+    }
-+
-+    // Now map it
-+    gpio_map = mmap(
-+        NULL,
-+        size,
-+        PROT_READ|PROT_WRITE,
-+        MAP_SHARED,
-+        mem_fd,
-+        0
-+    );
-+
-+    close(mem_fd);  // No longer need the FD
-+
-+    if (gpio_map == MAP_FAILED) {
-+        av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed\n");
-+        return NULL;
-+    }
-+
-+    return (volatile uint32_t *)gpio_map;
-+}
-+
-+static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size)
-+{
-+    volatile uint32_t * const gpio_map = *p_gpio_map;
-+    if (gpio_map != NULL) {
-+        *p_gpio_map = NULL;
-+        munmap((void *)gpio_map, size);
-+    }
-+}
-+
-+#define MANGLE(x) ((x) &~0xc0000000)  // ** If x is ever a 64 bit thing this will need fixing!
-+#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6)
-+
-+static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data)
-+{
-+    rpi->regs[addr >> 2] = MANGLE64(data);
-+}
-+
-+static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data)
-+{
-+    rpi->regs[addr >> 2] = data >> 6;  // ?? rnd64 - but not currently needed
-+}
-+
-+static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data)
-+{
-+#if TRACE_DEV
-+    printf("W %x %08x\n", addr, data);
-+#endif
-+
-+    rpi->regs[addr >> 2] = data;
-+}
-+
-+static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr)
-+{
-+    const uint32_t v = rpi->regs[addr >> 2];
-+#if TRACE_DEV
-+    printf("R %x (=%x)\n", addr, v);
-+#endif
-+    return v;
-+}
-+
-+#define ARG_IC_ICTRL_ACTIVE1_INT_SET    0x00000001
-+#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET   0x00000002
-+#define ARG_IC_ICTRL_ACTIVE1_EN_SET     0x00000004
-+#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008
-+#define ARG_IC_ICTRL_ACTIVE2_INT_SET    0x00000010
-+#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET   0x00000020
-+#define ARG_IC_ICTRL_ACTIVE2_EN_SET     0x00000040
-+#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080
-+
-+static inline void int_wait(const RPI_T * const rpi, const unsigned int phase)
-+{
-+    const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET;
-+    const uint32_t mask_done = phase == 1 ? ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET;
-+    uint32_t ival;
-+    while (((ival = rpi->ints[0]) & mask_done) == 0) {
-+        usleep(1000);
-+    }
-+    rpi->ints[0] = ival & mask_reset;
-+}
-+
-+#if TRACE_DEV
-+static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) {
-+    int i;
-+
-+    for (i=0; i<num; i++)
-+    {
-+        if ((i%4)==0)
-+            printf("%08x: ", addr + 4*i);
-+
-+        printf("%08x", rpi->regs[(addr>>2)+i]);
-+
-+        if ((i%4)==3 || i+1 == num)
-+            printf("\n");
-+        else
-+            printf(" ");
-+    }
-+}
-+
-+static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) {
-+    int i;
-+
-+    for (i=0; i<size>>2; i++)
-+    {
-+        if ((i%4)==0)
-+            printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i);
-+
-+        printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]);
-+
-+        if ((i%4)==3 || i+1 == size>>2)
-+            printf("\n");
-+        else
-+            printf(" ");
-+    }
-+}
-+#endif
-+
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Scaling factors
-+
-+static void expand_scaling_list(
-+    const unsigned int sizeID,
-+    const unsigned int matrixID,
-+    uint8_t * const dst0,
-+    const uint8_t * const src0,
-+    uint8_t dc)
-+{
-+    switch (sizeID) {
-+        case 0:
-+            memcpy(dst0, src0, 16);
-+            break;
-+        case 1:
-+            memcpy(dst0, src0, 64);
-+            break;
-+        case 2:
-+        {
-+            uint8_t * d = dst0;
-+            for (unsigned int y=0; y != 16; y++) {
-+                const uint8_t * s = src0 + (y >> 1) * 8;
-+                for (unsigned int x = 0; x != 8; ++x) {
-+                    *d++ = *s;
-+                    *d++ = *s++;
-+                }
-+            }
-+            dst0[0] = dc;
-+            break;
-+        }
-+        default:
-+        {
-+            uint8_t * d = dst0;
-+            for (unsigned int y=0; y != 32; y++) {
-+                const uint8_t * s = src0 + (y >> 2) * 8;
-+                for (unsigned int x = 0; x != 8; ++x) {
-+                    *d++ = *s;
-+                    *d++ = *s;
-+                    *d++ = *s;
-+                    *d++ = *s++;
-+                }
-+            }
-+            dst0[0] = dc;
-+            break;
-+        }
-+    }
-+}
-+
-+static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) {
-+    // Array of constants for scaling
factors -+ static const uint32_t scaling_factor_offsets[4][6] = { -+ // MID0 MID1 MID2 MID3 MID4 MID5 -+ {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050}, // SID0 (4x4) -+ {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0}, // SID1 (8x8) -+ {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0}, // SID2 (16x16) -+ {0x07E0, 0, 0, 0x0BE0, 0, 0}}; // SID3 (32x32) -+ -+ // ffmpeg places SID3,MID1 where matrixID 3 normally is -+ const ScalingList * const sl = -+ s->ps.pps->scaling_list_data_present_flag ? &s->ps.pps->scaling_list -+ : &s->ps.sps->scaling_list; -+ unsigned int mid; -+ -+ for (mid=0; mid<6; mid++) -+ expand_scaling_list(0, mid, -+ de->scaling_factors + scaling_factor_offsets[0][mid], -+ sl->sl[0][mid], 0); -+ for (mid=0; mid<6; mid++) -+ expand_scaling_list(1, mid, -+ de->scaling_factors + scaling_factor_offsets[1][mid], -+ sl->sl[1][mid], 0); -+ for (mid=0; mid<6; mid++) -+ expand_scaling_list(2, mid, -+ de->scaling_factors + scaling_factor_offsets[2][mid], -+ sl->sl[2][mid], -+ sl->sl_dc[0][mid]); -+ // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg -+ for (mid=0; mid<6; mid += 3) -+ expand_scaling_list(3, mid, -+ de->scaling_factors + scaling_factor_offsets[3][mid], -+ sl->sl[3][mid], -+ sl->sl_dc[1][mid]); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Probabilities -+ -+static const uint8_t prob_init[3][156] = { -+ { -+ 153, 200, 139, 141, 157, 154, 154, 154, -+ 154, 154, 184, 154, 154, 154, 184, 63, -+ 154, 154, 154, 154, 154, 154, 154, 154, -+ 154, 154, 154, 154, 154, 153, 138, 138, -+ 111, 141, 94, 138, 182, 154, 154, 154, -+ 140, 92, 137, 138, 140, 152, 138, 139, -+ 153, 74, 149, 92, 139, 107, 122, 152, -+ 140, 179, 166, 182, 140, 227, 122, 197, -+ 110, 110, 124, 125, 140, 153, 125, 127, -+ 140, 109, 111, 143, 127, 111, 79, 108, -+ 123, 63, 110, 110, 124, 125, 140, 153, -+ 125, 127, 140, 109, 111, 143, 127, 111, -+ 79, 108, 123, 63, 91, 171, 134, 141, -+ 138, 153, 136, 167, 152, 152, 139, 139, -+ 111, 111, 125, 110, 110, 94, 124, 108, -+ 124, 107, 125, 141, 179, 153, 125, 107, -+ 125, 141, 179, 153, 125, 107, 125, 141, -+ 179, 153, 125, 140, 139, 182, 182, 152, -+ 136, 152, 136, 153, 136, 139, 111, 136, -+ 139, 111, 0, 0, }, -+ { -+ 153, 185, 107, 139, 126, 197, 185, 201, -+ 154, 149, 154, 139, 154, 154, 154, 152, -+ 110, 122, 95, 79, 63, 31, 31, 153, -+ 153, 168, 140, 198, 79, 124, 138, 94, -+ 153, 111, 149, 107, 167, 154, 154, 154, -+ 154, 196, 196, 167, 154, 152, 167, 182, -+ 182, 134, 149, 136, 153, 121, 136, 137, -+ 169, 194, 166, 167, 154, 167, 137, 182, -+ 125, 110, 94, 110, 95, 79, 125, 111, -+ 110, 78, 110, 111, 111, 95, 94, 108, -+ 123, 108, 125, 110, 94, 110, 95, 79, -+ 125, 111, 110, 78, 110, 111, 111, 95, -+ 94, 108, 123, 108, 121, 140, 61, 154, -+ 107, 167, 91, 122, 107, 167, 139, 139, -+ 155, 154, 139, 153, 139, 123, 123, 63, -+ 153, 166, 183, 140, 136, 153, 154, 166, -+ 183, 140, 136, 153, 154, 166, 183, 140, -+ 136, 153, 154, 170, 153, 123, 123, 107, -+ 121, 107, 121, 167, 151, 183, 140, 151, -+ 183, 140, 0, 0, }, -+ { -+ 153, 160, 107, 139, 126, 197, 185, 201, -+ 154, 134, 154, 139, 154, 154, 183, 152, -+ 154, 137, 95, 79, 63, 31, 31, 153, -+ 153, 168, 169, 198, 79, 224, 167, 122, -+ 153, 111, 149, 92, 167, 154, 154, 154, -+ 154, 196, 167, 167, 154, 152, 167, 182, -+ 182, 134, 149, 136, 153, 121, 136, 122, -+ 169, 208, 166, 167, 154, 152, 167, 182, -+ 125, 110, 124, 110, 95, 94, 125, 111, -+ 111, 79, 125, 126, 111, 111, 79, 108, -+ 123, 93, 125, 110, 124, 110, 95, 94, -+ 125, 111, 111, 79, 125, 
126, 111, 111,
-+        79, 108, 123, 93, 121, 140, 61, 154,
-+        107, 167, 91, 107, 107, 167, 139, 139,
-+        170, 154, 139, 153, 139, 123, 123, 63,
-+        124, 166, 183, 140, 136, 153, 154, 166,
-+        183, 140, 136, 153, 154, 166, 183, 140,
-+        136, 153, 154, 170, 153, 138, 138, 122,
-+        121, 122, 121, 167, 151, 183, 140, 151,
-+        183, 140, 0, 0, },
-+};
-+
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Phase 1 command and bit FIFOs
-+
-+// ???? uint16_t addr - put in uint32_t
-+static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) {
-+    if (de->cmd_len==de->cmd_max)
-+        av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD)));
-+    de->cmd_fifo[de->cmd_len].addr = addr;
-+    de->cmd_fifo[de->cmd_len].data = data;
-+    return de->cmd_len++;
-+}
-+
-+static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) {
-+    if (de->bit_len==de->bit_max)
-+        av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT)));
-+    de->bit_fifo[de->bit_len].cmd = cmd_idx;
-+    de->bit_fifo[de->bit_len].ptr = ptr;
-+    de->bit_fifo[de->bit_len].len = len;
-+    de->bit_len++;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Write probability and scaling factor memories
-+
-+#if 0
-+static void WriteProb(dec_env_t * const de) {
-+    int i;
-+    const uint8_t *p = (uint8_t *) &de->probabilities;
-+    for (i=0; i<sizeof(struct RPI_PROB); i+=4, p+=4)
-+        p1_apb_write(de, 0x1000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
-+}
-+#endif
-+
-+static void WriteProb(dec_env_t * const de, const HEVCContext * const s) {
-+    uint8_t dst[RPI_PROB_ARRAY_SIZE];
-+
-+    const unsigned int init_type = (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ?
-+        s->sh.slice_type + 1 : 2 - s->sh.slice_type;
-+    const uint8_t * p = prob_init[init_type];
-+    const int q = av_clip(s->sh.slice_qp, 0, 51);
-+    unsigned int i;
-+
-+    for (i = 0; i < RPI_PROB_VALS; i++) {
-+        int init_value = p[i];
-+        int m = (init_value >> 4) * 5 - 45;
-+        int n = ((init_value & 15) << 3) - 16;
-+        int pre = 2 * (((m * q) >> 4) + n) - 127;
-+
-+        pre ^= pre >> 31;
-+        if (pre > 124)
-+            pre = 124 + (pre & 1);
-+        dst[i] = pre;
-+    }
-+    for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) {
-+        dst[i] = 0;
-+    }
-+
-+    for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4)
-+        p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24));
-+
-+}
-+
-+
-+static void WriteScalingFactors(dec_env_t * const de) {
-+    int i;
-+    const uint8_t *p = (uint8_t *) de->scaling_factors;
-+    for (i=0; i<NUM_SCALING_FACTORS; i+=4, p+=4)
-+        p1_apb_write(de, 0x2000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+static int ctb_to_tile (unsigned int ctb, unsigned int *bd, int num) {
-+    int i;
-+    for (i = 1; ctb >= bd[i]; i++);  // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c
-+    return i-1;
-+}
-+
-+static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) {
-+    if (ctb < bd[num-1]) return ctb_size;
-+    else if (width % ctb_size) return width % ctb_size;
-+    else return ctb_size;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Handle PU and COEFF stream overflow
-+
-+
-+// Returns:
-+// -2 Other error
-+// -1 Out of coeff space
-+//  0 OK
-+//  1 Out of PU space
-+
-+static int check_status(const RPI_T * const rpi, dec_env_t * const de) {
-+    uint32_t status;
-+
-+    // this is the definition of successful completion of phase 1
-+    // it assures that status register is zero and all blocks in each tile have completed
-+    if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM))
-+        return 0;
-+
-+    status = apb_read(rpi, RPI_STATUS);
-+
-+    if ((status & 8) != 0)
-+        return -1;
-+
-+    if ((status & 0x10) != 0)
-+        return 1;
-+
-+    return -2;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Write STATUS register with expected end CTU address of previous slice
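The functions that follow all write the same packed encoding into RPI_STATUS: bits 5 and up carry the CTU X coordinate, bits 18 and up the CTU Y coordinate, and the low bits carry flags (1 here, 0x25 in wpp_pause()). A sketch of that packing, with the helper name being illustrative only - the patch writes the expressions inline:

    // Illustrative only: pack a CTU position the way end_previous_slice()
    // and wpp_pause() write it to the RPI_STATUS register.
    static inline uint32_t rpi_status_pack(unsigned int ctb_x, unsigned int ctb_y,
                                           unsigned int flags)
    {
        return flags + (ctb_x << 5) + (ctb_y << 18);
    }
    // e.g. p1_apb_write(de, RPI_STATUS, rpi_status_pack(last_x, last_y, 1));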
-+
-+static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) {
-+    const HEVCPPS * const pps = s->ps.pps;
-+    int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
-+    int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
-+    p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
-+}
-+
-+static void wpp_pause(dec_env_t * const de, int ctb_row) {
-+    p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25);
-+    p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
-+    p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000);
-+    p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2);
-+}
-+
-+static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
-+    const HEVCPPS *pps = s->ps.pps;
-+    int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
-+    int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
-+    int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
-+    int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
-+    if (de->wpp_entry_x<2 && (de->wpp_entry_y<last_y || new_x<2) && de->PicWidthInCtbsY>2)
-+        wpp_pause(de, last_y);
-+    p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
-+    if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_y<new_y)
-+        p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
-+}
-+
-+static void new_slice_segment(dec_env_t * const de, const HEVCContext * const s) {
-+    const HEVCSPS *sps = s->ps.sps;
-+    const HEVCPPS *pps = s->ps.pps;
-+
-+    p1_apb_write(de, RPI_SPS0,
-+        (sps->log2_min_cb_size << 0) +
-+        (sps->log2_ctb_size << 4) +
-+        (sps->log2_min_tb_size << 8) +
-+        (sps->log2_max_trafo_size << 12) +
-+        (sps->bit_depth << 16) +
-+        (sps->bit_depth << 20) +
-+        (sps->max_transform_hierarchy_depth_intra << 24) +
-+        (sps->max_transform_hierarchy_depth_inter << 28));
-+
-+    p1_apb_write(de, RPI_SPS1,
-+        (sps->pcm.bit_depth << 0) +
-+        (sps->pcm.bit_depth_chroma << 4) +
-+        (sps->pcm.log2_min_pcm_cb_size << 8) +
-+        (sps->pcm.log2_max_pcm_cb_size << 12) +
-+        (sps->separate_colour_plane_flag? 0:sps->chroma_format_idc << 16) +
-+        (sps->amp_enabled_flag << 18) +
-+        (sps->pcm_enabled_flag << 19) +
-+        (sps->scaling_list_enable_flag << 20) +
-+        (sps->sps_strong_intra_smoothing_enable_flag << 21));
-+
-+    p1_apb_write(de, RPI_PPS,
-+        (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth << 0) +
-+        (pps->cu_qp_delta_enabled_flag << 4) +
-+        (pps->transquant_bypass_enable_flag << 5) +
-+        (pps->transform_skip_enabled_flag << 6) +
-+        (pps->sign_data_hiding_flag << 7) +
-+        (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) +
-+        (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) +
-+        (pps->constrained_intra_pred_flag << 24));
-+
-+    if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de);
-+
-+    if (!s->sh.dependent_slice_segment_flag) {
-+        int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
-+        int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
-+        de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16);
-+    }
-+
-+    p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+static void write_slice(dec_env_t * const de, const HEVCContext * const s,
-+                        const unsigned int slice_w, const unsigned int slice_h) {
-+    uint32_t u32 =
-+          (s->sh.slice_type << 12)
-+        + (s->sh.slice_sample_adaptive_offset_flag[0] << 14)
-+        + (s->sh.slice_sample_adaptive_offset_flag[1] << 15)
-+        + (slice_w << 17)
-+        + (slice_h << 24);
-+
-+    if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |=
-+          (s->sh.max_num_merge_cand << 0)
-+        + (s->sh.nb_refs[L0] << 4)
-+        + (s->sh.nb_refs[L1] << 8);
-+
-+    if (s->sh.slice_type==HEVC_SLICE_B)
-+        u32 |= s->sh.mvd_l1_zero_flag<<16;
-+    p1_apb_write(de, RPI_SLICE, u32);
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Wavefront mode
-+
-+static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s,
-+                            const int do_bte, const int resetQPY, const int ctb_addr_ts) {
-+    const HEVCSPS * const sps = s->ps.sps;
-+    const HEVCPPS * const pps = s->ps.pps;
-+
-+    int ctb_size = 1<<sps->log2_ctb_size;
-+    int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+
-+    int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY;
-+    int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY;
-+
-+    int endx = de->PicWidthInCtbsY-1;
-+    int endy = ctb_row;
-+
-+    uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
-+    uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
-+
-+    p1_apb_write(de, RPI_TILESTART, 0);
-+    p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
-+
-+    if (do_bte)
-+        p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
-+
-+    write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size);
-+
-+    if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
-+
-+    p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 0x60001 : 0x20001);
-+    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Tiles mode
-+
-+static void new_entry_point(dec_env_t * const de, const HEVCContext * const s,
-+                            const int do_bte, const int resetQPY, const int ctb_addr_ts) {
-+    const HEVCSPS * const sps = s->ps.sps;
-+    const HEVCPPS * const pps = s->ps.pps;
-+
-+    int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY;
-+    int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY;
-+
-+    int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
-+    int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
-+
-+    int endx = pps->col_bd[tile_x+1] - 1;
-+    int endy = pps->row_bd[tile_y+1] - 1;
-+
-+    uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<<sps->log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
-+    uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<<sps->log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
-+
-+    p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16));
-+    p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
-+
-+    if (do_bte)
-+        p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
-+
-+    write_slice(de, s, slice_w, slice_h);
-+
-+    if (resetQPY)
-+        p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
-+
-+    p1_apb_write(de, RPI_MODE, (0xFFFF << 0)
-+                             + (0x0 << 16)
-+                             + ((tile_x==pps->num_tile_columns-1) << 17)
-+                             + ((tile_y==pps->num_tile_rows-1) << 18));
-+
-+    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+
-+// Doesn't attempt to remove from context as we should only do this at the end
-+// of time or on create error
-+static void
-+dec_env_delete(dec_env_t * const de)
-+{
-+//    gpu_free(&de->gbuf);
-+
-+    av_freep(&de->cmd_fifo);
-+    av_freep(&de->bit_fifo);
-+
-+    sem_destroy(&de->phase_wait);
-+    av_free(de);
-+}
-+
-+static dec_env_t *
-+dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi)
-+{
-+    dec_env_t * const de = av_mallocz(sizeof(*de));
-+    int i;
-+
-+    if (de == NULL)
-+        return NULL;
-+
-+    de->avctx = avctx;
-+    de->phase_no = RPIVID_PHASE_NEW;
-+
-+    sem_init(&de->phase_wait, 0, 0);
-+
-+    if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL)
-+        goto fail;
-+
-+    if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL)
-+        goto fail;
-+
-+    pthread_mutex_lock(&rpi->phase_lock);  // Abuse - not worth creating a lock just for this
-+    for (i = 0; i != avctx->thread_count; ++i) {
-+        if (rpi->dec_envs[i] == NULL)
-+        {
-+            rpi->dec_envs[i] = de;
-+            break;
-+        }
-+    }
-+    pthread_mutex_unlock(&rpi->phase_lock);
-+
-+    if (i == avctx->thread_count) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n");
-+        goto fail;
-+    }
-+
-+    return de;
-+
-+fail:
-+    dec_env_delete(de);
-+    return NULL;
-+}
-+
-+
-+static dec_env_t *
-+dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi)
-+{
-+    dec_env_t * de = NULL;
-+    const int ref_count = atomic_fetch_add(&rpi->ref_count, 1);
-+
-+    if (ref_count <= 0) {
-+        // Already dead
-+        av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");
-+        return NULL;
-+    }
-+
-+    for (int i = 0; i != avctx->thread_count; ++i) {
-+        if (rpi->dec_envs[i] == NULL)
-+        {
-+            de = dec_env_new(avctx, rpi);
-+            break;
-+        }
-+        if (rpi->dec_envs[i]->avctx == avctx)
-+        {
-+            de = rpi->dec_envs[i];
-+            break;
-+        }
-+    }
-+    return de;
-+}
-+
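Every hwaccel entry point brackets its work with this get/release pair: dec_env_get() above bumps rpi->ref_count (refusing to hand out a context once the count has gone non-positive) and dec_env_release() below drops it, posting ref_zero when the last user leaves so teardown can proceed. A sketch of the calling pattern, modelled on rpi_hevc_start_frame() later in this file; the function name is illustrative:

    static int some_hwaccel_hook(AVCodecContext * const avctx)
    {
        RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
        dec_env_t * const de = dec_env_get(avctx, rpi);

        if (de == NULL)
            return -1;             // no slot, or already shutting down

        // ... per-frame or per-slice work using de ...

        dec_env_release(rpi, de);  // must be last - may wake teardown
        return 0;
    }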
-+// Call at end of fn
-+// Used to ensure we aren't in a worker thread when killed
-+static void
-+dec_env_release(RPI_T * const rpi, dec_env_t * const de)
-+{
-+    const int n = atomic_fetch_sub(&rpi->ref_count, 1);
-+    if (n == 1) {
-+        sem_post(&rpi->ref_zero);
-+    }
-+}
-+
-+//----------------------------------------------------------------------------
-+
-+// Wait for a slot in the given phase
-+// Any error return is probably fatal
-+static int
-+wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
-+{
-+    int needs_wait = 0;
-+    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
-+
-+    pthread_mutex_lock(&rpi->phase_lock);
-+    if (p->last_order + 1 != de->decode_order) {
-+        de->phase_wait_q_next = p->q;
-+        p->q = de;
-+        needs_wait = 1;
-+    }
-+    pthread_mutex_unlock(&rpi->phase_lock);
-+
-+    if (needs_wait) {
-+        while (sem_wait(&de->phase_wait) == -1)
-+        {
-+            int err;
-+            if ((err = errno) != EINTR)
-+                return AVERROR(err);
-+        }
-+    }
-+
-+    de->phase_no = phase_no;
-+    return 0;
-+}
-+
-+static void
-+post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
-+{
-+    dec_env_t * next_de = NULL;
-+    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
-+    dec_env_t ** q = &p->q;
-+
-+    pthread_mutex_lock(&rpi->phase_lock);
-+
-+    p->last_order = de->decode_order;
-+    while (*q != NULL) {
-+        dec_env_t * const t_de = *q;
-+
-+        if (t_de->decode_order == p->last_order + 1) {
-+            // This is us - remove from Q
-+            *q = t_de->phase_wait_q_next;
-+            t_de->phase_wait_q_next = NULL;  // Tidy
-+            next_de = t_de;
-+            break;
-+        }
-+        q = &t_de->phase_wait_q_next;
-+    }
-+
-+    pthread_mutex_unlock(&rpi->phase_lock);
-+
-+    if (next_de != NULL)
-+        sem_post(&next_de->phase_wait);
-+}
-+
-+// Wait & signal stuff s.t. threads in other phases can continue
-+static void
-+abort_phases(RPI_T * const rpi, dec_env_t * const de)
-+{
-+    for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) {
-+        wait_phase(rpi, de, i);
-+        post_phase(rpi, de, i);
-+    }
-+    de->phase_no = RPIVID_PHASE_NEW;
-+}
-+
-+// Start timing for phase
-+// Stats only - no actual effect
-+static inline void tstart_phase(RPI_T * const rpi, const int phase_no)
-+{
-+#if OPT_PHASE_TIMING
-+    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
-+    const int64_t now = tus64();
-+    if (p->phase_time != 0)
-+        p->time_out_phase += now - p->phase_time;
-+    p->phase_time = now;
-+#endif
-+}
-+
-+#if OPT_PHASE_TIMING
-+static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n)
-+{
-+    uint64_t tsum = 0;
-+    unsigned int i;
-+    for (i = 0; i != avg_n; ++i)
-+        tsum += p->time_stash[(p->i3 - i) & 15];
-+    // time_thresholds has 8 entries; bin 8 catches anything over the last one
-+    for (i = 0; i != 8; ++i) {
-+        if (time_thresholds[i] * 1000 * avg_n > tsum)
-+            break;
-+    }
-+    return i;
-+}
-+#endif
-+
-+// End timing for phase
-+// Stats only - no actual effect
-+static inline void tend_phase(RPI_T * const rpi, const int phase_no)
-+{
-+#if OPT_PHASE_TIMING
-+    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
-+    const uint64_t now = tus64();
-+    const uint64_t in_time = now - p->phase_time;
-+
-+    p->time_in_phase += in_time;
-+    p->phase_time = now;
-+    p->time_stash[p->i3] = in_time;
-+    if (in_time > p->max_phase_time) {
-+        p->max_phase_time = in_time;
-+        p->max_time_decode_order = p->last_order;
-+    }
-+    ++p->time_bins[tavg_bin_phase(p, 1)];
-+    ++p->time_bins3[tavg_bin_phase(p, 3)];
-+    ++p->time_bins5[tavg_bin_phase(p, 5)];
-+
-+    p->i3 = (p->i3 + 1) & 15;
-+#endif
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Start frame
-+
-+static int rpi_hevc_start_frame(
-+    AVCodecContext * avctx,
-+    const uint8_t *buffer,
-+    uint32_t size) {
-+
-+    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
-+    dec_env_t * const de = dec_env_get(avctx, rpi);
-+    const HEVCContext * const s = avctx->priv_data;
-+    const HEVCSPS * const sps = s->ps.sps;
-+    const unsigned int CtbSizeY = 1U << sps->log2_ctb_size;
-+
-+#if TRACE_ENTRY
-+    printf("<<< %s[%p]\n", __func__, de);
-+#endif
-+
-+    if (de == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__);
-+        return -1;
-+    }
-+
-+    de->phase_no = RPIVID_PHASE_START;
-+    de->decode_order = ++rpi->decode_order;  // *** atomic?
-+
-+    ff_thread_finish_setup(avctx);  // Allow next thread to enter rpi_hevc_start_frame
-+
-+    if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
-+        return -1;
-+    }
-+    de->state = RPIVID_DECODE_START;
-+
-+    de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY;    //7-15
-+    de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY;  //7-17
-+    de->bit_len = 0;
-+    de->cmd_len = 0;
-+
-+#if TRACE_ENTRY
-+    printf(">>> %s[%p]\n", __func__, de);
-+#endif
-+
-+    dec_env_release(rpi, de);
-+    return 0;
-+}
-+
-+//////////////////////////////////////////////////////////////////////////////
-+// Slice messages
-+
-+static void msg_slice(dec_env_t * const de, const uint16_t msg) {
-+    de->slice_msgs[de->num_slice_msgs++] = msg;
-+}
-+
-+static void program_slicecmds(dec_env_t * const de, const int sliceid) {
-+    int i;
-+    p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8));
-+    for(i=0; i < de->num_slice_msgs; i++) {
-+        p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff);
-+    }
-+}
-+
-+static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) {
-+    const HEVCSPS * const sps = s->ps.sps;
-+    const HEVCPPS * const pps = s->ps.pps;
-+    const SliceHeader *sh = &s->sh;
-+
-+    int weightedPredFlag, i, rIdx;
-+    uint16_t cmd_slice;
-+    unsigned int collocated_from_l0_flag;
-+
-+    de->num_slice_msgs=0;
-+    de->dpbno_col = 0;
-+    cmd_slice = 0;
-+    if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1;
-+    if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2;
-+    if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3;
-+
-+    if (sh->slice_type!=HEVC_SLICE_I) {
-+        cmd_slice += sh->nb_refs[L0]<<2;
-+        cmd_slice += sh->nb_refs[L1]<<6;
-+    }
-+
-+    if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B)
-+        cmd_slice |= sh->max_num_merge_cand<<11;
-+
-+    collocated_from_l0_flag =
-+        !sh->slice_temporal_mvp_enabled_flag ?
-+            0 :
-+        sh->slice_type == HEVC_SLICE_B ?
-+            (sh->collocated_list == L0) :
-+            (sh->slice_type==HEVC_SLICE_P);
-+    cmd_slice |= collocated_from_l0_flag<<14;
-+
-+    if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) {
-+
-+        int NoBackwardPredFlag = 1;  // Flag to say all reference pictures are from the past
-+        for(i=L0; i<=L1; i++) {
-+            for(rIdx=0; rIdx < sh->nb_refs[i]; rIdx++) {
-+                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
-+                HEVCFrame *c = s->ref;  // CurrentPicture
-+                if (c->poc < f->poc) NoBackwardPredFlag = 0;
-+            }
-+        }
-+
-+        if (sps->sps_temporal_mvp_enabled_flag)
-+        {
-+            const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ?
-+static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) { -+ const HEVCSPS * const sps = s->ps.sps; -+ const HEVCPPS * const pps = s->ps.pps; -+ const SliceHeader *sh = &s->sh; -+ -+ int weightedPredFlag, i, rIdx; -+ uint16_t cmd_slice; -+ unsigned int collocated_from_l0_flag; -+ -+ de->num_slice_msgs=0; -+ de->dpbno_col = 0; -+ cmd_slice = 0; -+ if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1; -+ if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2; -+ if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3; -+ -+ if (sh->slice_type!=HEVC_SLICE_I) { -+ cmd_slice += sh->nb_refs[L0]<<2; -+ cmd_slice += sh->nb_refs[L1]<<6; -+ } -+ -+ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) -+ cmd_slice |= sh->max_num_merge_cand<<11; -+ -+ collocated_from_l0_flag = -+ !sh->slice_temporal_mvp_enabled_flag ? -+ 0 : -+ sh->slice_type == HEVC_SLICE_B ? -+ (sh->collocated_list == L0) : -+ (sh->slice_type==HEVC_SLICE_P); -+ cmd_slice |= collocated_from_l0_flag<<14; -+ -+ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) { -+ -+ int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past -+ for(i=L0; i<=L1; i++) { -+ for(rIdx=0; rIdx < sh->nb_refs[i]; rIdx++) { -+ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; -+ HEVCFrame *c = s->ref; // CurrentPicture -+ if (c->poc < f->poc) NoBackwardPredFlag = 0; -+ } -+ } -+ -+ if (sps->sps_temporal_mvp_enabled_flag) -+ { -+ const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ? -+ s->ref->refPicList + 0 : -+ s->ref->refPicList + 1; -+ de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB; -+ } -+ -+ cmd_slice += NoBackwardPredFlag<<10; -+ msg_slice(de, cmd_slice); -+ -+ // Write reference picture descriptions -+ weightedPredFlag = sh->slice_type==HEVC_SLICE_P? pps->weighted_pred_flag : pps->weighted_bipred_flag; -+ -+ for(i=L0; i<=L1; i++) -+ for(rIdx=0; rIdx < sh->nb_refs[i]; rIdx++) { -+ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; -+ HEVCFrame *c = s->ref; // CurrentPicture -+ int pic = f - s->DPB; -+ // Make sure pictures are in range 0 to 15 -+ int adjusted_pic = f<c? pic : pic-1; -+ int lt = s->ref->refPicList[i].isLongTerm[rIdx]; -+ msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6)); -+ msg_slice(de, f->poc); -+ if (weightedPredFlag) { -+ msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s-> sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3)); -+ msg_slice(de, (i?s-> sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff); -+ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3)); -+ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff); -+ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3)); -+ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff); -+ } -+ } -+ } -+ else -+ msg_slice(de, cmd_slice); -+ -+ msg_slice(de, ((sh->beta_offset/2)&15) -+ + (((sh->tc_offset/2)&15) << 4) -+ + (sh->disable_deblocking_filter_flag << 8) -+ + (sh->slice_loop_filter_across_slices_enabled_flag << 9) -+ + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK -+ -+ msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF -+} -+ -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static void rpi_hevc_abort_frame(AVCodecContext * const avctx) { -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ dec_env_t * const de = dec_env_get(avctx, rpi); -+ -+#if TRACE_ENTRY -+ printf("<<< %s[%p]\n", __func__, de); -+#endif -+ -+ if (de == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__); -+ return; -+ } -+ -+ switch (de->state) { -+ case RPIVID_DECODE_NEW: -+ case RPIVID_DECODE_END: -+ // Expected transition -+ break; -+ -+ case RPIVID_DECODE_SLICE: -+ // Error transition -+ av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n"); -+ break; -+ -+ case RPIVID_DECODE_START: -+ default: -+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state); -+ break; -+ } -+ -+ abort_phases(rpi, de); -+ de->state = RPIVID_DECODE_NEW; -+ -+ dec_env_release(rpi, de); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// End frame -+ -+static int rpi_hevc_end_frame(AVCodecContext * const avctx) { -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ const HEVCContext * const s = avctx->priv_data; -+ const HEVCPPS * const pps = s->ps.pps; -+ const HEVCSPS * const sps = s->ps.sps; -+ dec_env_t * const de = dec_env_get(avctx, rpi); -+ AVFrame * const f = s->ref->frame; -+ const unsigned int dpbno_cur = s->ref - s->DPB; -+ vid_vc_addr_t cmds_vc; -+ vid_vc_addr_t pu_base_vc; -+ unsigned int pu_stride; -+ vid_vc_addr_t coeff_base_vc; -+ unsigned int coeff_stride; -+ unsigned int i; -+ int rv = 0; -+ int status = 0; -+ int coeffbuf_sem_claimed = 0; -+ -+#if TRACE_ENTRY -+ printf("<<< %s[%p]\n", __func__, de); -+#endif -+ -+ if (de == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__); -+ return AVERROR_BUG; // Should never happen -+ } -+ -+ if (de->state != RPIVID_DECODE_SLICE) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); -+ rv = AVERROR_UNKNOWN; -+ goto fail; -+ } -+ de->state = RPIVID_DECODE_END; -+ -+ // End of command compilation -+ { -+ const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1; -+ const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1; -+ if (pps->entropy_coding_sync_enabled_flag) { -+ if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2) -+ wpp_pause(de, last_y); -+ } -+ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); -+ } -+ -+ // Phase 0 --------------------------------------------------------------- -+ -+ wait_phase(rpi, de, 0); -+ rpi_sem_wait(&rpi->bitbuf_sem); -+ tstart_phase(rpi, 0); -+ -+ // Copy cmds & bits into gpu side buffer -+ // Layout: CMDS, BITS -+ { -+ uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm; -+ vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc; -+ unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD); -+ -+ uint8_t * p = armbase + rnd64(cmd_bytes); -+ uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes; -+ -+ cmds_vc = vcbase; -+ -+ // Copy all the bits & update bitstream cmds to point at the right bits -+ for (i = 0; i < de->bit_len; ++i) -+ { -+ const unsigned int seg_len = de->bit_fifo[i].len; -+ -+ if (p + seg_len > eobits) { -+ status = -1; -+ break; -+ } -+ -+ memcpy(p, de->bit_fifo[i].ptr, seg_len); -+ de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase); -+ -+ p += rnd64(seg_len); -+ } -+ -+ memcpy(armbase, de->cmd_fifo, cmd_bytes); -+ } -+ -+ if (status == 0) -+ { -+ if (++rpi->bitbuf_no >= RPIVID_BITBUFS) -+ rpi->bitbuf_no = 0; -+ } -+ else -+ { -+ sem_post(&rpi->bitbuf_sem); -+ av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n"); -+ rv = AVERROR_BUFFER_TOO_SMALL; -+ } -+ -+ tend_phase(rpi, 0); -+ post_phase(rpi, de, 0); -+ -+ if (status < 0) -+ goto fail; -+ -+ // Phase 1 --------------------------------------------------------------- -+ -+ wait_phase(rpi, de, 1); -+ rpi_sem_wait(&rpi->coeffbuf_sem); -+ coeffbuf_sem_claimed = 1; -+ tstart_phase(rpi, 1); -+ -+ for (;;) -+ { -+ // (Re-)allocate PU/COEFF stream space -+ const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes; -+ unsigned int pu_size; -+ -+ pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc; -+ pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY); -+ pu_size = pu_stride * de->PicHeightInCtbsY; -+ -+ if (pu_size > total_size) { -+ status = -1; -+ break; -+ } -+ -+ // Allocate all remaining space to coeff -+ coeff_base_vc = pu_base_vc + pu_size; -+ coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64 -+ -+ apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc); -+ apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride); -+ apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc); -+ apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride); -+ -+ // Trigger command FIFO -+ apb_write(rpi, RPI_CFNUM, de->cmd_len); -+#if TRACE_DEV -+ apb_dump_regs(rpi, 0x0, 32); -+ apb_dump_regs(rpi, 0x8000, 24); -+ axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD)); -+#endif -+ apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc); -+ -+ int_wait(rpi, 1); -+ -+ status = check_status(rpi, de); -+ -+ if (status != 1) -+ break; -+ -+ // Status 1 means out of PU space so try again with more -+ // If we ran out of Coeff space then we are out of memory - we could possibly realloc? -+ rpi->max_pu_msgs += rpi->max_pu_msgs / 2; -+ } -+ -+ // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we -+ // may reuse a live buffer when we kick the coeff sem -+ if (status == 0) -+ { -+ if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS) -+ rpi->coeffbuf_no = 0; -+ } -+ else -+ { -+ if (status == -1) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs); -+ rv = AVERROR_BUFFER_TOO_SMALL; -+ } -+ else -+ { -+ av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n"); -+ rv = AVERROR_INVALIDDATA; -+ } -+ } -+ -+ tend_phase(rpi, 1); -+ sem_post(&rpi->bitbuf_sem); -+ post_phase(rpi, de, 1); -+ -+ if (status != 0) -+ goto fail; -+ -+ // Phase 2 --------------------------------------------------------------- -+ -+ wait_phase(rpi, de, 2); -+ -+ if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0) -+ { -+ // As we are in phase 2 already here we don't need to worry about -+ // coeffbuf_no despite the early exit -+ post_phase(rpi, de, 2); -+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n"); -+ goto fail; -+ } -+ -+ tstart_phase(rpi, 2); -+ -+ apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc); -+ apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride); -+ apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc); -+ apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride); -+ -+ apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f)); -+ apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f)); -+ apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128); -+ apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128); -+ -+ // Keep the last thing we resolved as fallback for any ref we fail to -+ // resolve. As a final fallback use our current frame. The pels might -+ // not be there yet but at least the memory is valid. -+ // -+ // Attempt to resolve the entire DPB - we could note what we have used -+ // in ref lists but probably simpler and more reliable to set the whole thing -+ { -+ AVFrame * fallback_frame = f; -+ for (i = 0; i != 16; ++i) { -+ // Avoid current frame -+ const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i; -+ AVFrame * fr = hevc_fr->frame; -+ -+ if (fr != NULL && -+ av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0) -+ { -+ fallback_frame = fr; -+ } -+ else -+ { -+ fr = fallback_frame; -+ } -+ -+ apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr)); -+ apb_write(rpi, 0x9004+16*i, 0); -+ apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr)); -+ apb_write(rpi, 0x900C+16*i, 0); -+ } -+ } -+ -+ apb_write(rpi, RPI_CONFIG2, -+ (sps->bit_depth << 0) // BitDepthY -+ + (sps->bit_depth << 4) // BitDepthC -+ + ((sps->bit_depth>8) << 8) // BitDepthY -+ + ((sps->bit_depth>8) << 9) // BitDepthC -+ + (sps->log2_ctb_size <<10) -+ + (pps->constrained_intra_pred_flag <<13) -+ + (sps->sps_strong_intra_smoothing_enable_flag<<14) -+ + (sps->sps_temporal_mvp_enabled_flag <<15) -+ + (pps->log2_parallel_merge_level <<16) -+ + (s->sh.slice_temporal_mvp_enabled_flag <<19) -+ + (sps->pcm.loop_filter_disable_flag <<20) -+ + ((pps->cb_qp_offset&31) <<21) -+ + ((pps->cr_qp_offset&31) <<26)); -+ -+ apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width); -+ apb_write(rpi, RPI_CURRPOC, s->poc); -+ -+ // collocated reads/writes -+ if (sps->sps_temporal_mvp_enabled_flag) { -+ av_assert0(de->dpbno_col < RPIVID_COL_PICS); -+ av_assert0(dpbno_cur < RPIVID_COL_PICS); -+ -+ apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride); -+ apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride); -+ apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize); -+ apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize); -+ } -+ -+#if TRACE_DEV -+ apb_dump_regs(rpi, 0x0, 32); -+ apb_dump_regs(rpi, 0x8000, 24); -+#endif -+ -+ apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY); -+ apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block -+ -+ int_wait(rpi, 2); -+ -+ tend_phase(rpi, 2); -+ coeffbuf_sem_claimed = 0; -+ sem_post(&rpi->coeffbuf_sem); -+ // Set valid here to avoid race in resolving in any pending phase 2 -+ av_rpi_zc_set_valid_frame(f); -+ -+ post_phase(rpi, de, 2); -+ -+ // Flush frame for CPU access -+ // Arguably the best place would be at the start of phase 2 but here -+ // will overlap with the wait -+ // -+ // * Even better would be to have better lock/unlock control in ZC for external access -+ if (rpi->gpu_init_type == GPU_INIT_GPU) // * CMA is currently always uncached -+ { -+ rpi_cache_buf_t cbuf; -+ rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf); -+ rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE); -+ rpi_cache_flush_finish(fe); -+ } -+ -+#if TRACE_ENTRY -+ printf(">>> %s[%p] OK\n", __func__, de); -+#endif -+ -+ dec_env_release(rpi, de); -+ return 0; -+ -+fail: -+ av_rpi_zc_set_broken_frame(f); -+ if (coeffbuf_sem_claimed) -+ sem_post(&rpi->coeffbuf_sem); -+ abort_phases(rpi, de); // Dummy any unresolved phases -+ -+#if TRACE_ENTRY -+ printf(">>> %s[%p] FAIL\n", __func__, de); -+#endif -+ -+ dec_env_release(rpi, de); -+ return rv; -+}
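Phase 1 above sizes the PU stream from rpi->max_pu_msgs and, when check_status() reports 1 (out of PU space), grows the estimate by half and reruns the pass; only coefficient-space exhaustion is fatal. A self-contained toy of that grow-on-overflow loop (the sizes and the hw_pass() stand-in are invented for illustration):

    #include <stdio.h>

    // Stand-in for the hardware pass: returns 1 when the PU budget is too
    // small, mirroring the status-1 "out of PU space" result checked above.
    static int hw_pass(unsigned int pu_budget, unsigned int needed)
    {
        return pu_budget < needed ? 1 : 0;
    }

    int main(void)
    {
        unsigned int arena = 1 << 20;    // total intermediate buffer
        unsigned int needed = 9000;      // what this (imaginary) frame really takes
        unsigned int max_pu_msgs = 768;  // initial guess, as in the patch

        for (;;) {
            if (max_pu_msgs > arena)
                return 1;                        // arena exhausted: unrecoverable
            if (hw_pass(max_pu_msgs, needed) != 1)
                break;                           // decoded without overflow
            max_pu_msgs += max_pu_msgs / 2;      // 768 -> 1152 -> 1728 -> ...
            printf("PU overflow, retrying with %u\n", max_pu_msgs);
        }
        printf("succeeded with %u\n", max_pu_msgs);
        return 0;
    }

The 1.5x growth means a badly undersized first guess converges in a handful of retries, and because the enlarged estimate is kept in rpi->max_pu_msgs, later frames start from the bigger value.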
-+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) { -+ const int rpi_use_emu = 0; // FFmpeg removes emulation prevention bytes -+ const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware -+ const GetBitContext *gb = &s->HEVClc->gb; -+ const int len = 1 + gb->size_in_bits/8 - gb->index/8; -+ const void *ptr = &gb->buffer[gb->index/8]; -+ -+ p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later -+ p1_apb_write(de, RPI_BFNUM, len); -+ p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop -+ p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6)); -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Wavefront mode -+ -+static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) -+{ -+ const HEVCPPS * const pps = s->ps.pps; -+ -+ int i, resetQPY=1; -+ int indep = !s->sh.dependent_slice_segment_flag; -+ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; -+ -+ if (ctb_addr_ts) -+ wpp_end_previous_slice(de, s, ctb_addr_ts); -+ pre_slice_decode(de, s); -+ WriteBitstream(de, s); -+ if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1) -+ WriteProb(de, s); -+ else if (ctb_col==0) -+ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); -+ else -+ resetQPY=0; -+ program_slicecmds(de, s->slice_idx); -+ new_slice_segment(de, s); -+ wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts); -+ for (i=0; i < s->sh.num_entry_point_offsets; i++) { -+ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; -+ int last_x = de->PicWidthInCtbsY-1; -+ if (de->PicWidthInCtbsY>2) -+ wpp_pause(de, ctb_row); -+ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2); -+ if (de->PicWidthInCtbsY==2) -+ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); -+ if (de->PicWidthInCtbsY==1) -+ WriteProb(de, s); -+ else -+ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); -+ ctb_addr_ts += pps->column_width[0]; -+ wpp_entry_point(de, s, 0, 1, ctb_addr_ts); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+// Tiles mode -+ -+static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { -+ const HEVCPPS * const pps = s->ps.pps; -+ int i, resetQPY; -+ -+ if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts); -+ pre_slice_decode(de, s); -+ WriteBitstream(de, s); -+ resetQPY = ctb_addr_ts==0 -+ || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1] -+ || !s->sh.dependent_slice_segment_flag; -+ if (resetQPY) WriteProb(de, s); -+ program_slicecmds(de, s->slice_idx); -+ new_slice_segment(de, s); -+ new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts); -+ for (i=0; i < s->sh.num_entry_point_offsets; i++) { -+ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY; -+ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; -+ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); -+ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); -+ int last_x = pps->col_bd[tile_x+1]-1; -+ int last_y = pps->row_bd[tile_y+1]-1; -+ p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18)); -+ WriteProb(de, s); -+ ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y]; -+ new_entry_point(de, s, 0, 1, ctb_addr_ts); -+ } -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static int cabac_start_align(HEVCContext *s) -+{ -+ GetBitContext *gb = &s->HEVClc->gb; -+ skip_bits(gb, 1); -+ align_get_bits(gb); -+ // Should look at getting rid of this -+ return ff_init_cabac_decoder(&s->HEVClc->cc, -+ gb->buffer + get_bits_count(gb) / 8, -+ (get_bits_left(gb) + 7) / 8); -+} -+ -+static int rpi_hevc_decode_slice( -+ AVCodecContext *avctx, -+ const uint8_t *buffer, -+ uint32_t size) -+{ -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ HEVCContext * const s = avctx->priv_data; -+ dec_env_t * const de = dec_env_get(avctx, rpi); -+ const HEVCPPS *pps = s->ps.pps; -+ int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; -+ -+#if TRACE_ENTRY -+ printf("<<< %s[%p]\n", __func__, de); -+#endif -+ if (de == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__); -+ return -1; -+ } -+ -+ if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) { -+ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); -+ return -1; -+ } -+ de->state = RPIVID_DECODE_SLICE; -+ -+// ff_hevc_cabac_init(s, ctb_addr_ts); -+ cabac_start_align(s); -+ if (s->ps.sps->scaling_list_enable_flag) -+ populate_scaling_factors(de, s); -+ pps->entropy_coding_sync_enabled_flag? wpp_decode_slice(de, s, ctb_addr_ts) -+ : decode_slice(de, s, ctb_addr_ts); -+#if TRACE_ENTRY -+ printf(">>> %s[%p]\n", __func__, de); -+#endif -+ dec_env_release(rpi, de); -+ return 0; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static int rpivid_retrieve_data(void *logctx, AVFrame *frame) -+{ -+ int rv; -+ if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0) -+ av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n"); -+ return rv; -+} -+ -+static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) -+{ -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ HEVCContext * const s = avctx->priv_data; -+ // Frame buffering + 1 output. Would need thread_count extra but we now -+ // alloc at the start of phase 2 so that is the only thread we need the -+ // extra buffer for. -+ const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1; -+ int rv; -+ -+ if (av_rpi_zc_in_use(avctx)) -+ { -+ const AVZcEnvPtr zc = avctx->opaque; -+ av_rpi_zc_set_decoder_pool_size(zc, pool_req); -+ av_rpi_zc_get_buffer(zc, frame); // get_buffer2 would alloc -+ } -+ else -+ { -+ if (rpi->zc == NULL) { -+ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this -+ // Alloc inside lock to make sure we only ever alloc one -+ if (rpi->zc == NULL) { -+ rpi->zc = av_rpi_zc_int_env_alloc(s); -+ } -+ pthread_mutex_unlock(&rpi->phase_lock); -+ } -+ av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-) -+ rv = (rpi->zc == NULL) ? AVERROR(ENOMEM) : -+ av_rpi_zc_get_buffer(rpi->zc, frame); -+ } -+ -+ if (rv == 0 && -+ (rv = ff_attach_decode_data(frame)) < 0) -+ { -+ av_frame_unref(frame); -+ } -+ -+ if (rv == 0) -+ { -+ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; -+ fdd->post_process = rpivid_retrieve_data; -+ } -+ -+ return rv; -+} -+ -+#if OPT_PHASE_TIMING -+static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins) -+{ -+ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n", -+ bins[0], bins[1], bins[2], bins[3], -+ bins[4], bins[5], bins[6], bins[7], bins[8]); -+} -+#endif -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static int rpi_hevc_free(AVCodecContext *avctx) { -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+ -+#if TRACE_ENTRY -+ printf("<<< %s\n", __func__); -+#endif -+ -+ dec_env_release(rpi, NULL); -+ -+ // Wait for everything else to stop -+ { -+ struct timespec tt; -+ clock_gettime(CLOCK_REALTIME, &tt); -+ tt.tv_sec += 2; -+ while (sem_timedwait(&rpi->ref_zero, &tt) == -1) { -+ const int err = errno; -+ if (err == ETIMEDOUT) { -+ av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n"); -+ return -1; -+ } -+ if (err != EINTR) { -+ av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err); -+ break; -+ } -+ } -+ } -+ -+#if OPT_PHASE_TIMING -+ { -+ unsigned int i; -+ for (i = 0; i != RPIVID_PHASES; ++i) { -+ const phase_wait_env_t * const p = rpi->phase_reqs + i; -+ av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i, -+ (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000), -+ (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000)); -+ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d >\n", -+ time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3], -+ time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]); -+ log_bin_phase(avctx, p->time_bins); -+ log_bin_phase(avctx, p->time_bins3); -+ log_bin_phase(avctx, p->time_bins5); -+ av_log(avctx, AV_LOG_INFO, "Longest duration: %ums @ frame %u\n", -+ (unsigned int)(p->max_phase_time / 1000), -+ p->max_time_decode_order); -+ } -+ av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs); -+ } -+#endif -+ -+ if (rpi->dec_envs != NULL) -+ { -+ for (int i = 0; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) { -+ dec_env_delete(rpi->dec_envs[i]); -+ } -+ av_freep(&rpi->dec_envs); -+ } -+ -+ av_rpi_zc_int_env_freep(&rpi->zc); -+ -+ gpu_free(&rpi->gcolbuf); -+ -+ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { -+ gpu_free(rpi->gbitbufs + i); -+ } -+ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { -+ gpu_free(rpi->gcoeffbufs + i); -+ } -+ -+ unmap_devp(&rpi->regs, REGS_SIZE); -+ unmap_devp(&rpi->ints, INTS_SIZE); -+ -+ if (rpi->gpu_init_type > 0) -+ rpi_mem_gpu_uninit(); -+ -+ if (rpi->mbox_fd >= 0) { -+ mbox_release_clock(rpi->mbox_fd); -+ mbox_close(rpi->mbox_fd); -+ } -+ -+ sem_destroy(&rpi->ref_zero); -+ sem_destroy(&rpi->coeffbuf_sem); -+ sem_destroy(&rpi->bitbuf_sem); -+ -+#if TRACE_ENTRY -+ printf(">>> %s\n", __func__); -+#endif -+ return 0; -+}
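rpi_hevc_free() relies on the ref_count/ref_zero pair: every live dec_env holds a reference, dec_env_release() posts ref_zero when the count reaches zero, and teardown drops its own initial reference and then waits up to two seconds for stragglers. A compact sketch of that drain handshake, assuming the same POSIX primitives (all names here are invented):

    #include <stdatomic.h>
    #include <semaphore.h>
    #include <time.h>
    #include <errno.h>

    typedef struct {
        atomic_int refs;  // starts at 1: the owner's own reference
        sem_t      zero;  // posted once by whoever drops the last reference
    } drain_t;

    static void drain_init(drain_t *d)
    {
        atomic_init(&d->refs, 1);
        sem_init(&d->zero, 0, 0);
    }

    static void drain_get(drain_t *d) { atomic_fetch_add(&d->refs, 1); }

    static void drain_put(drain_t *d)
    {
        if (atomic_fetch_sub(&d->refs, 1) == 1) // we just released the last ref
            sem_post(&d->zero);
    }

    // Owner calls drain_put() for its own reference first, then waits here.
    // Returns 0 once all references are gone, -1 on timeout or error.
    static int drain_wait(drain_t *d, unsigned int secs)
    {
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += secs;
        while (sem_timedwait(&d->zero, &ts) == -1) {
            if (errno != EINTR)
                return -1; // ETIMEDOUT or another error
        }
        return 0;
    }

Refusing to free while workers may still be running (the AV_LOG_FATAL branch above) leaks rather than risking a use-after-free, which is the safer failure mode for a hardware-backed decoder.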
-+ -+////////////////////////////////////////////////////////////////////////////// -+ -+static int rpi_hevc_init(AVCodecContext *avctx) { -+ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; -+// const char *err; -+ -+#if TRACE_ENTRY -+ printf("<<< %s\n", __func__); -+#endif -+ -+ if (avctx->width>4096 || avctx->height>4096) { -+ av_log(NULL, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height); -+ return AVERROR(ENOTSUP); -+ } -+ -+ memset(rpi, 0, sizeof(*rpi)); -+ -+ rpi->mbox_fd = -1; -+ rpi->decode_order = 0; -+ -+ // Initial PU/COEFF stream buffer split chosen as worst case seen so far -+ rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU -+ -+ -+ atomic_store(&rpi->ref_count, 1); -+ sem_init(&rpi->ref_zero, 0, 0); -+ -+ sem_init(&rpi->bitbuf_sem, 0, RPIVID_BITBUFS); -+ sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS); -+ -+ pthread_mutex_init(&rpi->phase_lock, NULL); -+ -+ if ((rpi->mbox_fd = mbox_open()) < 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n"); -+ goto fail; -+ } -+ mbox_request_clock(rpi->mbox_fd); -+ -+ if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL || -+ (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n"); -+ goto fail; -+ } -+ -+ if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n"); -+ goto fail; -+ } -+ -+ if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) { -+ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count); -+ goto fail; -+ } -+ -+ rpi->col_stride = rnd64(avctx->width); -+ rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4); -+ if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n"); -+ goto fail; -+ } -+ -+ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { -+ if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i); -+ goto fail; -+ } -+ } -+ -+ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { -+ if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0) -+ { -+ av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i); -+ goto fail; -+ } -+ } -+ -+ return 0; -+ -+fail: -+ rpi_hevc_free(avctx); -+ return AVERROR_EXTERNAL; -+} -+ -+////////////////////////////////////////////////////////////////////////////// -+ -+const AVHWAccel ff_hevc_rpi4_8_hwaccel = { -+ .name = "hevc_rpi4_8", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .pix_fmt = AV_PIX_FMT_RPI4_8, -+ .alloc_frame = rpivid_hevc_alloc_frame, -+ .start_frame = rpi_hevc_start_frame, -+ .end_frame = rpi_hevc_end_frame, -+ .abort_frame = rpi_hevc_abort_frame, -+ .decode_slice = rpi_hevc_decode_slice, -+ .init = rpi_hevc_init, -+ .uninit = rpi_hevc_free, -+ .priv_data_size = sizeof(RPI_T), -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -+ -+const AVHWAccel ff_hevc_rpi4_10_hwaccel = { -+ .name = "hevc_rpi4_10", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .pix_fmt = AV_PIX_FMT_RPI4_10, -+ .alloc_frame = rpivid_hevc_alloc_frame, -+ .start_frame = rpi_hevc_start_frame, -+ .end_frame = rpi_hevc_end_frame, -+ .abort_frame = rpi_hevc_abort_frame, -+ .decode_slice = rpi_hevc_decode_slice, -+ .init = rpi_hevc_init, -+ .uninit = rpi_hevc_free, -+ .priv_data_size = sizeof(RPI_T), -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -+ -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index aef911f3bb..927fa66528 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c
-@@ -21,6 +21,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include - #include - #include - #include -@@ -29,6 +30,7 @@ - #include - #include "libavcodec/avcodec.h" - #include "libavcodec/internal.h" -+#include "libavutil/hwcontext.h" - #include "v4l2_context.h" - #include "v4l2_buffers.h" - #include "v4l2_m2m.h" -@@ -203,7 +205,79 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) - return AVCOL_TRC_UNSPECIFIED; - } - --static void v4l2_free_buffer(void *opaque, uint8_t *unused) -+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) -+{ -+ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor *layer; -+ -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_objects = avbuf->num_planes; -+ drm_desc->nb_layers = 1; -+ -+ layer = &drm_desc->layers[0]; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = 0; -+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; -+ } -+ -+ switch (avbuf->context->av_pix_fmt) { -+ case AV_PIX_FMT_YUYV422: -+ -+ layer->format = DRM_FORMAT_YUYV; -+ layer->nb_planes = 1; -+ -+ break; -+ -+ case AV_PIX_FMT_NV12: -+ case AV_PIX_FMT_NV21: -+ -+ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? -+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; -+ break; -+ -+ case AV_PIX_FMT_YUV420P: -+ -+ layer->format = DRM_FORMAT_YUV420; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 3; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ -+ layer->planes[2].object_index = 0; -+ layer->planes[2].offset = layer->planes[1].offset + -+ ((avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height) >> 2); -+ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ break; -+ -+ default: -+ drm_desc->nb_layers = 0; -+ break; -+ } -+ -+ return (uint8_t *) drm_desc; -+} -+ -+static void v4l2_free_buffer(void *opaque, uint8_t *data) - { - V4L2Buffer* avbuf = opaque; - V4L2m2mContext *s = buf_to_m2mctx(avbuf); -@@ -227,27 +301,49 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) - } - } - --static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) - { -- V4L2m2mContext *s = buf_to_m2mctx(in); -+ struct v4l2_exportbuffer expbuf; -+ int i, ret; - -- if (plane >= in->num_planes) -- return AVERROR(EINVAL); -+ for (i = 0; i < avbuf->num_planes; i++) { -+ memset(&expbuf, 0, sizeof(expbuf)); - -- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ -- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, -- in->plane_info[plane].length, v4l2_free_buffer, in, 0); -- if (!*buf) -- return AVERROR(ENOMEM); -+ expbuf.index = avbuf->buf.index; -+ expbuf.type = avbuf->buf.type; -+ expbuf.plane = i; -+ -+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ if 
(V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { -+ /* drm frame */ -+ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; -+ avbuf->drm_frame.objects[i].fd = expbuf.fd; -+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } else { -+ /* drm frame */ -+ avbuf->drm_frame.objects[0].size = avbuf->buf.length; -+ avbuf->drm_frame.objects[0].fd = expbuf.fd; -+ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ } -+ } -+ -+ return 0; -+} -+ -+static int v4l2_buf_increase_ref(V4L2Buffer *in) -+{ -+ V4L2m2mContext *s = buf_to_m2mctx(in); - - if (in->context_ref) - atomic_fetch_add(&in->context_refcount, 1); - else { - in->context_ref = av_buffer_ref(s->self_ref); -- if (!in->context_ref) { -- av_buffer_unref(buf); -+ if (!in->context_ref) - return AVERROR(ENOMEM); -- } -+ - in->context_refcount = 1; - } - -@@ -257,6 +353,46 @@ static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) - return 0; - } - -+static int v4l2_buf_to_bufref_drm(V4L2Buffer *in, AVBufferRef **buf) -+{ -+ int ret; -+ -+ *buf = av_buffer_create((uint8_t *) &in->drm_frame, -+ sizeof(in->drm_frame), -+ v4l2_free_buffer, -+ in, AV_BUFFER_FLAG_READONLY); -+ if (!*buf) -+ return AVERROR(ENOMEM); -+ -+ ret = v4l2_buf_increase_ref(in); -+ if (ret) -+ av_buffer_unref(buf); -+ -+ return ret; -+} -+ -+static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) -+{ -+ int ret; -+ -+ if (plane >= in->num_planes) -+ return AVERROR(EINVAL); -+ -+ /* most encoders return 0 in data_offset but vp8 does require this value */ -+ *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, -+ in->plane_info[plane].length, -+ v4l2_free_buffer, -+ in, 0); -+ if (!*buf) -+ return AVERROR(ENOMEM); -+ -+ ret = v4l2_buf_increase_ref(in); -+ if (ret) -+ av_buffer_unref(buf); -+ -+ return ret; -+} -+ - static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, AVBufferRef* bref) - { - unsigned int bytesused, length; -@@ -267,7 +403,8 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i - bytesused = FFMIN(size, out->plane_info[plane].length); - length = out->plane_info[plane].length; - -- memcpy(out->plane_info[plane].mm_addr, data, FFMIN(size, out->plane_info[plane].length)); -+ memcpy(out->plane_info[plane].mm_addr, data, -+ FFMIN(size, out->plane_info[plane].length)); - - if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { - out->planes[plane].bytesused = bytesused; -@@ -291,7 +428,10 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer* out) - int i, ret; - - for(i = 0; i < out->num_planes; i++) { -- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, frame->buf[i]); -+ ret = v4l2_bufref_to_buf(out, i, -+ frame->buf[i]->data, -+ frame->buf[i]->size, -+ frame->buf[i]); - if (ret) - return ret; - } -@@ -308,34 +448,59 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - - av_frame_unref(frame); - -- /* 1. get references to the actual data */ -- for (i = 0; i < avbuf->num_planes; i++) { -- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ /* 1. 
get references to the actual data */ -+ ret = v4l2_buf_to_bufref_drm(avbuf, &frame->buf[0]); - if (ret) - return ret; - -- frame->linesize[i] = avbuf->plane_info[i].bytesperline; -- frame->data[i] = frame->buf[i]->data; -- } -+ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); -+ } else { -+ /* 1. get references to the actual data */ -+ for (i = 0; i < avbuf->num_planes; i++) { -+ ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); -+ if (ret) -+ return ret; -+ -+ frame->linesize[i] = avbuf->plane_info[i].bytesperline; -+ frame->data[i] = frame->buf[i]->data; -+ } - -- /* 1.1 fixup special cases */ -- switch (avbuf->context->av_pix_fmt) { -- case AV_PIX_FMT_NV12: -- if (avbuf->num_planes > 1) -+ /* 1.1 fixup special cases where we expand monoplanar */ -+ switch (avbuf->context->av_pix_fmt) { -+ case AV_PIX_FMT_YUV420P: -+ if (avbuf->num_planes > 1) -+ break; -+ frame->linesize[1] = avbuf->plane_info[0].bytesperline / 2; -+ frame->data[1] = frame->buf[0]->data + -+ avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ frame->linesize[2] = frame->linesize[1]; -+ frame->data[2] = frame->data[1] + -+ frame->linesize[1] * -+ avbuf->context->format.fmt.pix.height / 2; - break; -- frame->linesize[1] = avbuf->plane_info[0].bytesperline; -- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; -- break; -- default: -- break; -+ case AV_PIX_FMT_NV12: -+ if (avbuf->num_planes > 1) -+ break; -+ frame->linesize[1] = avbuf->plane_info[0].bytesperline; -+ frame->data[1] = frame->buf[0]->data + -+ avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ break; -+ default: -+ break; -+ } -+ frame->format = avbuf->context->av_pix_fmt; - } - - /* 2. get frame information */ - frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); -- frame->format = avbuf->context->av_pix_fmt; - frame->color_primaries = v4l2_get_color_primaries(avbuf); -- frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_range = v4l2_get_color_range(avbuf); -+ frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_trc = v4l2_get_color_trc(avbuf); - frame->pts = v4l2_get_pts(avbuf); - -@@ -361,7 +526,8 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - if (ret) - return ret; - -- pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; -+ pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
-+ avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; - pkt->data = pkt->buf->data; - - if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) -@@ -402,6 +568,27 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - avbuf->buf.type = ctx->type; - avbuf->buf.index = index; - -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ AVHWFramesContext *hwframes; -+ -+ av_buffer_unref(&ctx->frames_ref); -+ -+ ctx->frames_ref = av_hwframe_ctx_alloc(buf_to_m2mctx(avbuf)->device_ref); -+ if (!ctx->frames_ref) { -+ ret = AVERROR(ENOMEM); -+ return ret; -+ } -+ -+ hwframes = (AVHWFramesContext*)ctx->frames_ref->data; -+ hwframes->format = AV_PIX_FMT_DRM_PRIME; -+ hwframes->sw_format = ctx->av_pix_fmt; -+ hwframes->width = ctx->width; -+ hwframes->height = ctx->height; -+ ret = av_hwframe_ctx_init(ctx->frames_ref); -+ if (ret < 0) -+ return ret; -+ } -+ - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->buf.length = VIDEO_MAX_PLANES; - avbuf->buf.m.planes = avbuf->planes; -@@ -417,6 +604,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - /* in MP, the V4L2 API states that buf.length means num_planes */ - if (avbuf->num_planes >= avbuf->buf.length) - break; -+ - if (avbuf->buf.m.planes[avbuf->num_planes].length) - avbuf->num_planes++; - } -@@ -431,14 +619,24 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; -- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -- PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -+ -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { -+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -+ PROT_READ | PROT_WRITE, MAP_SHARED, -+ buf_to_m2mctx(avbuf)->fd, -+ avbuf->buf.m.planes[i].m.mem_offset); -+ } - } else { - avbuf->plane_info[i].length = avbuf->buf.length; -- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, -- PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -+ -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { -+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, -+ PROT_READ | PROT_WRITE, MAP_SHARED, -+ buf_to_m2mctx(avbuf)->fd, -+ avbuf->buf.m.offset); -+ } - } - - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) -@@ -447,18 +645,23 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - - avbuf->status = V4L2BUF_AVAILABLE; - -- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -- return 0; -- - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -- avbuf->buf.m.planes = avbuf->planes; - avbuf->buf.length = avbuf->num_planes; -- -+ avbuf->buf.m.planes = avbuf->planes; - } else { - avbuf->buf.bytesused = avbuf->planes[0].bytesused; - avbuf->buf.length = avbuf->planes[0].length; - } - -+ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ return 0; -+ -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ ret = v4l2_buffer_export_drm(avbuf); -+ if (ret) -+ return ret; -+ } -+ - return ff_v4l2_buffer_enqueue(avbuf); - } - -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 7a57caf949..b6072baec8 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -27,6 +27,7 @@ - #include - #include - -+#include "libavutil/hwcontext_drm.h" - #include "avcodec.h" - - enum V4L2Buffer_status { -@@ -42,6 +43,9 @@ typedef struct V4L2Buffer { - /* each 
buffer needs to have a reference to its context */ - struct V4L2Context *context; - -+ /* DRM descriptor */ -+ AVDRMFrameDescriptor drm_frame; -+ - /* This object is refcounted per-plane, so we need to keep track - * of how many context-refs we are holding. */ - AVBufferRef *context_ref; -@@ -127,5 +131,4 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); - */ - int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); - -- - #endif // AVCODEC_V4L2_BUFFERS_H -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index efcb0426e4..5c51399a4c 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -263,6 +263,12 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) - /* if we are draining and there are no more capture buffers queued in the driver we are done */ - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { - for (i = 0; i < ctx->num_buffers; i++) { -+ /* capture buffer initialization happens during decode hence -+ * detection happens at runtime -+ */ -+ if (!ctx->buffers) -+ break; -+ - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) - goto start; - } -@@ -393,22 +399,54 @@ static int v4l2_release_buffers(V4L2Context* ctx) - struct v4l2_requestbuffers req = { - .memory = V4L2_MEMORY_MMAP, - .type = ctx->type, -- .count = 0, /* 0 -> unmaps buffers from the driver */ -+ .count = 0, /* 0 -> unmap all buffers from the driver */ - }; -- int i, j; -+ int ret, i, j; - - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer *buffer = &ctx->buffers[i]; - - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; -+ -+ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ /* output buffers are not EXPORTED */ -+ goto unmap; -+ } -+ -+ if (ctx_to_m2mctx(ctx)->output_drm) { -+ /* use the DRM frame to close */ -+ if (buffer->drm_frame.objects[j].fd >= 0) { -+ if (close(buffer->drm_frame.objects[j].fd) < 0) { -+ av_log(logger(ctx), AV_LOG_ERROR, "%s close drm fd " -+ "[buffer=%2d, plane=%d, fd=%2d] - %s \n", -+ ctx->name, i, j, buffer->drm_frame.objects[j].fd, -+ av_err2str(AVERROR(errno))); -+ } -+ } -+ } -+unmap: - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) -- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", -+ ctx->name, av_err2str(AVERROR(errno))); - } - } - -- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); -+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); -+ if (ret < 0) { -+ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", -+ ctx->name, av_err2str(AVERROR(errno))); -+ -+ if (ctx_to_m2mctx(ctx)->output_drm) -+ av_log(logger(ctx), AV_LOG_ERROR, -+ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" -+ "for all buffers: \n" -+ " 1. drmModeRmFB(..)\n" -+ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... 
)\n"); -+ } -+ -+ return ret; - } - - static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -@@ -501,6 +539,24 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) - return 0; - } - -+static AVRational v4l2_get_sar(V4L2Context* ctx) -+{ -+ struct AVRational sar = { 1, 1 }; -+ struct v4l2_cropcap cropcap; -+ int ret; -+ -+ memset(&cropcap, 0, sizeof(cropcap)); -+ cropcap.type = ctx->type; -+ -+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_CROPCAP, &cropcap); -+ if (ret) -+ return sar; -+ -+ sar.num = cropcap.pixelaspect.numerator; -+ sar.den = cropcap.pixelaspect.denominator; -+ return sar; -+} -+ - /***************************************************************************** - * - * V4L2 Context Interface -@@ -574,6 +630,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) - int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame) - { - V4L2Buffer* avbuf = NULL; -+ int ret; - - /* - * blocks until: -@@ -588,7 +645,12 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame) - return AVERROR(EAGAIN); - } - -- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ ret = ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ if (ret) -+ return ret; -+ -+ frame->sample_aspect_ratio = v4l2_get_sar(ctx); -+ return 0; - } - - int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) -@@ -692,9 +754,8 @@ int ff_v4l2_context_init(V4L2Context* ctx) - ctx->buffers[i].context = ctx; - ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); - if (ret < 0) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s buffer initialization (%s)\n", ctx->name, av_err2str(ret)); -- av_free(ctx->buffers); -- return ret; -+ av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); -+ goto error; - } - } - -@@ -707,4 +768,12 @@ int ff_v4l2_context_init(V4L2Context* ctx) - V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); - - return 0; -+ -+error: -+ v4l2_release_buffers(ctx); -+ -+ av_free(ctx->buffers); -+ ctx->buffers = NULL; -+ -+ return ret; - } -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 632f1d0aac..9a1dbbea60 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -91,6 +91,8 @@ typedef struct V4L2Context { - */ - int done; - -+ AVBufferRef *frames_ref; -+ - } V4L2Context; - - /** -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index 427e165f58..7896326e80 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -159,7 +159,9 @@ static int v4l2_configure_contexts(V4L2m2mContext* s) - goto error; - } - -- /* decoder's buffers need to be updated at a later stage */ -+ /* decoder's capture buffers are updated during v4l2_try_start once we find -+ * the valid format. 
-+ */ - if (!av_codec_is_decoder(s->avctx->codec)) { - ret = ff_v4l2_context_init(&s->capture); - if (ret) { -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 0d4671beb1..7e075e7e80 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -38,7 +38,7 @@ - - #define V4L_M2M_DEFAULT_OPTS \ - { "num_output_buffers", "Number of buffers in the output context",\ -- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS } -+ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 2 }, 1, INT_MAX, FLAGS } - - typedef struct V4L2m2mContext { - char devname[PATH_MAX]; -@@ -59,6 +59,11 @@ typedef struct V4L2m2mContext { - - /* Reference to self; only valid while codec is active. */ - AVBufferRef *self_ref; -+ -+ AVBufferRef *device_ref; -+ -+ /* generate DRM frames */ -+ int output_drm; - } V4L2m2mContext; - - typedef struct V4L2m2mPriv -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index d0601f0e2f..df46d4af46 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -23,12 +23,20 @@ - - #include - #include -+ -+#include "config.h" -+ -+#include "libavutil/hwcontext.h" -+#include "libavutil/hwcontext_drm.h" - #include "libavutil/pixfmt.h" - #include "libavutil/pixdesc.h" - #include "libavutil/opt.h" - #include "libavcodec/avcodec.h" - #include "libavcodec/decode.h" - -+#include "libavcodec/hwaccel.h" -+#include "libavcodec/internal.h" -+ - #include "v4l2_context.h" - #include "v4l2_m2m.h" - #include "v4l2_fmt.h" -@@ -86,8 +94,8 @@ static int v4l2_try_start(AVCodecContext *avctx) - if (!capture->buffers) { - ret = ff_v4l2_context_init(capture); - if (ret) { -- av_log(avctx, AV_LOG_DEBUG, "can't request output buffers\n"); -- return ret; -+ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); -+ return AVERROR(ENOMEM); - } - } - -@@ -125,6 +133,8 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) - return 0; - } - -+static AVPacket saved_avpkt = { 0 }; -+ - static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - { - V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; -@@ -133,9 +143,14 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - AVPacket avpkt = {0}; - int ret; - -- ret = ff_decode_get_packet(avctx, &avpkt); -- if (ret < 0 && ret != AVERROR_EOF) -- return ret; -+ if (saved_avpkt.size) { -+ avpkt = saved_avpkt; -+ memset(&saved_avpkt, 0, sizeof(saved_avpkt)); -+ } else { -+ ret = ff_decode_get_packet(avctx, &avpkt); -+ if (ret < 0 && ret != AVERROR_EOF) -+ return ret; -+ } - - if (s->draining) - goto dequeue; -@@ -144,6 +159,8 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - if (ret < 0) { - if (ret != AVERROR(ENOMEM)) - return ret; -+ -+ saved_avpkt = avpkt; - /* no input buffers available, continue dequeing */ - } - -@@ -151,12 +168,18 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - ret = v4l2_try_start(avctx); - if (ret) { - av_packet_unref(&avpkt); -+ -+ /* cant recover */ -+ if (ret == AVERROR(ENOMEM)) -+ return ret; -+ - return 0; - } - } - - dequeue: -- av_packet_unref(&avpkt); -+ if (!saved_avpkt.size) -+ av_packet_unref(&avpkt); - return ff_v4l2_context_dequeue_frame(capture, frame); - } - -@@ -186,6 +209,36 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; - capture->av_pix_fmt = avctx->pix_fmt; - -+#if !CONFIG_VOUT_DRM_KLUDGE -+ /* the client requests the codec to generate DRM frames: -+ * - data[0] will therefore point to 
the returned AVDRMFrameDescriptor -+ * check the ff_v4l2_buffer_to_avframe conversion function. -+ * - the DRM frame format is passed in the DRM frame descriptor layer. -+ * check the v4l2_get_drm_frame function. -+ */ -+ switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { -+ case AV_PIX_FMT_DRM_PRIME: -+ s->output_drm = 1; -+ break; -+ case AV_PIX_FMT_NONE: -+ return 0; -+ break; -+ default: -+ break; -+ } -+#else -+ s->output_drm = 1; -+#endif -+ -+ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); -+ if (!s->device_ref) { -+ ret = AVERROR(ENOMEM); -+ return ret; -+ } -+ ret = av_hwdevice_ctx_init(s->device_ref); -+ if (ret < 0) -+ return ret; -+ - ret = ff_v4l2_m2m_codec_init(avctx); - if (ret) { - V4L2m2mPriv *priv = avctx->priv_data; -@@ -199,6 +252,25 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - return v4l2_prepare_decoder(s); - } - -+static void v4l2_flush(AVCodecContext *avctx) -+{ -+ V4L2m2mPriv *priv = avctx->priv_data; -+ V4L2m2mContext* s = priv->context; -+ int ret; -+ -+ /* wait for pending buffer references */ -+ if (atomic_load(&s->refcount)) -+ while(sem_wait(&s->refsync) == -1 && errno == EINTR); -+ -+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); -+ if (ret) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name); -+ -+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); -+ if (ret) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name); -+} -+ - #define OFFSET(x) offsetof(V4L2m2mPriv, x) - #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM - -@@ -209,30 +281,41 @@ static const AVOption options[] = { - { NULL}, - }; - --#define M2MDEC(NAME, LONGNAME, CODEC, bsf_name) \ --static const AVClass v4l2_m2m_ ## NAME ## _dec_class = {\ -- .class_name = #NAME "_v4l2_m2m_decoder",\ -- .item_name = av_default_item_name,\ -- .option = options,\ -- .version = LIBAVUTIL_VERSION_INT,\ --};\ --\ --AVCodec ff_ ## NAME ## _v4l2m2m_decoder = { \ -- .name = #NAME "_v4l2m2m" ,\ -- .long_name = NULL_IF_CONFIG_SMALL("V4L2 mem2mem " LONGNAME " decoder wrapper"),\ -- .type = AVMEDIA_TYPE_VIDEO,\ -- .id = CODEC ,\ -- .priv_data_size = sizeof(V4L2m2mPriv),\ -- .priv_class = &v4l2_m2m_ ## NAME ## _dec_class,\ -- .init = v4l2_decode_init,\ -- .receive_frame = v4l2_receive_frame,\ -- .close = ff_v4l2_m2m_codec_end,\ -- .bsfs = bsf_name, \ -- .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | \ -- AV_CODEC_CAP_AVOID_PROBING, \ -- .wrapper_name = "v4l2m2m", \ -+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { -+ HW_CONFIG_INTERNAL(DRM_PRIME), -+ NULL - }; - -+#define M2MDEC_CLASS(NAME) \ -+ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ -+ .class_name = #NAME "_v4l2_m2m_decoder", \ -+ .item_name = av_default_item_name, \ -+ .option = options, \ -+ .version = LIBAVUTIL_VERSION_INT, \ -+ }; -+ -+#define M2MDEC(NAME, LONGNAME, CODEC, bsf_name) \ -+ M2MDEC_CLASS(NAME) \ -+ AVCodec ff_ ## NAME ## _v4l2m2m_decoder = { \ -+ .name = #NAME "_v4l2m2m" , \ -+ .long_name = NULL_IF_CONFIG_SMALL("V4L2 mem2mem " LONGNAME " decoder wrapper"), \ -+ .type = AVMEDIA_TYPE_VIDEO, \ -+ .id = CODEC , \ -+ .priv_data_size = sizeof(V4L2m2mPriv), \ -+ .priv_class = &v4l2_m2m_ ## NAME ## _dec_class, \ -+ .init = v4l2_decode_init, \ -+ .receive_frame = v4l2_receive_frame, \ -+ .close = ff_v4l2_m2m_codec_end, \ -+ .flush = v4l2_flush, \ -+ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ -+ AV_PIX_FMT_NV12, \ -+ AV_PIX_FMT_NONE}, \ -+ .bsfs = bsf_name, \ -+ 
.hw_configs = v4l2_m2m_hw_configs, \ -+ .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ -+ .wrapper_name = "v4l2m2m", \ -+ }; -+ - M2MDEC(h264, "H.264", AV_CODEC_ID_H264, "h264_mp4toannexb"); - M2MDEC(hevc, "HEVC", AV_CODEC_ID_HEVC, "hevc_mp4toannexb"); - M2MDEC(mpeg1, "MPEG1", AV_CODEC_ID_MPEG1VIDEO, NULL); -diff --git a/libavcodec/v4l2_phase.c b/libavcodec/v4l2_phase.c -new file mode 100644 -index 0000000000..0a7f6abd33 ---- /dev/null -+++ b/libavcodec/v4l2_phase.c -@@ -0,0 +1,140 @@ -+// v4l2_phase.c -+ -+#include -+#include -+#include -+ -+#include "libavutil/log.h" -+#include "v4l2_phase.h" -+ -+typedef struct phase_envss { -+ unsigned int last_order; -+ pthread_mutex_t lock; -+ pthread_cond_t cond; -+} phase_env; -+ -+struct V4L2PhaseControl { -+ unsigned int order; -+ unsigned int phase_count; -+ phase_env p[V4L2PHASE_PHASE_COUNT]; -+}; -+ -+ -+unsigned int ff_v4l2_phase_order_next(V4L2PhaseControl * const pc) -+{ -+ return ++pc->order; -+} -+ -+// Phase isn't required but it acts as a check that we know what we are doing -+int -+ff_v4l2_phase_claim(V4L2PhaseInfo * const pi, unsigned int phase) -+{ -+ V4L2PhaseControl *const pc = pi->ctrl; -+ phase_env * const p = pc->p + phase; -+ -+ if (pi->n2 != phase * 2) { -+ av_log(NULL, AV_LOG_ERROR, "%s: Unexpected phase: req=%d, cur=%d/%d\n", __func__, phase, pi->n2 >> 1, pi->n2 & 1); -+ return -1; -+ } -+ -+ pthread_mutex_lock(&p->lock); -+ -+ while (pi->order != p->last_order + 1) { -+ pthread_cond_wait(&p->cond, &p->lock); -+ } -+ -+ pi->n2++; -+ pthread_mutex_unlock(&p->lock); -+ return 0; -+} -+ -+int -+ff_v4l2_phase_release(V4L2PhaseInfo * const pi, unsigned int phase) -+{ -+ V4L2PhaseControl *const pc = pi->ctrl; -+ phase_env * const p = pc->p + phase; -+ -+ if (pi->n2 != ((phase << 1) | 1)) { -+ av_log(NULL, AV_LOG_ERROR, "%s: Unexpected phase: req=%d, cur=%d/%d\n", __func__, phase, pi->n2 >> 1, pi->n2 & 1); -+ return -1; -+ } -+ -+ if (pi->order != p->last_order + 1) { -+ av_log(NULL, AV_LOG_ERROR, "%s: order_mismatch\n", __func__); -+ return -1; -+ } -+ -+ pthread_mutex_lock(&p->lock); -+ p->last_order = pi->order; -+ pi->n2++; -+ pthread_cond_broadcast(&p->cond); -+ pthread_mutex_unlock(&p->lock); -+ return 0; -+} -+ -+// Init the PhaseInfo, assign a new order, claim phase 0 -+int -+ff_v4l2_phase_start(V4L2PhaseInfo * const pi, V4L2PhaseControl * const pc) -+{ -+ pi->n2 = 0; -+ pi->ctrl = pc; -+ pi->order = ff_v4l2_phase_order_next(pc); -+ return ff_v4l2_phase_claim(pi, 0); -+} -+ -+// Release any claimed phase and claim+release all remaining phases -+void ff_v4l2_phase_abort(V4L2PhaseInfo * const pi) -+{ -+ V4L2PhaseControl *const pc = pi->ctrl; -+ -+ // Nothing to do -+ if (pi->n2 == 0 || pi->n2 >= pc->phase_count * 2) -+ return; -+ -+ // Run through all remaining phases -+ do { -+ if ((pi->n2 & 1) == 0) -+ ff_v4l2_phase_claim(pi, pi->n2 >> 1); -+ else -+ ff_v4l2_phase_release(pi, pi->n2 >> 1); -+ } while (pi->n2 < pc->phase_count * 2); -+} -+ -+ -+V4L2PhaseControl * -+ff_v4l2_phase_control_new(unsigned int phase_count) -+{ -+ V4L2PhaseControl * pc; -+ unsigned int i; -+ if (phase_count > V4L2PHASE_PHASE_COUNT) -+ return NULL; -+ if ((pc = av_mallocz(sizeof(*pc))) == NULL) -+ return NULL; -+ pc->phase_count = phase_count; -+ for (i = 0; i != phase_count; ++i) { -+ phase_env * const p = pc->p + i; -+ p->last_order = 0; -+ pthread_mutex_init(&p->lock, NULL); -+ pthread_cond_init(&p->cond, NULL); -+ } -+ return pc; -+} -+ -+void -+ff_v4l2_phase_control_deletez(V4L2PhaseControl ** const 
ppc) -+{ -+ V4L2PhaseControl * const pc = *ppc; -+ unsigned int i; -+ -+ if (pc == NULL) -+ return; -+ *ppc = NULL; -+ -+ for (i = 0; i != pc->phase_count; ++i) { -+ phase_env * const p = pc->p + i; -+ pthread_mutex_destroy(&p->lock); -+ pthread_cond_destroy(&p->cond); -+ } -+} -+ -+ -diff --git a/libavcodec/v4l2_phase.h b/libavcodec/v4l2_phase.h -new file mode 100644 -index 0000000000..392f22b988 ---- /dev/null -+++ b/libavcodec/v4l2_phase.h -@@ -0,0 +1,37 @@ -+// v4l2_phase.h -+#ifndef AVCODEC_V4L2_PHASE_H -+#define AVCODEC_V4L2_PHASE_H -+ -+#define V4L2PHASE_PHASE_COUNT 2 -+ -+struct V4L2PhaseControl; -+typedef struct V4L2PhaseControl V4L2PhaseControl; -+ -+typedef struct V4L2PhaseInfo { -+ unsigned int n2; // (phase << 1) | (claimed) -+ unsigned int order; -+ V4L2PhaseControl * ctrl; -+} V4L2PhaseInfo; -+ -+unsigned int ff_v4l2_phase_order_next(V4L2PhaseControl * const pc); -+ -+static inline int ff_v4l2_phase_started(const V4L2PhaseInfo * const pi) -+{ -+ return pi->n2 != 0; -+} -+ -+// Init the PhaseInfo, assign a new order, claim phase 0 -+int ff_v4l2_phase_start(V4L2PhaseInfo * const pi, V4L2PhaseControl * const pc); -+ -+// Phase isn't required but it acts as a check that we know what we are doing -+int ff_v4l2_phase_claim(V4L2PhaseInfo * const pi, unsigned int phase); -+int ff_v4l2_phase_release(V4L2PhaseInfo * const pi, unsigned int phase); -+ -+// Release any claimed phase and claim+release all remaining phases -+void ff_v4l2_phase_abort(V4L2PhaseInfo * const pi); -+ -+ -+V4L2PhaseControl * ff_v4l2_phase_control_new(unsigned int phase_count); -+void ff_v4l2_phase_control_deletez(V4L2PhaseControl ** const ppc); -+ -+#endif -diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c -new file mode 100644 -index 0000000000..56fafbb637 ---- /dev/null -+++ b/libavcodec/v4l2_request.c -@@ -0,0 +1,1028 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include "decode.h" -+#include "internal.h" -+#include "v4l2_request.h" -+#include "v4l2_phase.h" -+ -+uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame) -+{ -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ return req ? 
v4l2_timeval_to_ns(&req->capture.buffer.timestamp) : 0; -+} -+ -+int ff_v4l2_request_start_phase_control(AVFrame *frame, struct V4L2PhaseControl * ctrl) -+{ -+ V4L2RequestDescriptor * const req = (V4L2RequestDescriptor*)frame->data[0]; -+ return ff_v4l2_phase_start(&req->phase, ctrl); -+} -+ -+void ff_v4l2_request_abort_phase_control(AVFrame *frame) -+{ -+ if (frame != NULL && frame->data[0] != NULL) { -+ V4L2RequestDescriptor *const req = (V4L2RequestDescriptor *)frame->data[0]; -+ ff_v4l2_phase_abort(&req->phase); -+ } -+} -+ -+int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame) -+{ -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ memset(&req->drm, 0, sizeof(AVDRMFrameDescriptor)); -+ req->output.used = 0; -+ return 0; -+} -+ -+int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size) -+{ -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ memcpy(req->output.addr + req->output.used, data, size); -+ req->output.used += size; -+ return 0; -+} -+ -+static int v4l2_request_set_controls(V4L2RequestContext *ctx, int request_fd, struct v4l2_ext_control *control, int count) -+{ -+ struct v4l2_ext_controls controls = { -+ .controls = control, -+ .count = count, -+ .request_fd = request_fd, -+ .which = (request_fd >= 0) ? V4L2_CTRL_WHICH_REQUEST_VAL : 0, -+ }; -+ -+ if (!control || !count) -+ return 0; -+ -+ return ioctl(ctx->video_fd, VIDIOC_S_EXT_CTRLS, &controls); -+} -+ -+int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ -+ ret = v4l2_request_set_controls(ctx, -1, control, count); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); -+ return AVERROR(EINVAL); -+ } -+ -+ return ret; -+} -+ -+int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id) -+{ -+ int ret; -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ struct v4l2_queryctrl control = { -+ .id = id, -+ }; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCTRL, &control); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: query control failed, %s (%d)\n", __func__, strerror(errno), errno); -+ return AVERROR(EINVAL); -+ } -+ -+ return control.default_value; -+} -+ -+static int v4l2_request_queue_buffer(V4L2RequestContext *ctx, int request_fd, V4L2RequestBuffer *buf, uint32_t flags) -+{ -+ struct v4l2_plane planes[1] = {}; -+ struct v4l2_buffer buffer = { -+ .type = buf->buffer.type, -+ .memory = buf->buffer.memory, -+ .index = buf->index, -+ .timestamp.tv_usec = ctx->timestamp, -+ .bytesused = buf->used, -+ .request_fd = request_fd, -+ .flags = ((request_fd >= 0) ? 
V4L2_BUF_FLAG_REQUEST_FD : 0) | flags, -+ }; -+ -+ buf->buffer.timestamp = buffer.timestamp; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { -+ planes[0].bytesused = buf->used; -+ buffer.bytesused = 0; -+ buffer.length = 1; -+ buffer.m.planes = planes; -+ } -+ -+ return ioctl(ctx->video_fd, VIDIOC_QBUF, &buffer); -+} -+ -+static int v4l2_request_dequeue_buffer(V4L2RequestContext *ctx, V4L2RequestBuffer *buf) -+{ -+ int ret; -+ struct v4l2_plane planes[1] = {}; -+ struct v4l2_buffer buffer = { -+ .type = buf->buffer.type, -+ .memory = buf->buffer.memory, -+ .index = buf->index, -+ }; -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { -+ buffer.length = 1; -+ buffer.m.planes = planes; -+ } -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_DQBUF, &buffer); -+ if (ret < 0) -+ return ret; -+ -+ buf->buffer.timestamp = buffer.timestamp; -+ return 0; -+} -+ -+const uint32_t v4l2_request_capture_pixelformats[] = { -+ V4L2_PIX_FMT_NV12_COL128, -+ V4L2_PIX_FMT_NV12_10_COL128, -+ V4L2_PIX_FMT_NV12, -+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED -+ V4L2_PIX_FMT_SUNXI_TILED_NV12, -+#endif -+}; -+ -+static int v4l2_request_set_drm_descriptor(V4L2RequestDescriptor *req, struct v4l2_format *format) -+{ -+ AVDRMFrameDescriptor *desc = &req->drm; -+ AVDRMLayerDescriptor *layer = &desc->layers[0]; -+ uint32_t pixelformat = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat; -+ -+ switch (pixelformat) { -+ case V4L2_PIX_FMT_NV12: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; -+ break; -+ case V4L2_PIX_FMT_NV12_COL128: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(format->fmt.pix.bytesperline); -+ break; -+ case V4L2_PIX_FMT_NV12_10_COL128: -+ layer->format = DRM_FORMAT_P030; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(format->fmt.pix.bytesperline); -+ break; -+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED -+ case V4L2_PIX_FMT_SUNXI_TILED_NV12: -+ layer->format = DRM_FORMAT_NV12; -+ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; -+ break; -+#endif -+ default: -+ return -1; -+ } -+ -+ desc->nb_objects = 1; -+ desc->objects[0].fd = req->capture.fd; -+ desc->objects[0].size = req->capture.size; -+ -+ desc->nb_layers = 1; -+ layer->nb_planes = 2; -+ -+ layer->planes[0].object_index = 0; -+ layer->planes[0].offset = 0; -+ layer->planes[0].pitch = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.plane_fmt[0].bytesperline : format->fmt.pix.bytesperline; -+ -+ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = format->fmt.pix.height * 128; -+ layer->planes[0].pitch = format->fmt.pix.width; -+ layer->planes[1].pitch = format->fmt.pix.width; -+ } -+ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = format->fmt.pix.height * 128; -+ layer->planes[0].pitch = format->fmt.pix.width * 2; // Lies but it keeps DRM import happy -+ layer->planes[1].pitch = format->fmt.pix.width * 2; -+ } -+ else { -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = layer->planes[0].pitch * (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? 
format->fmt.pix_mp.height : format->fmt.pix.height);
-+        layer->planes[1].pitch = layer->planes[0].pitch;
-+    }
-+
-+    return 0;
-+}
-+
-+static int v4l2_request_queue_decode(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice)
-+{
-+    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
-+    V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0];
-+    struct timeval tv = { 2, 0 };
-+    fd_set except_fds;
-+    int ret;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p used=%u controls=%d index=%d fd=%d request_fd=%d first_slice=%d last_slice=%d\n", __func__, avctx, req->output.used, count, req->capture.index, req->capture.fd, req->request_fd, first_slice, last_slice);
-+
-+    if (first_slice)
-+        ctx->timestamp++;
-+
-+    ret = v4l2_request_set_controls(ctx, req->request_fd, control, count);
-+    if (ret < 0) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: set controls failed for request %d, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno);
-+        return -1;
-+    }
-+
-+    memset(req->output.addr + req->output.used, 0, AV_INPUT_BUFFER_PADDING_SIZE);
-+
-+    ret = v4l2_request_queue_buffer(ctx, req->request_fd, &req->output, last_slice ? 0 : V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF);
-+    if (ret < 0) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: queue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno);
-+        return -1;
-+    }
-+
-+    if (first_slice) {
-+        ret = v4l2_request_queue_buffer(ctx, -1, &req->capture, 0);
-+        if (ret < 0) {
-+            av_log(avctx, AV_LOG_ERROR, "%s: queue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno);
-+            return -1;
-+        }
-+    }
-+
-+    // NOTE: do we need to dequeue when the request fails or times out?
-+
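The step below is the heart of the stateless-decode loop this patch removes: per-frame codec controls and the bitstream buffer have been staged against a request fd, and the driver only starts decoding once that request is queued on the media device. A minimal sketch of one such cycle, with error handling elided; the function name and fd parameters are illustrative, not part of the deleted code:

    #include <sys/ioctl.h>
    #include <sys/select.h>
    #include <linux/media.h>
    #include <linux/videodev2.h>

    static int run_one_request(int media_fd, int video_fd,
                               struct v4l2_ext_control *ctrls, unsigned int count,
                               struct v4l2_buffer *bitstream_buf)
    {
        int request_fd;
        fd_set except_fds;
        struct timeval tv = { 2, 0 };

        /* allocate a request on the media device */
        if (ioctl(media_fd, MEDIA_IOC_REQUEST_ALLOC, &request_fd) < 0)
            return -1;

        /* stage the per-frame codec controls in the request */
        struct v4l2_ext_controls wrapper = {
            .which      = V4L2_CTRL_WHICH_REQUEST_VAL,
            .request_fd = request_fd,
            .controls   = ctrls,
            .count      = count,
        };
        if (ioctl(video_fd, VIDIOC_S_EXT_CTRLS, &wrapper) < 0)
            return -1;

        /* queue the bitstream buffer against the same request */
        bitstream_buf->flags |= V4L2_BUF_FLAG_REQUEST_FD;
        bitstream_buf->request_fd = request_fd;
        if (ioctl(video_fd, VIDIOC_QBUF, bitstream_buf) < 0)
            return -1;

        /* fire the request; completion is signalled as an exceptional
         * condition (POLLPRI) on the request fd */
        if (ioctl(request_fd, MEDIA_REQUEST_IOC_QUEUE, NULL) < 0)
            return -1;
        FD_ZERO(&except_fds);
        FD_SET(request_fd, &except_fds);
        if (select(request_fd + 1, NULL, NULL, &except_fds, &tv) <= 0)
            return -1;

        /* recycle the request fd for the next frame */
        return ioctl(request_fd, MEDIA_REQUEST_IOC_REINIT, NULL);
    }

The select() on the exception set mirrors the code below; the deleted implementation amortizes MEDIA_IOC_REQUEST_ALLOC by keeping one request fd per frame descriptor and reusing it via MEDIA_REQUEST_IOC_REINIT instead of allocating per cycle.

-+    // 4.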
queue request and wait -+ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_QUEUE, NULL); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: queue request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ goto fail; -+ } -+ -+ FD_ZERO(&except_fds); -+ FD_SET(req->request_fd, &except_fds); -+ -+ ret = select(req->request_fd + 1, NULL, NULL, &except_fds, &tv); -+ if (ret == 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: request %d timeout\n", __func__, req->request_fd); -+ goto fail; -+ } else if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: select request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ goto fail; -+ } -+ -+ ret = v4l2_request_dequeue_buffer(ctx, &req->output); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ -+ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ -+ if (last_slice) { -+ if (ff_v4l2_phase_started(&req->phase)) { -+ ff_v4l2_phase_release(&req->phase, 0); -+ ff_v4l2_phase_claim(&req->phase, 1); -+ } -+ -+ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); -+ -+ if (ff_v4l2_phase_started(&req->phase)) { -+ ff_v4l2_phase_release(&req->phase, 1); -+ } -+ -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); -+ return -1; -+ } -+ } -+ -+ // TODO: check errors -+ // buffer.flags & V4L2_BUF_FLAG_ERROR -+ -+ if (last_slice) -+ return v4l2_request_set_drm_descriptor(req, &ctx->format); -+ -+ return 0; -+ -+fail: -+ ret = v4l2_request_dequeue_buffer(ctx, &req->output); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); -+ -+ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); -+ -+ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); -+ if (ret < 0) -+ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); -+ -+ return -1; -+} -+ -+int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) -+{ -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; -+ -+ // fall back to queue each slice as a full frame -+ if ((req->output.capabilities & V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) != V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) -+ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); -+ -+ return v4l2_request_queue_decode(avctx, frame, control, count, first_slice, last_slice); -+} -+ -+int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count) -+{ -+ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); -+} -+ -+ -+static inline char safechar(unsigned int x) -+{ -+ x &= 0xff; -+ return x > 0x20 && x <= 0x7e ? 
x : '.';
-+}
-+
-+static const char * str_fourcc(char * buf, const unsigned int fcc)
-+{
-+    if (fcc == 0) {
-+        return "----";
-+    }
-+    buf[0] = safechar(fcc >> 0);
-+    buf[1] = safechar(fcc >> 8);
-+    buf[2] = safechar(fcc >> 16);
-+    buf[3] = safechar(fcc >> 24);
-+    buf[4] = 0;
-+    return buf;
-+}
-+
-+static int v4l2_request_try_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat)
-+{
-+    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
-+    char b0[5], b1[5];
-+    struct v4l2_fmtdesc fmtdesc = {
-+        .index = 0,
-+        .type = type,
-+    };
-+
-+    if (V4L2_TYPE_IS_OUTPUT(type)) {
-+        struct v4l2_create_buffers buffers = {
-+            .count = 0,
-+            .memory = V4L2_MEMORY_MMAP,
-+            .format.type = type,
-+        };
-+
-+        if (ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers) < 0) {
-+            av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno);
-+            return -1;
-+        }
-+
-+        if ((buffers.capabilities & V4L2_BUF_CAP_SUPPORTS_REQUESTS) != V4L2_BUF_CAP_SUPPORTS_REQUESTS) {
-+            av_log(avctx, AV_LOG_INFO, "%s: output buffer type does not support requests, capabilities %u\n", __func__, buffers.capabilities);
-+            return -1;
-+        }
-+    }
-+
-+    while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) {
-+        av_log(avctx, AV_LOG_INFO, "%s: pixelformat found: %s wants %s\n", __func__, str_fourcc(b0, fmtdesc.pixelformat), str_fourcc(b1, pixelformat));
-+        if (fmtdesc.pixelformat == pixelformat)
-+            return 0;
-+
-+        fmtdesc.index++;
-+    }
-+
-+    av_log(avctx, AV_LOG_INFO, "%s: pixelformat %s not supported for type %u\n", __func__, str_fourcc(b0, pixelformat), type);
-+    return -1;
-+}
-+
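The zero-count VIDIOC_CREATE_BUFS call above deserves a note: asking for zero buffers allocates nothing, but the driver still fills in the queue's capability flags, which is the probe pattern the deleted code relies on to detect request support before committing to a format. A stand-alone sketch of that probe (the helper name is illustrative):

    #include <sys/ioctl.h>
    #include <linux/videodev2.h>

    static int queue_supports_requests(int video_fd, enum v4l2_buf_type type)
    {
        struct v4l2_create_buffers probe = {
            .count       = 0,               /* allocate nothing... */
            .memory      = V4L2_MEMORY_MMAP,
            .format.type = type,
        };

        if (ioctl(video_fd, VIDIOC_CREATE_BUFS, &probe) < 0)
            return 0;

        /* ...but the queue's capability flags are still reported */
        return (probe.capabilities & V4L2_BUF_CAP_SUPPORTS_REQUESTS) != 0;
    }

-+static int v4l2_request_set_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat, uint32_t buffersize)
-+{
-+    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
-+    struct v4l2_format format = {
-+        .type = type,
-+    };
-+    int rv;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
-+        format.fmt.pix_mp.width = avctx->coded_width;
-+        format.fmt.pix_mp.height = avctx->coded_height;
-+        format.fmt.pix_mp.pixelformat = pixelformat;
-+        format.fmt.pix_mp.plane_fmt[0].sizeimage = buffersize;
-+        format.fmt.pix_mp.num_planes = 1;
-+    } else {
-+        format.fmt.pix.width = avctx->coded_width;
-+        format.fmt.pix.height = avctx->coded_height;
-+        format.fmt.pix.pixelformat = pixelformat;
-+        format.fmt.pix.sizeimage = buffersize;
-+    }
-+
-+    rv = ioctl(ctx->video_fd, VIDIOC_S_FMT, &format);
-+    av_log(avctx, AV_LOG_INFO, "%s: rv=%d\n", __func__, rv);
-+    return rv;
-+}
-+
-+static int v4l2_request_select_capture_format(AVCodecContext *avctx)
-+{
-+    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
-+    enum v4l2_buf_type type = ctx->format.type;
-+
-+#if 0
-+    struct v4l2_format format = {
-+        .type = type,
-+    };
-+    struct v4l2_fmtdesc fmtdesc = {
-+        .index = 0,
-+        .type = type,
-+    };
-+    uint32_t pixelformat;
-+    int i;
-+
-+    if (ioctl(ctx->video_fd, VIDIOC_G_FMT, &format) < 0) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno);
-+        return -1;
-+    }
-+
-+    pixelformat = V4L2_TYPE_IS_MULTIPLANAR(type) ?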
format.fmt.pix_mp.pixelformat : format.fmt.pix.pixelformat; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { -+ if (pixelformat == v4l2_request_capture_pixelformats[i]) -+ return v4l2_request_set_format(avctx, type, pixelformat, 0); -+ } -+ -+ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { -+ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { -+ if (fmtdesc.pixelformat == v4l2_request_capture_pixelformats[i]) -+ return v4l2_request_set_format(avctx, type, fmtdesc.pixelformat, 0); -+ } -+ -+ fmtdesc.index++; -+ } -+#else -+ for (int i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { -+ uint32_t pixelformat = v4l2_request_capture_pixelformats[i]; -+ if (!v4l2_request_try_format(avctx, type, pixelformat)) -+ return v4l2_request_set_format(avctx, type, pixelformat, 0); -+ } -+#endif -+ -+ return -1; -+} -+ -+static int v4l2_request_probe_video_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret = AVERROR(EINVAL); -+ struct v4l2_capability capability = {0}; -+ unsigned int capabilities = 0; -+ -+ const char *path = udev_device_get_devnode(device); -+ if (!path) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get video device devnode failed\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+// ctx->video_fd = open(path, O_RDWR | O_NONBLOCK, 0); -+ ctx->video_fd = open(path, O_RDWR, 0); -+ if (ctx->video_fd < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCAP, &capability); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get video capability failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ if (capability.capabilities & V4L2_CAP_DEVICE_CAPS) -+ capabilities = capability.device_caps; -+ else -+ capabilities = capability.capabilities; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s capabilities=%u\n", __func__, avctx, ctx, path, capabilities); -+ -+ if ((capabilities & V4L2_CAP_STREAMING) != V4L2_CAP_STREAMING) { -+ av_log(avctx, AV_LOG_ERROR, "%s: missing required streaming capability\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) == V4L2_CAP_VIDEO_M2M_MPLANE) { -+ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -+ } else if ((capabilities & V4L2_CAP_VIDEO_M2M) == V4L2_CAP_VIDEO_M2M) { -+ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ } else { -+ av_log(avctx, AV_LOG_ERROR, "%s: missing required mem2mem capability\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = v4l2_request_try_format(avctx, ctx->output_type, pixelformat); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: try output format failed\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = v4l2_request_set_controls(ctx, -1, control, count); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = v4l2_request_set_format(avctx, ctx->output_type, pixelformat, buffersize); -+ if (ret < 0) { -+ 
av_log(avctx, AV_LOG_ERROR, "%s: set output format failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = v4l2_request_select_capture_format(avctx); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_WARNING, "%s: select capture format failed\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ return 0; -+ -+fail: -+ if (ctx->video_fd >= 0) { -+ close(ctx->video_fd); -+ ctx->video_fd = -1; -+ } -+ return ret; -+} -+ -+static int v4l2_request_init_context(AVCodecContext *avctx) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &ctx->format); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { -+ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, ctx->format.fmt.pix_mp.pixelformat, ctx->format.fmt.pix_mp.width, ctx->format.fmt.pix_mp.height, ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline, ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage, ctx->format.fmt.pix_mp.num_planes); -+ } else { -+ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, ctx->format.fmt.pix.pixelformat, ctx->format.fmt.pix.width, ctx->format.fmt.pix.height, ctx->format.fmt.pix.bytesperline, ctx->format.fmt.pix.sizeimage); -+ } -+ -+ ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM); -+ if (ret < 0) -+ goto fail; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->output_type); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: output stream on failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->format.type); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: capture stream on failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ return 0; -+ -+fail: -+ ff_v4l2_request_uninit(avctx); -+ return ret; -+} -+ -+static int v4l2_request_probe_media_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ struct media_device_info device_info = {0}; -+ struct media_v2_topology topology = {0}; -+ struct media_v2_interface *interfaces = NULL; -+ struct udev *udev = udev_device_get_udev(device); -+ struct udev_device *video_device; -+ dev_t devnum; -+ -+ const char *path = udev_device_get_devnode(device); -+ if (!path) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get media device devnode failed\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ctx->media_fd = open(path, O_RDWR, 0); -+ if (ctx->media_fd < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = ioctl(ctx->media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get media device info failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s driver=%s\n", __func__, avctx, ctx, path, device_info.driver); -+ -+ ret = 
ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ if (topology.num_interfaces <= 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: media device has no interfaces\n", __func__); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ interfaces = av_mallocz(topology.num_interfaces * sizeof(struct media_v2_interface)); -+ if (!interfaces) { -+ av_log(avctx, AV_LOG_ERROR, "%s: allocating media interface struct failed\n", __func__); -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; -+ ret = ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno); -+ ret = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ ret = AVERROR(EINVAL); -+ for (int i = 0; i < topology.num_interfaces; i++) { -+ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) -+ continue; -+ -+ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); -+ video_device = udev_device_new_from_devnum(udev, 'c', devnum); -+ if (!video_device) { -+ av_log(avctx, AV_LOG_ERROR, "%s: video_device=%p\n", __func__, video_device); -+ continue; -+ } -+ -+ ret = v4l2_request_probe_video_device(video_device, avctx, pixelformat, buffersize, control, count); -+ udev_device_unref(video_device); -+ -+ if (!ret) -+ break; -+ } -+ -+ av_freep(&interfaces); -+ return ret; -+ -+fail: -+ av_freep(&interfaces); -+ if (ctx->media_fd >= 0) { -+ close(ctx->media_fd); -+ ctx->media_fd = -1; -+ } -+ return ret; -+} -+ -+int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret = AVERROR(EINVAL); -+ struct udev *udev; -+ struct udev_enumerate *enumerate; -+ struct udev_list_entry *devices; -+ struct udev_list_entry *entry; -+ struct udev_device *device; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p hw_device_ctx=%p hw_frames_ctx=%p\n", __func__, avctx, avctx->hw_device_ctx, avctx->hw_frames_ctx); -+ -+ ctx->media_fd = -1; -+ ctx->video_fd = -1; -+ ctx->timestamp = 0; -+ -+ udev = udev_new(); -+ if (!udev) { -+ av_log(avctx, AV_LOG_ERROR, "%s: allocating udev context failed\n", __func__); -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ enumerate = udev_enumerate_new(udev); -+ if (!enumerate) { -+ av_log(avctx, AV_LOG_ERROR, "%s: allocating udev enumerator failed\n", __func__); -+ ret = AVERROR(ENOMEM); -+ goto fail; -+ } -+ -+ udev_enumerate_add_match_subsystem(enumerate, "media"); -+ udev_enumerate_scan_devices(enumerate); -+ -+ devices = udev_enumerate_get_list_entry(enumerate); -+ udev_list_entry_foreach(entry, devices) { -+ const char *path = udev_list_entry_get_name(entry); -+ if (!path) -+ continue; -+ -+ device = udev_device_new_from_syspath(udev, path); -+ if (!device) -+ continue; -+ -+ ret = v4l2_request_probe_media_device(device, avctx, pixelformat, buffersize, control, count); -+ udev_device_unref(device); -+ -+ if (!ret) -+ break; -+ } -+ -+ udev_enumerate_unref(enumerate); -+ -+ if (!ret) -+ ret = v4l2_request_init_context(avctx); -+ -+fail: -+ udev_unref(udev); -+ return ret; -+} -+ -+int ff_v4l2_request_uninit(AVCodecContext *avctx) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ int ret; -+ -+ av_log(avctx, 
AV_LOG_DEBUG, "%s: avctx=%p ctx=%p\n", __func__, avctx, ctx);
-+
-+    if (ctx->video_fd >= 0) {
-+        ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->output_type);
-+        if (ret < 0)
-+            av_log(avctx, AV_LOG_ERROR, "%s: output stream off failed, %s (%d)\n", __func__, strerror(errno), errno);
-+
-+        ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->format.type);
-+        if (ret < 0)
-+            av_log(avctx, AV_LOG_ERROR, "%s: capture stream off failed, %s (%d)\n", __func__, strerror(errno), errno);
-+    }
-+
-+    if (avctx->hw_frames_ctx) {
-+        AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
-+        av_buffer_pool_flush(hwfc->pool);
-+    }
-+
-+    if (ctx->video_fd >= 0)
-+        close(ctx->video_fd);
-+
-+    if (ctx->media_fd >= 0)
-+        close(ctx->media_fd);
-+
-+    return 0;
-+}
-+
-+static int v4l2_request_buffer_alloc(AVCodecContext *avctx, V4L2RequestBuffer *buf, enum v4l2_buf_type type)
-+{
-+    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
-+    int ret;
-+    struct v4l2_plane planes[1] = {};
-+    struct v4l2_create_buffers buffers = {
-+        .count = 1,
-+        .memory = V4L2_MEMORY_MMAP,
-+        .format.type = type,
-+    };
-+
-+    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p buf=%p type=%u\n", __func__, avctx, buf, type);
-+
-+    ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &buffers.format);
-+    if (ret < 0) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: get format failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno);
-+        return ret;
-+    }
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(buffers.format.type)) {
-+        av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, buffers.format.fmt.pix_mp.pixelformat, buffers.format.fmt.pix_mp.width, buffers.format.fmt.pix_mp.height, buffers.format.fmt.pix_mp.plane_fmt[0].bytesperline, buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage, buffers.format.fmt.pix_mp.num_planes);
-+    } else {
-+        av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, buffers.format.fmt.pix.pixelformat, buffers.format.fmt.pix.width, buffers.format.fmt.pix.height, buffers.format.fmt.pix.bytesperline, buffers.format.fmt.pix.sizeimage);
-+    }
-+
-+    ret = ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers);
-+    if (ret < 0) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno);
-+        return ret;
-+    }
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
-+        buf->width = buffers.format.fmt.pix_mp.width;
-+        buf->height = buffers.format.fmt.pix_mp.height;
-+        buf->size = buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage;
-+        buf->buffer.length = 1;
-+        buf->buffer.m.planes = planes;
-+    } else {
-+        buf->width = buffers.format.fmt.pix.width;
-+        buf->height = buffers.format.fmt.pix.height;
-+        buf->size = buffers.format.fmt.pix.sizeimage;
-+    }
-+
-+    buf->index = buffers.index;
-+    buf->capabilities = buffers.capabilities;
-+    buf->used = 0;
-+
-+    buf->buffer.type = type;
-+    buf->buffer.memory = V4L2_MEMORY_MMAP;
-+    buf->buffer.index = buf->index;
-+
-+    ret = ioctl(ctx->video_fd, VIDIOC_QUERYBUF, &buf->buffer);
-+    if (ret < 0) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: query buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno);
-+        return ret;
-+    }
-+
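From here the two queue directions diverge: the output (bitstream) buffer is mmap'ed so slice data can be copied in, while the capture (decoded picture) buffer is exported as a dmabuf and handed to the renderer through the AVDRMFrameDescriptor. A minimal sketch of the export half, assuming a configured capture queue (the helper name is illustrative):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/videodev2.h>

    static int export_capture_dmabuf(int video_fd, enum v4l2_buf_type type,
                                     unsigned int index)
    {
        struct v4l2_exportbuffer exp = {
            .type  = type,
            .index = index,
            .flags = O_RDONLY,  /* downstream consumers only read pixels */
        };

        if (ioctl(video_fd, VIDIOC_EXPBUF, &exp) < 0)
            return -1;

        return exp.fd;  /* usable by DRM/KMS, EGL or another V4L2 device */
    }

-+    if (V4L2_TYPE_IS_OUTPUT(type)) {
-+        void *addr = mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->video_fd, V4L2_TYPE_IS_MULTIPLANAR(type) ?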
buf->buffer.m.planes[0].m.mem_offset : buf->buffer.m.offset); -+ if (addr == MAP_FAILED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: mmap failed, %s (%d)\n", __func__, strerror(errno), errno); -+ return -1; -+ } -+ -+ buf->addr = (uint8_t*)addr; -+ } else { -+ struct v4l2_exportbuffer exportbuffer = { -+ .type = type, -+ .index = buf->index, -+ .flags = O_RDONLY, -+ }; -+ -+ ret = ioctl(ctx->video_fd, VIDIOC_EXPBUF, &exportbuffer); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: export buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno); -+ return ret; -+ } -+ -+ buf->fd = exportbuffer.fd; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); -+ return 0; -+} -+ -+static void v4l2_request_buffer_free(V4L2RequestBuffer *buf) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); -+ -+ if (buf->addr) -+ munmap(buf->addr, buf->size); -+ -+ if (buf->fd >= 0) -+ close(buf->fd); -+} -+ -+static void v4l2_request_frame_free(void *opaque, uint8_t *data) -+{ -+ AVCodecContext *avctx = opaque; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)data; -+ -+ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p request_fd=%d\n", __func__, avctx, data, req->request_fd); -+ -+ if (req->request_fd >= 0) -+ close(req->request_fd); -+ -+ v4l2_request_buffer_free(&req->capture); -+ v4l2_request_buffer_free(&req->output); -+ -+ av_free(data); -+} -+ -+static AVBufferRef *v4l2_request_frame_alloc(void *opaque, int size) -+{ -+ AVCodecContext *avctx = opaque; -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ V4L2RequestDescriptor *req; -+ AVBufferRef *ref; -+ uint8_t *data; -+ int ret; -+ -+ data = av_mallocz(size); -+ if (!data) -+ return NULL; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); -+ -+ ref = av_buffer_create(data, size, v4l2_request_frame_free, avctx, 0); -+ if (!ref) { -+ av_freep(&data); -+ return NULL; -+ } -+ -+ req = (V4L2RequestDescriptor*)data; -+ req->request_fd = -1; -+ req->output.fd = -1; -+ req->capture.fd = -1; -+ -+ ret = v4l2_request_buffer_alloc(avctx, &req->output, ctx->output_type); -+ if (ret < 0) { -+ av_buffer_unref(&ref); -+ return NULL; -+ } -+ -+ ret = v4l2_request_buffer_alloc(avctx, &req->capture, ctx->format.type); -+ if (ret < 0) { -+ av_buffer_unref(&ref); -+ return NULL; -+ } -+ -+ ret = ioctl(ctx->media_fd, MEDIA_IOC_REQUEST_ALLOC, &req->request_fd); -+ if (ret < 0) { -+ av_log(avctx, AV_LOG_ERROR, "%s: request alloc failed, %s (%d)\n", __func__, strerror(errno), errno); -+ av_buffer_unref(&ref); -+ return NULL; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p request_fd=%d\n", __func__, avctx, size, data, req->request_fd); -+ return ref; -+} -+ -+static void v4l2_request_pool_free(void *opaque) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); -+} -+ -+static void v4l2_request_hwframe_ctx_free(AVHWFramesContext *hwfc) -+{ -+ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); -+ -+ av_buffer_pool_flush(hwfc->pool); -+ av_buffer_pool_uninit(&hwfc->pool); -+} -+ -+int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) -+{ -+ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; -+ AVHWFramesContext *hwfc = 
(AVHWFramesContext*)hw_frames_ctx->data; -+ -+ hwfc->format = AV_PIX_FMT_DRM_PRIME; -+ hwfc->sw_format = AV_PIX_FMT_NV12; -+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { -+ hwfc->width = ctx->format.fmt.pix_mp.width; -+ hwfc->height = ctx->format.fmt.pix_mp.height; -+ } else { -+ hwfc->width = ctx->format.fmt.pix.width; -+ hwfc->height = ctx->format.fmt.pix.height; -+ if (ctx->format.fmt.pix.pixelformat == V4L2_PIX_FMT_NV12_COL128) { -+ hwfc->sw_format = AV_PIX_FMT_RPI4_8; -+ } -+ else if (ctx->format.fmt.pix.pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { -+ hwfc->sw_format = AV_PIX_FMT_RPI4_10; -+ } -+ } -+ -+ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2RequestDescriptor), avctx, v4l2_request_frame_alloc, v4l2_request_pool_free); -+ if (!hwfc->pool) -+ return AVERROR(ENOMEM); -+ -+ hwfc->free = v4l2_request_hwframe_ctx_free; -+ -+ hwfc->initial_pool_size = 1; -+ -+ switch (avctx->codec_id) { -+ case AV_CODEC_ID_VP9: -+ hwfc->initial_pool_size += 8; -+ break; -+ case AV_CODEC_ID_VP8: -+ hwfc->initial_pool_size += 3; -+ break; -+ default: -+ hwfc->initial_pool_size += 2; -+ } -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); -+ -+ return 0; -+} -diff --git a/libavcodec/v4l2_request.h b/libavcodec/v4l2_request.h -new file mode 100644 -index 0000000000..be1e9111a6 ---- /dev/null -+++ b/libavcodec/v4l2_request.h -@@ -0,0 +1,92 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_V4L2_REQUEST_H
-+#define AVCODEC_V4L2_REQUEST_H
-+
-+#include
-+
-+#include "libavutil/hwcontext_drm.h"
-+#include "v4l2_phase.h"
-+
-+typedef struct V4L2RequestContext {
-+    int video_fd;
-+    int media_fd;
-+    enum v4l2_buf_type output_type;
-+    struct v4l2_format format;
-+    int timestamp;
-+} V4L2RequestContext;
-+
-+typedef struct V4L2RequestBuffer {
-+    int index;
-+    int fd;
-+    uint8_t *addr;
-+    uint32_t width;
-+    uint32_t height;
-+    uint32_t size;
-+    uint32_t used;
-+    uint32_t capabilities;
-+    struct v4l2_buffer buffer;
-+} V4L2RequestBuffer;
-+
-+struct V4L2PhaseControl;
-+
-+typedef struct V4L2PhaseEnv {
-+    struct V4L2PhaseEnv * next;
-+    struct V4L2PhaseControl * ctrl;
-+    unsigned int order;
-+} V4L2PhaseEnv;
-+
-+typedef struct V4L2RequestDescriptor {
-+    AVDRMFrameDescriptor drm;
-+    int request_fd;
-+    V4L2RequestBuffer output;
-+    V4L2RequestBuffer capture;
-+
-+    // Phase control
-+    V4L2PhaseInfo phase;
-+} V4L2RequestDescriptor;
-+
-+uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame);
-+
-+// Sets phase control on this frame & gives it an order
-+int ff_v4l2_request_start_phase_control(AVFrame *frame, struct V4L2PhaseControl * ctrl);
-+
-+// Had an error - release all phases
-+void ff_v4l2_request_abort_phase_control(AVFrame *frame);
-+
-+
-+int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame);
-+
-+int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size);
-+
-+int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count);
-+
-+int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id);
-+
-+int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice);
-+
-+int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count);
-+
-+int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count);
-+
-+int ff_v4l2_request_uninit(AVCodecContext *avctx);
-+
-+int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
-+
-+#endif /* AVCODEC_V4L2_REQUEST_H */
-diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c
-new file mode 100644
-index 0000000000..13fac3f6f9
---- /dev/null
-+++ b/libavcodec/v4l2_request_h264.c
-@@ -0,0 +1,448 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "h264dec.h" -+#include "hwaccel.h" -+#include "v4l2_request.h" -+#include "h264-ctrls.h" -+ -+typedef struct V4L2RequestControlsH264 { -+ struct v4l2_ctrl_h264_sps sps; -+ struct v4l2_ctrl_h264_pps pps; -+ struct v4l2_ctrl_h264_scaling_matrix scaling_matrix; -+ struct v4l2_ctrl_h264_decode_params decode_params; -+ struct v4l2_ctrl_h264_slice_params slice_params[16]; -+ int first_slice; -+} V4L2RequestControlsH264; -+ -+typedef struct V4L2RequestContextH264 { -+ V4L2RequestContext base; -+ int decode_mode; -+ int start_code; -+} V4L2RequestContextH264; -+ -+static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; -+ -+static void fill_weight_factors(struct v4l2_h264_weight_factors *factors, int list, const H264SliceContext *sl) -+{ -+ for (int i = 0; i < sl->ref_count[list]; i++) { -+ if (sl->pwt.luma_weight_flag[list]) { -+ factors->luma_weight[i] = sl->pwt.luma_weight[i][list][0]; -+ factors->luma_offset[i] = sl->pwt.luma_weight[i][list][1]; -+ } else { -+ factors->luma_weight[i] = 1 << sl->pwt.luma_log2_weight_denom; -+ factors->luma_offset[i] = 0; -+ } -+ for (int j = 0; j < 2; j++) { -+ if (sl->pwt.chroma_weight_flag[list]) { -+ factors->chroma_weight[i][j] = sl->pwt.chroma_weight[i][list][j][0]; -+ factors->chroma_offset[i][j] = sl->pwt.chroma_weight[i][list][j][1]; -+ } else { -+ factors->chroma_weight[i][j] = 1 << sl->pwt.chroma_log2_weight_denom; -+ factors->chroma_offset[i][j] = 0; -+ } -+ } -+ } -+} -+ -+static void fill_dpb_entry(struct v4l2_h264_dpb_entry *entry, const H264Picture *pic) -+{ -+ entry->reference_ts = ff_v4l2_request_get_capture_timestamp(pic->f); -+ entry->frame_num = pic->frame_num; -+ entry->pic_num = pic->pic_id; -+ entry->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID; -+ entry->flags |= (pic->reference & 3) << 4; -+ if (pic->reference) -+ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_ACTIVE; -+ if (pic->long_ref) -+ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM; -+ if (pic->field_picture) -+ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_FIELD_PICTURE; -+ if (pic->field_poc[0] != INT_MAX) -+ entry->top_field_order_cnt = pic->field_poc[0]; -+ if (pic->field_poc[1] != INT_MAX) -+ entry->bottom_field_order_cnt = pic->field_poc[1]; -+} -+ -+static void fill_dpb(struct v4l2_ctrl_h264_decode_params *decode, const H264Context *h) -+{ -+ int entries = 0; -+ -+ for (int i = 0; i < h->short_ref_count; i++) { -+ const H264Picture *pic = h->short_ref[i]; -+ if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX)) -+ fill_dpb_entry(&decode->dpb[entries++], pic); -+ } -+ -+ if (!h->long_ref_count) -+ return; -+ -+ for (int i = 0; i < FF_ARRAY_ELEMS(h->long_ref); i++) { -+ const H264Picture *pic = h->long_ref[i]; -+ if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX)) -+ fill_dpb_entry(&decode->dpb[entries++], pic); -+ } -+} -+ -+static uint8_t get_dpb_index(struct v4l2_ctrl_h264_decode_params *decode, const H264Ref *ref) -+{ -+ uint64_t timestamp; -+ -+ if (!ref->parent) -+ return 0; -+ -+ timestamp = ff_v4l2_request_get_capture_timestamp(ref->parent->f); -+ -+ for (uint8_t i = 0; i < FF_ARRAY_ELEMS(decode->dpb); i++) { -+ struct v4l2_h264_dpb_entry *entry = &decode->dpb[i]; -+ if ((entry->flags & V4L2_H264_DPB_ENTRY_FLAG_VALID) && -+ entry->reference_ts == timestamp) -+ // TODO: signal reference 
type, possible using top 2 bits -+ return i | ((ref->reference & 3) << 6); -+ } -+ -+ return 0; -+} -+ -+static void fill_sps(struct v4l2_ctrl_h264_sps *ctrl, const H264Context *h) -+{ -+ const SPS *sps = h->ps.sps; -+ -+ *ctrl = (struct v4l2_ctrl_h264_sps) { -+ .profile_idc = sps->profile_idc, -+ .constraint_set_flags = sps->constraint_set_flags, -+ .level_idc = sps->level_idc, -+ .seq_parameter_set_id = sps->sps_id, -+ .chroma_format_idc = sps->chroma_format_idc, -+ .bit_depth_luma_minus8 = sps->bit_depth_luma - 8, -+ .bit_depth_chroma_minus8 = sps->bit_depth_chroma - 8, -+ .log2_max_frame_num_minus4 = sps->log2_max_frame_num - 4, -+ .pic_order_cnt_type = sps->poc_type, -+ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, -+ .max_num_ref_frames = sps->ref_frame_count, -+ .num_ref_frames_in_pic_order_cnt_cycle = sps->poc_cycle_length, -+ //.offset_for_ref_frame[255] - not required? not set by libva-v4l2-request - copy sps->offset_for_ref_frame -+ .offset_for_non_ref_pic = sps->offset_for_non_ref_pic, -+ .offset_for_top_to_bottom_field = sps->offset_for_top_to_bottom_field, -+ .pic_width_in_mbs_minus1 = h->mb_width - 1, -+ .pic_height_in_map_units_minus1 = sps->frame_mbs_only_flag ? h->mb_height - 1 : h->mb_height / 2 - 1, -+ }; -+ -+ if (sps->residual_color_transform_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE; -+ if (sps->transform_bypass) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS; -+ if (sps->delta_pic_order_always_zero_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO; -+ if (sps->gaps_in_frame_num_allowed_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED; -+ if (sps->frame_mbs_only_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY; -+ if (sps->mb_aff) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD; -+ if (sps->direct_8x8_inference_flag) -+ ctrl->flags |= V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE; -+} -+ -+static void fill_pps(struct v4l2_ctrl_h264_pps *ctrl, const H264Context *h) -+{ -+ const PPS *pps = h->ps.pps; -+ const H264SliceContext *sl = &h->slice_ctx[0]; -+ -+ *ctrl = (struct v4l2_ctrl_h264_pps) { -+ .pic_parameter_set_id = sl->pps_id, -+ .seq_parameter_set_id = pps->sps_id, -+ .num_slice_groups_minus1 = pps->slice_group_count - 1, -+ .num_ref_idx_l0_default_active_minus1 = pps->ref_count[0] - 1, -+ .num_ref_idx_l1_default_active_minus1 = pps->ref_count[1] - 1, -+ .weighted_bipred_idc = pps->weighted_bipred_idc, -+ .pic_init_qp_minus26 = pps->init_qp - 26, -+ .pic_init_qs_minus26 = pps->init_qs - 26, -+ .chroma_qp_index_offset = pps->chroma_qp_index_offset[0], -+ .second_chroma_qp_index_offset = pps->chroma_qp_index_offset[1], -+ }; -+ -+ if (pps->cabac) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE; -+ if (pps->pic_order_present) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT; -+ if (pps->weighted_pred) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_WEIGHTED_PRED; -+ if (pps->deblocking_filter_parameters_present) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT; -+ if (pps->constrained_intra_pred) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED; -+ if (pps->redundant_pic_cnt_present) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT; -+ if (pps->transform_8x8_mode) -+ ctrl->flags |= V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE; -+} -+ -+static int v4l2_request_h264_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const 
H264Context *h = avctx->priv_data; -+ const PPS *pps = h->ps.pps; -+ const SPS *sps = h->ps.sps; -+ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; -+ -+ fill_sps(&controls->sps, h); -+ fill_pps(&controls->pps, h); -+ -+ memcpy(controls->scaling_matrix.scaling_list_4x4, pps->scaling_matrix4, sizeof(controls->scaling_matrix.scaling_list_4x4)); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[0], pps->scaling_matrix8[0], sizeof(controls->scaling_matrix.scaling_list_8x8[0])); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[1], pps->scaling_matrix8[3], sizeof(controls->scaling_matrix.scaling_list_8x8[1])); -+ -+ if (sps->chroma_format_idc == 3) { -+ memcpy(controls->scaling_matrix.scaling_list_8x8[2], pps->scaling_matrix8[1], sizeof(controls->scaling_matrix.scaling_list_8x8[2])); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[3], pps->scaling_matrix8[4], sizeof(controls->scaling_matrix.scaling_list_8x8[3])); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[4], pps->scaling_matrix8[2], sizeof(controls->scaling_matrix.scaling_list_8x8[4])); -+ memcpy(controls->scaling_matrix.scaling_list_8x8[5], pps->scaling_matrix8[5], sizeof(controls->scaling_matrix.scaling_list_8x8[5])); -+ } -+ -+ controls->decode_params = (struct v4l2_ctrl_h264_decode_params) { -+ .num_slices = 0, -+ .nal_ref_idc = h->nal_ref_idc, -+ .top_field_order_cnt = h->cur_pic_ptr->field_poc[0] != INT_MAX ? h->cur_pic_ptr->field_poc[0] : 0, -+ .bottom_field_order_cnt = h->cur_pic_ptr->field_poc[1] != INT_MAX ? h->cur_pic_ptr->field_poc[1] : 0, -+ }; -+ -+ if (h->picture_idr) -+ controls->decode_params.flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC; -+ -+ fill_dpb(&controls->decode_params, h); -+ -+ controls->first_slice = !FIELD_PICTURE(h) || h->first_field; -+ -+ return ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); -+} -+ -+static int v4l2_request_h264_queue_decode(AVCodecContext *avctx, int last_slice) -+{ -+ const H264Context *h = avctx->priv_data; -+ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; -+ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_SPS, -+ .ptr = &controls->sps, -+ .size = sizeof(controls->sps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_PPS, -+ .ptr = &controls->pps, -+ .size = sizeof(controls->pps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS, -+ .ptr = &controls->slice_params, -+ .size = sizeof(controls->slice_params[0]) * FFMIN(controls->decode_params.num_slices, 16), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS, -+ .ptr = &controls->decode_params, -+ .size = sizeof(controls->decode_params), -+ }, -+ }; -+ -+ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED) -+ return ff_v4l2_request_decode_slice(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); -+ -+ return ff_v4l2_request_decode_frame(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ const H264Context *h = avctx->priv_data; -+ const PPS *pps = h->ps.pps; -+ const H264SliceContext *sl = &h->slice_ctx[0]; -+ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; -+ V4L2RequestContextH264 *ctx = 
avctx->internal->hwaccel_priv_data; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->cur_pic_ptr->f->data[0]; -+ int i, ret, count, slice = FFMIN(controls->decode_params.num_slices, 15); -+ -+ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && slice) { -+ ret = v4l2_request_h264_queue_decode(avctx, 0); -+ if (ret) -+ return ret; -+ -+ ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); -+ slice = controls->decode_params.num_slices = 0; -+ controls->first_slice = 0; -+ } -+ -+ controls->slice_params[slice] = (struct v4l2_ctrl_h264_slice_params) { -+ /* Size in bytes, including header */ -+ .size = 0, -+ .start_byte_offset = req->output.used, -+ /* Offset in bits to slice_data() from the beginning of this slice. */ -+ .header_bit_size = get_bits_count(&sl->gb), -+ -+ .first_mb_in_slice = sl->first_mb_addr, -+ .slice_type = ff_h264_get_slice_type(sl), -+ .pic_parameter_set_id = sl->pps_id, -+ .colour_plane_id = 0, /* what is this? */ -+ .frame_num = h->poc.frame_num, -+ .idr_pic_id = sl->idr_pic_id, -+ .pic_order_cnt_lsb = sl->poc_lsb, -+ .delta_pic_order_cnt_bottom = sl->delta_poc_bottom, -+ .delta_pic_order_cnt0 = sl->delta_poc[0], -+ .delta_pic_order_cnt1 = sl->delta_poc[1], -+ .redundant_pic_cnt = sl->redundant_pic_count, -+ -+ /* Size in bits of dec_ref_pic_marking() syntax element. */ -+ .dec_ref_pic_marking_bit_size = sl->ref_pic_marking_size_in_bits, -+ /* Size in bits of pic order count syntax. */ -+ .pic_order_cnt_bit_size = sl->pic_order_cnt_bit_size, -+ -+ .cabac_init_idc = sl->cabac_init_idc, -+ .slice_qp_delta = sl->qscale - pps->init_qp, -+ .slice_qs_delta = 0, /* XXX not implemented by FFmpeg */ -+ .disable_deblocking_filter_idc = sl->deblocking_filter < 2 ? !sl->deblocking_filter : sl->deblocking_filter, -+ .slice_alpha_c0_offset_div2 = sl->slice_alpha_c0_offset / 2, -+ .slice_beta_offset_div2 = sl->slice_beta_offset / 2, -+ .slice_group_change_cycle = 0, /* what is this? */ -+ -+ .num_ref_idx_l0_active_minus1 = sl->list_count > 0 ? sl->ref_count[0] - 1 : 0, -+ .num_ref_idx_l1_active_minus1 = sl->list_count > 1 ? sl->ref_count[1] - 1 : 0, -+ }; -+ -+ if (FIELD_PICTURE(h)) -+ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_FIELD_PIC; -+ if (h->picture_structure == PICT_BOTTOM_FIELD) -+ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_BOTTOM_FIELD; -+ if (sl->slice_type == AV_PICTURE_TYPE_B && sl->direct_spatial_mv_pred) -+ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED; -+ -+ controls->slice_params[slice].pred_weight_table.chroma_log2_weight_denom = sl->pwt.chroma_log2_weight_denom; -+ controls->slice_params[slice].pred_weight_table.luma_log2_weight_denom = sl->pwt.luma_log2_weight_denom; -+ -+ count = sl->list_count > 0 ? sl->ref_count[0] : 0; -+ for (i = 0; i < count; i++) -+ controls->slice_params[slice].ref_pic_list0[i] = get_dpb_index(&controls->decode_params, &sl->ref_list[0][i]); -+ if (count) -+ fill_weight_factors(&controls->slice_params[slice].pred_weight_table.weight_factors[0], 0, sl); -+ -+ count = sl->list_count > 1 ? 
sl->ref_count[1] : 0; -+ for (i = 0; i < count; i++) -+ controls->slice_params[slice].ref_pic_list1[i] = get_dpb_index(&controls->decode_params, &sl->ref_list[1][i]); -+ if (count) -+ fill_weight_factors(&controls->slice_params[slice].pred_weight_table.weight_factors[1], 1, sl); -+ -+ if (ctx->start_code == V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) { -+ ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, nalu_slice_start_code, 3); -+ if (ret) -+ return ret; -+ } -+ -+ ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, buffer, size); -+ if (ret) -+ return ret; -+ -+ controls->slice_params[slice].size = req->output.used - controls->slice_params[slice].start_byte_offset; -+ controls->decode_params.num_slices++; -+ return 0; -+} -+ -+static int v4l2_request_h264_end_frame(AVCodecContext *avctx) -+{ -+ const H264Context *h = avctx->priv_data; -+ return v4l2_request_h264_queue_decode(avctx, !FIELD_PICTURE(h) || !h->first_field); -+} -+ -+static int v4l2_request_h264_set_controls(AVCodecContext *avctx) -+{ -+ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ext_control control[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_H264_START_CODE, }, -+ }; -+ -+ ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE); -+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_START_CODE); -+ if (ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_NONE && -+ ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); -+ return AVERROR(EINVAL); -+ } -+ -+ control[0].value = ctx->decode_mode; -+ control[1].value = ctx->start_code; -+ -+ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_h264_init(AVCodecContext *avctx) -+{ -+ const H264Context *h = avctx->priv_data; -+ struct v4l2_ctrl_h264_sps sps; -+ struct v4l2_ctrl_h264_pps pps; -+ int ret; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_SPS, -+ .ptr = &sps, -+ .size = sizeof(sps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_H264_PPS, -+ .ptr = &pps, -+ .size = sizeof(pps), -+ }, -+ }; -+ -+ fill_sps(&sps, h); -+ fill_pps(&pps, h); -+ -+ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_H264_SLICE, 2 * 1024 * 1024, control, FF_ARRAY_ELEMS(control)); -+ if (ret) -+ return ret; -+ -+ return v4l2_request_h264_set_controls(avctx); -+} -+ -+const AVHWAccel ff_h264_v4l2request_hwaccel = { -+ .name = "h264_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_H264, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .start_frame = v4l2_request_h264_start_frame, -+ .decode_slice = v4l2_request_h264_decode_slice, -+ .end_frame = v4l2_request_h264_end_frame, -+ .frame_priv_data_size = sizeof(V4L2RequestControlsH264), -+ .init = v4l2_request_h264_init, -+ .uninit = ff_v4l2_request_uninit, -+ .priv_data_size = sizeof(V4L2RequestContextH264), -+ .frame_params = ff_v4l2_request_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, -+}; -diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c 
-new file mode 100644 -index 0000000000..58b018a56f ---- /dev/null -+++ b/libavcodec/v4l2_request_hevc.c -@@ -0,0 +1,587 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "hevcdec.h" -+#include "hwaccel.h" -+#include "v4l2_request.h" -+#include "hevc-ctrls.h" -+ -+#include "v4l2_phase.h" -+ -+typedef struct V4L2RequestControlsHEVC { -+ struct v4l2_ctrl_hevc_sps sps; -+ struct v4l2_ctrl_hevc_pps pps; -+ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; -+ struct v4l2_ctrl_hevc_slice_params slice_params[16]; -+ int first_slice; -+ int num_slices; //TODO: this should be in control -+} V4L2RequestControlsHEVC; -+ -+typedef struct V4L2RequestContextHEVC { -+ V4L2RequestContext base; -+ int decode_mode; -+ int start_code; -+ -+ unsigned int order; -+ V4L2PhaseControl * pctrl; -+} V4L2RequestContextHEVC; -+ -+static void v4l2_request_hevc_fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) -+{ -+ int32_t luma_weight_denom, chroma_weight_denom; -+ const SliceHeader *sh = &h->sh; -+ -+ if (sh->slice_type == HEVC_SLICE_I || -+ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || -+ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) -+ return; -+ -+ table->luma_log2_weight_denom = sh->luma_log2_weight_denom; -+ -+ if (h->ps.sps->chroma_format_idc) -+ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; -+ -+ luma_weight_denom = (1 << sh->luma_log2_weight_denom); -+ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); -+ -+ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { -+ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; -+ table->luma_offset_l0[i] = sh->luma_offset_l0[i]; -+ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; -+ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; -+ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; -+ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; -+ } -+ -+ if (sh->slice_type != HEVC_SLICE_B) -+ return; -+ -+ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { -+ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; -+ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; -+ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; -+ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; -+ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; -+ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; -+ } -+} -+ -+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) -+{ -+ const HEVCFrame *frame; -+ int i; -+ -+ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) 
{ -+ frame = h->rps[ST_CURR_BEF].ref[i]; -+ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; -+ } -+ -+ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { -+ frame = h->rps[ST_CURR_AFT].ref[i]; -+ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; -+ } -+ -+ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { -+ frame = h->rps[LT_CURR].ref[i]; -+ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) -+ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; -+ } -+ -+ return 0; -+} -+ -+static uint8_t get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, -+ struct v4l2_ctrl_hevc_slice_params *slice_params) -+{ -+ uint64_t timestamp; -+ -+ if (!frame) -+ return 0; -+ -+ timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); -+ -+ for (uint8_t i = 0; i < slice_params->num_active_dpb_entries; i++) { -+ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[i]; -+ if (entry->timestamp == timestamp) -+ return i; -+ } -+ -+ return 0; -+} -+ -+static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h, -+ struct v4l2_ctrl_hevc_slice_params *slice_params) -+{ -+ const HEVCFrame *pic = h->ref; -+ const SliceHeader *sh = &h->sh; -+ int i, entries = 0; -+ RefPicList *rpl; -+ -+ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { -+ .bit_size = 0, -+ .data_bit_offset = get_bits_count(&h->HEVClc->gb), -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .slice_segment_addr = sh->slice_segment_addr, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ -+ .nal_unit_type = h->nal_unit_type, -+ .nuh_temporal_id_plus1 = h->temporal_id + 1, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ -+ .slice_type = sh->slice_type, -+ .colour_plane_id = sh->colour_plane_id, -+ .slice_pic_order_cnt = pic->poc, -+ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, -+ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, -+ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, -+ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand, -+ .slice_qp_delta = sh->slice_qp_delta, -+ .slice_cb_qp_offset = sh->slice_cb_qp_offset, -+ .slice_cr_qp_offset = sh->slice_cr_qp_offset, -+ .slice_act_y_qp_offset = 0, -+ .slice_act_cb_qp_offset = 0, -+ .slice_act_cr_qp_offset = 0, -+ .slice_beta_offset_div2 = sh->beta_offset / 2, -+ .slice_tc_offset_div2 = sh->tc_offset / 2, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ -+ .pic_struct = h->sei.picture_timing.picture_struct, -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ -+ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, -+ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, -+ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, -+ }; -+ -+ if (sh->slice_sample_adaptive_offset_flag[0]) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; -+ -+ if (sh->slice_sample_adaptive_offset_flag[1]) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; -+ -+ if (sh->slice_temporal_mvp_enabled_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; -+ -+ if (sh->mvd_l1_zero_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; -+ -+ if (sh->cabac_init_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; -+ -+ if (sh->collocated_list == L0) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; -+ -+ if (sh->disable_deblocking_filter_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; -+ -+ if (sh->slice_loop_filter_across_slices_enabled_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; -+ -+ if (sh->dependent_slice_segment_flag) -+ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; -+ -+ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { -+ const HEVCFrame *frame = &h->DPB[i]; -+ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { -+ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[entries++]; -+ -+ entry->timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); -+ entry->rps = find_frame_rps_type(h, entry->timestamp); -+ entry->field_pic = frame->frame->interlaced_frame; -+ -+ /* TODO: Interleaved: Get the POC for each field. */ -+ entry->pic_order_cnt[0] = frame->poc; -+ entry->pic_order_cnt[1] = frame->poc; -+ } -+ } -+ -+ slice_params->num_active_dpb_entries = entries; -+ -+ if (sh->slice_type != HEVC_SLICE_I) { -+ rpl = &h->ref->refPicList[0]; -+ for (i = 0; i < rpl->nb_refs; i++) -+ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); -+ } -+ -+ if (sh->slice_type == HEVC_SLICE_B) { -+ rpl = &h->ref->refPicList[1]; -+ for (i = 0; i < rpl->nb_refs; i++) -+ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); -+ } -+ -+ v4l2_request_hevc_fill_pred_table(h, &slice_params->pred_weight_table); -+ -+ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; -+ if (slice_params->num_entry_point_offsets > 256) { -+ slice_params->num_entry_point_offsets = 256; -+ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); -+ } -+ -+ for (i = 0; i < slice_params->num_entry_point_offsets; i++) -+ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; -+} -+ -+static struct v4l2_ctrl_hevc_sps make_v4l2_sps( -+ const HEVCSPS * const sps) -+{ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ -+ struct v4l2_ctrl_hevc_sps dst = { -+ .chroma_format_idc = sps->chroma_format_idc, -+ .pic_width_in_luma_samples = sps->width, -+ .pic_height_in_luma_samples = sps->height, -+ .bit_depth_luma_minus8 = sps->bit_depth - 8, -+ .bit_depth_chroma_minus8 = sps->bit_depth - 8, -+ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, -+ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, -+ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, -+ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, -+ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, -+ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, -+ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, -+ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, -+ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, -+ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, -+ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, -+ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, -+ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, -+ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, -+ .num_short_term_ref_pic_sets = sps->nb_st_rps, -+ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, -+ .flags = 0 // Set below -+ }; -+ -+ if (sps->separate_colour_plane_flag) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; -+ -+ if (sps->scaling_list_enable_flag) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; -+ -+ if (sps->amp_enabled_flag) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; -+ -+ if (sps->sao_enabled) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; -+ -+ if (sps->pcm_enabled_flag) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; -+ -+ if (sps->pcm.loop_filter_disable_flag) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; -+ -+ if (sps->long_term_ref_pics_present_flag) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; -+ -+ if (sps->sps_temporal_mvp_enabled_flag) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; -+ -+ if (sps->sps_strong_intra_smoothing_enable_flag) -+ dst.flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; -+ -+ return dst; -+} -+ -+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ const HEVCSPS *sps = h->ps.sps; -+ const HEVCPPS *pps = h->ps.pps; -+ const ScalingList *sl = pps->scaling_list_data_present_flag ? -+ &pps->scaling_list : -+ sps->scaling_list_enable_flag ? -+ &sps->scaling_list : NULL; -+ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ int rv; -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ -+ controls->sps = make_v4l2_sps(sps); -+ -+ if (sl) { -+ for (int i = 0; i < 6; i++) { -+ for (int j = 0; j < 16; j++) -+ controls->scaling_matrix.scaling_list_4x4[i][j] = sl->sl[0][i][j]; -+ for (int j = 0; j < 64; j++) { -+ controls->scaling_matrix.scaling_list_8x8[i][j] = sl->sl[1][i][j]; -+ controls->scaling_matrix.scaling_list_16x16[i][j] = sl->sl[2][i][j]; -+ if (i < 2) -+ controls->scaling_matrix.scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; -+ } -+ controls->scaling_matrix.scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; -+ if (i < 2) -+ controls->scaling_matrix.scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; -+ } -+ } -+ -+ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ -+ controls->pps = (struct v4l2_ctrl_hevc_pps) { -+ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, -+ .init_qp_minus26 = pps->pic_init_qp_minus26, -+ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, -+ .pps_cb_qp_offset = pps->cb_qp_offset, -+ .pps_cr_qp_offset = pps->cr_qp_offset, -+ .pps_beta_offset_div2 = pps->beta_offset / 2, -+ .pps_tc_offset_div2 = pps->tc_offset / 2, -+ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, -+ }; -+ -+ if (pps->dependent_slice_segments_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT; -+ -+ if (pps->output_flag_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; -+ -+ if (pps->sign_data_hiding_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; -+ -+ if (pps->cabac_init_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; -+ -+ if (pps->constrained_intra_pred_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; -+ -+ if (pps->transform_skip_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; -+ -+ if (pps->cu_qp_delta_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; -+ -+ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; -+ -+ if (pps->weighted_pred_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; -+ -+ if (pps->weighted_bipred_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; -+ -+ if (pps->transquant_bypass_enable_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; -+ -+ if (pps->tiles_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; -+ -+ if (pps->entropy_coding_sync_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; -+ -+ if (pps->loop_filter_across_tiles_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; -+ -+ if (pps->seq_loop_filter_across_slices_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; -+ -+ if (pps->deblocking_filter_override_enabled_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; -+ -+ if (pps->disable_dbf) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; -+ -+ if (pps->lists_modification_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; -+ -+ if (pps->slice_header_extension_present_flag) -+ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; -+ -+ if (pps->tiles_enabled_flag) { -+ controls->pps.num_tile_columns_minus1 = 
pps->num_tile_columns - 1; -+ controls->pps.num_tile_rows_minus1 = pps->num_tile_rows - 1; -+ -+ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p tiles_enabled_flag=%d num_tile_columns=%d num_tile_rows=%d\n", -+ __func__, avctx, pps->tiles_enabled_flag, pps->num_tile_columns, pps->num_tile_rows); -+ -+ for (int i = 0; i < pps->num_tile_columns; i++) -+ controls->pps.column_width_minus1[i] = pps->column_width[i] - 1; -+ -+ for (int i = 0; i < pps->num_tile_rows; i++) -+ controls->pps.row_height_minus1[i] = pps->row_height[i] - 1; -+ } -+ -+ controls->first_slice = 1; -+ controls->num_slices = 0; -+ -+ if ((rv = ff_v4l2_request_reset_frame(avctx, h->ref->frame)) != 0) -+ return rv; -+ -+ ff_v4l2_request_start_phase_control(h->ref->frame, ctx->pctrl); -+ -+ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame -+ -+ return 0; -+} -+ -+static int v4l2_request_hevc_queue_decode(AVCodecContext *avctx, int last_slice) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, -+ .ptr = &controls->sps, -+ .size = sizeof(controls->sps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, -+ .ptr = &controls->pps, -+ .size = sizeof(controls->pps), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, -+ .ptr = &controls->scaling_matrix, -+ .size = sizeof(controls->scaling_matrix), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, -+ .ptr = &controls->slice_params, -+ .size = sizeof(controls->slice_params[0]) * FFMIN(controls->num_slices, 16), -+ }, -+ }; -+ -+ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED) -+ return ff_v4l2_request_decode_slice(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); -+ -+ return ff_v4l2_request_decode_frame(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) { -+ const HEVCContext *h = avctx->priv_data; -+ -+ if (h->ref != NULL) -+ ff_v4l2_request_abort_phase_control(h->ref->frame); -+} -+ -+static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) -+{ -+ int rv = v4l2_request_hevc_queue_decode(avctx, 1); -+ if (rv < 0) -+ v4l2_request_hevc_abort_frame(avctx); -+ return rv; -+} -+ -+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ const HEVCContext *h = avctx->priv_data; -+ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->ref->frame->data[0]; -+ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ int ret, slice = FFMIN(controls->num_slices, 15); -+ -+ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && slice) { -+ ret = v4l2_request_hevc_queue_decode(avctx, 0); -+ if (ret) -+ return ret; -+ -+ ff_v4l2_request_reset_frame(avctx, h->ref->frame); -+ slice = controls->num_slices = 0; -+ controls->first_slice = 0; -+ } -+ -+ v4l2_request_hevc_fill_slice_params(h, &controls->slice_params[slice]); -+ -+ ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, buffer, size); -+ if (ret) -+ return ret; -+ -+ controls->slice_params[slice].bit_size = req->output.used * 8; //FIXME -+ controls->num_slices++; -+ -+ return 0; -+} -+ -+static int v4l2_request_hevc_set_controls(AVCodecContext *avctx) -+{ -+ 
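-+    /* Queries the driver's default decode mode (slice- vs frame-based) and
-+     * start code handling, rejects anything this hwaccel cannot supply, and
-+     * then sets the values back explicitly so driver and hwaccel agree. */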
V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ext_control control[] = { -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, -+ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, -+ }; -+ -+ ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE); -+ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && -+ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); -+ return AVERROR(EINVAL); -+ } -+ -+ ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_START_CODE); -+ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && -+ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { -+ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); -+ return AVERROR(EINVAL); -+ } -+ -+ control[0].value = ctx->decode_mode; -+ control[1].value = ctx->start_code; -+ -+ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_hevc_uninit(AVCodecContext *avctx) -+{ -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ ff_v4l2_phase_control_deletez(&ctx->pctrl); -+ return ff_v4l2_request_uninit(avctx); -+} -+ -+static int v4l2_request_hevc_init(AVCodecContext *avctx) -+{ -+ int ret; -+ const HEVCContext *h = avctx->priv_data; -+ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; -+ -+ struct v4l2_ctrl_hevc_sps sps = make_v4l2_sps(h->ps.sps); -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, -+ .ptr = &sps, -+ .size = sizeof(sps), -+ }, -+ }; -+ -+ ctx->pctrl = ff_v4l2_phase_control_new(2); -+ -+ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_HEVC_SLICE, 3 * 1024 * 1024, control, FF_ARRAY_ELEMS(control)); -+ if (ret) -+ return ret; -+ -+ return v4l2_request_hevc_set_controls(avctx); -+} -+ -+const AVHWAccel ff_hevc_v4l2request_hwaccel = { -+ .name = "hevc_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_HEVC, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .start_frame = v4l2_request_hevc_start_frame, -+ .decode_slice = v4l2_request_hevc_decode_slice, -+ .end_frame = v4l2_request_hevc_end_frame, -+ .abort_frame = v4l2_request_hevc_abort_frame, -+ .frame_priv_data_size = sizeof(V4L2RequestControlsHEVC), -+ .init = v4l2_request_hevc_init, -+ .uninit = v4l2_request_hevc_uninit, -+ .priv_data_size = sizeof(V4L2RequestContextHEVC), -+ .frame_params = ff_v4l2_request_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, -+}; -diff --git a/libavcodec/v4l2_request_mpeg2.c b/libavcodec/v4l2_request_mpeg2.c -new file mode 100644 -index 0000000000..37a4eae62c ---- /dev/null -+++ b/libavcodec/v4l2_request_mpeg2.c -@@ -0,0 +1,155 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "hwaccel.h" -+#include "mpegvideo.h" -+#include "v4l2_request.h" -+#include "mpeg2-ctrls.h" -+ -+typedef struct V4L2RequestControlsMPEG2 { -+ struct v4l2_ctrl_mpeg2_slice_params slice_params; -+ struct v4l2_ctrl_mpeg2_quantization quantization; -+} V4L2RequestControlsMPEG2; -+ -+static int v4l2_request_mpeg2_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const MpegEncContext *s = avctx->priv_data; -+ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; -+ -+ controls->slice_params = (struct v4l2_ctrl_mpeg2_slice_params) { -+ .bit_size = 0, -+ .data_bit_offset = 0, -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ -+ .quantiser_scale_code = s->qscale >> 1, -+ -+ .sequence = { -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */ -+ .horizontal_size = s->width, -+ .vertical_size = s->height, -+ .vbv_buffer_size = req->output.size, -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ -+ .profile_and_level_indication = 0, -+ .progressive_sequence = s->progressive_sequence, -+ .chroma_format = s->chroma_format, -+ }, -+ -+ .picture = { -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ -+ .picture_coding_type = s->pict_type, -+ -+ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ -+ .f_code[0][0] = s->mpeg_f_code[0][0], -+ .f_code[0][1] = s->mpeg_f_code[0][1], -+ .f_code[1][0] = s->mpeg_f_code[1][0], -+ .f_code[1][1] = s->mpeg_f_code[1][1], -+ .intra_dc_precision = s->intra_dc_precision, -+ .picture_structure = s->picture_structure, -+ .top_field_first = s->top_field_first, -+ .frame_pred_frame_dct = s->frame_pred_frame_dct, -+ .concealment_motion_vectors = s->concealment_motion_vectors, -+ .q_scale_type = s->q_scale_type, -+ .intra_vlc_format = s->intra_vlc_format, -+ .alternate_scan = s->alternate_scan, -+ .repeat_first_field = s->repeat_first_field, -+ .progressive_frame = s->progressive_frame, -+ }, -+ }; -+ -+ switch (s->pict_type) { -+ case AV_PICTURE_TYPE_B: -+ controls->slice_params.backward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->next_picture.f); -+ // fall-through -+ case AV_PICTURE_TYPE_P: -+ controls->slice_params.forward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->last_picture.f); -+ } -+ -+ controls->quantization = (struct v4l2_ctrl_mpeg2_quantization) { -+ /* ISO/IEC 13818-2, ITU-T Rec. 
H.262: Quant matrix extension */ -+ .load_intra_quantiser_matrix = 1, -+ .load_non_intra_quantiser_matrix = 1, -+ .load_chroma_intra_quantiser_matrix = 1, -+ .load_chroma_non_intra_quantiser_matrix = 1, -+ }; -+ -+ for (int i = 0; i < 64; i++) { -+ int n = s->idsp.idct_permutation[ff_zigzag_direct[i]]; -+ controls->quantization.intra_quantiser_matrix[i] = s->intra_matrix[n]; -+ controls->quantization.non_intra_quantiser_matrix[i] = s->inter_matrix[n]; -+ controls->quantization.chroma_intra_quantiser_matrix[i] = s->chroma_intra_matrix[n]; -+ controls->quantization.chroma_non_intra_quantiser_matrix[i] = s->chroma_inter_matrix[n]; -+ } -+ -+ return ff_v4l2_request_reset_frame(avctx, s->current_picture_ptr->f); -+} -+ -+static int v4l2_request_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) -+{ -+ const MpegEncContext *s = avctx->priv_data; -+ -+ return ff_v4l2_request_append_output_buffer(avctx, s->current_picture_ptr->f, buffer, size); -+} -+ -+static int v4l2_request_mpeg2_end_frame(AVCodecContext *avctx) -+{ -+ const MpegEncContext *s = avctx->priv_data; -+ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; -+ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; -+ -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS, -+ .ptr = &controls->slice_params, -+ .size = sizeof(controls->slice_params), -+ }, -+ { -+ .id = V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION, -+ .ptr = &controls->quantization, -+ .size = sizeof(controls->quantization), -+ }, -+ }; -+ -+ controls->slice_params.bit_size = req->output.used * 8; -+ -+ return ff_v4l2_request_decode_frame(avctx, s->current_picture_ptr->f, control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_mpeg2_init(AVCodecContext *avctx) -+{ -+ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_MPEG2_SLICE, 1024 * 1024, NULL, 0); -+} -+ -+const AVHWAccel ff_mpeg2_v4l2request_hwaccel = { -+ .name = "mpeg2_v4l2request", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .id = AV_CODEC_ID_MPEG2VIDEO, -+ .pix_fmt = AV_PIX_FMT_DRM_PRIME, -+ .start_frame = v4l2_request_mpeg2_start_frame, -+ .decode_slice = v4l2_request_mpeg2_decode_slice, -+ .end_frame = v4l2_request_mpeg2_end_frame, -+ .frame_priv_data_size = sizeof(V4L2RequestControlsMPEG2), -+ .init = v4l2_request_mpeg2_init, -+ .uninit = ff_v4l2_request_uninit, -+ .priv_data_size = sizeof(V4L2RequestContext), -+ .frame_params = ff_v4l2_request_frame_params, -+ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, -+}; -diff --git a/libavcodec/v4l2_request_vp8.c b/libavcodec/v4l2_request_vp8.c -new file mode 100644 -index 0000000000..c290fe8b9a ---- /dev/null -+++ b/libavcodec/v4l2_request_vp8.c -@@ -0,0 +1,181 @@ -+/* -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+#include "hwaccel.h" -+#include "v4l2_request.h" -+#include "vp8.h" -+#include "vp8-ctrls.h" -+ -+typedef struct V4L2RequestControlsVP8 { -+ struct v4l2_ctrl_vp8_frame_header ctrl; -+} V4L2RequestControlsVP8; -+ -+static int v4l2_request_vp8_start_frame(AVCodecContext *avctx, -+ av_unused const uint8_t *buffer, -+ av_unused uint32_t size) -+{ -+ const VP8Context *s = avctx->priv_data; -+ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; -+ -+ memset(&controls->ctrl, 0, sizeof(controls->ctrl)); -+ return ff_v4l2_request_reset_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f); -+} -+ -+static int v4l2_request_vp8_end_frame(AVCodecContext *avctx) -+{ -+ const VP8Context *s = avctx->priv_data; -+ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; -+ struct v4l2_ext_control control[] = { -+ { -+ .id = V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER, -+ .ptr = &controls->ctrl, -+ .size = sizeof(controls->ctrl), -+ }, -+ }; -+ -+ return ff_v4l2_request_decode_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, -+ control, FF_ARRAY_ELEMS(control)); -+} -+ -+static int v4l2_request_vp8_decode_slice(AVCodecContext *avctx, -+ const uint8_t *buffer, -+ uint32_t size) -+{ -+ const VP8Context *s = avctx->priv_data; -+ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; -+ struct v4l2_ctrl_vp8_frame_header *hdr = &controls->ctrl; -+ const uint8_t *data = buffer + 3 + 7 * s->keyframe; -+ unsigned int i, j, k; -+ -+ hdr->version = s->profile & 0x3; -+ hdr->width = avctx->width; -+ hdr->height = avctx->height; -+ /* FIXME: set ->xx_scale */ -+ hdr->prob_skip_false = s->prob->mbskip; -+ hdr->prob_intra = s->prob->intra; -+ hdr->prob_gf = s->prob->golden; -+ hdr->prob_last = s->prob->last; -+ hdr->first_part_size = s->header_partition_size; -+ hdr->first_part_header_bits = (8 * (s->coder_state_at_header_end.input - data) - -+ s->coder_state_at_header_end.bit_count - 8); -+ hdr->num_dct_parts = s->num_coeff_partitions; -+ for (i = 0; i < 8; i++) -+ hdr->dct_part_sizes[i] = s->coeff_partition_size[i]; -+ -+ hdr->coder_state.range = s->coder_state_at_header_end.range; -+ hdr->coder_state.value = s->coder_state_at_header_end.value; -+ hdr->coder_state.bit_count = s->coder_state_at_header_end.bit_count; -+ if (s->framep[VP56_FRAME_PREVIOUS]) -+ hdr->last_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_PREVIOUS]->tf.f); -+ if (s->framep[VP56_FRAME_GOLDEN]) -+ hdr->golden_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN]->tf.f); -+ if (s->framep[VP56_FRAME_GOLDEN2]) -+ hdr->alt_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN2]->tf.f); -+ hdr->flags |= s->invisible ? 0 : V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME; -+ hdr->flags |= s->mbskip_enabled ? V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF : 0; -+ hdr->flags |= (s->profile & 0x4) ? V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL : 0; -+ hdr->flags |= s->keyframe ? V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME : 0; -+ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN : 0; -+ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN2] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT : 0; -+ hdr->segment_header.flags |= s->segmentation.enabled ? 
V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED : 0;
-+    hdr->segment_header.flags |= s->segmentation.update_map ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP : 0;
-+    hdr->segment_header.flags |= s->segmentation.update_feature_data ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA : 0;
-+    hdr->segment_header.flags |= s->segmentation.absolute_vals ? 0 : V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE;
-+    for (i = 0; i < 4; i++) {
-+        hdr->segment_header.quant_update[i] = s->segmentation.base_quant[i];
-+        hdr->segment_header.lf_update[i] = s->segmentation.filter_level[i];
-+    }
-+
-+    for (i = 0; i < 3; i++)
-+        hdr->segment_header.segment_probs[i] = s->prob->segmentid[i];
-+
-+    hdr->lf_header.level = s->filter.level;
-+    hdr->lf_header.sharpness_level = s->filter.sharpness;
-+    hdr->lf_header.flags |= s->lf_delta.enabled ? V4L2_VP8_LF_HEADER_ADJ_ENABLE : 0;
-+    hdr->lf_header.flags |= s->lf_delta.update ? V4L2_VP8_LF_HEADER_DELTA_UPDATE : 0;
-+    hdr->lf_header.flags |= s->filter.simple ? V4L2_VP8_LF_FILTER_TYPE_SIMPLE : 0;
-+    for (i = 0; i < 4; i++) {
-+        hdr->lf_header.ref_frm_delta[i] = s->lf_delta.ref[i];
-+        hdr->lf_header.mb_mode_delta[i] = s->lf_delta.mode[i + MODE_I4x4];
-+    }
-+
-+    // Probabilities
-+    if (s->keyframe) {
-+        static const uint8_t keyframe_y_mode_probs[4] = {
-+            145, 156, 163, 128
-+        };
-+        static const uint8_t keyframe_uv_mode_probs[3] = {
-+            142, 114, 183
-+        };
-+
-+        memcpy(hdr->entropy_header.y_mode_probs, keyframe_y_mode_probs, 4);
-+        memcpy(hdr->entropy_header.uv_mode_probs, keyframe_uv_mode_probs, 3);
-+    } else {
-+        for (i = 0; i < 4; i++)
-+            hdr->entropy_header.y_mode_probs[i] = s->prob->pred16x16[i];
-+        for (i = 0; i < 3; i++)
-+            hdr->entropy_header.uv_mode_probs[i] = s->prob->pred8x8c[i];
-+    }
-+    for (i = 0; i < 2; i++)
-+        for (j = 0; j < 19; j++)
-+            hdr->entropy_header.mv_probs[i][j] = s->prob->mvc[i][j];
-+
-+    for (i = 0; i < 4; i++) {
-+        for (j = 0; j < 8; j++) {
-+            static const int coeff_bands_inverse[8] = {
-+                0, 1, 2, 3, 5, 6, 4, 15
-+            };
-+            int coeff_pos = coeff_bands_inverse[j];
-+
-+            for (k = 0; k < 3; k++) {
-+                memcpy(hdr->entropy_header.coeff_probs[i][j][k],
-+                       s->prob->token[i][coeff_pos][k], 11);
-+            }
-+        }
-+    }
-+
-+    hdr->quant_header.y_ac_qi = s->quant.yac_qi;
-+    hdr->quant_header.y_dc_delta = s->quant.ydc_delta;
-+    hdr->quant_header.y2_dc_delta = s->quant.y2dc_delta;
-+    hdr->quant_header.y2_ac_delta = s->quant.y2ac_delta;
-+    hdr->quant_header.uv_dc_delta = s->quant.uvdc_delta;
-+    hdr->quant_header.uv_ac_delta = s->quant.uvac_delta;
-+
-+    return ff_v4l2_request_append_output_buffer(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, buffer, size);
-+}
-+
-+static int v4l2_request_vp8_init(AVCodecContext *avctx)
-+{
-+    return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_VP8_FRAME, 1024 * 1024, NULL, 0);
-+}
-+
-+const AVHWAccel ff_vp8_v4l2request_hwaccel = {
-+    .name           = "vp8_v4l2request",
-+    .type           = AVMEDIA_TYPE_VIDEO,
-+    .id             = AV_CODEC_ID_VP8,
-+    .pix_fmt        = AV_PIX_FMT_DRM_PRIME,
-+    .start_frame    = v4l2_request_vp8_start_frame,
-+    .decode_slice   = v4l2_request_vp8_decode_slice,
-+    .end_frame      = v4l2_request_vp8_end_frame,
-+    .frame_priv_data_size = sizeof(V4L2RequestControlsVP8),
-+    .init           = v4l2_request_vp8_init,
-+    .uninit         = ff_v4l2_request_uninit,
-+    .priv_data_size = sizeof(V4L2RequestContext),
-+    .frame_params   = ff_v4l2_request_frame_params,
-+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE,
-+};
-diff --git a/libavcodec/vp8-ctrls.h b/libavcodec/vp8-ctrls.h
-new file mode 100644
-index 0000000000..53cba826e4
---- /dev/null
-+++ b/libavcodec/vp8-ctrls.h
-@@ -0,0 +1,112 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the VP8 state controls for use with stateless VP8
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _VP8_CTRLS_H_
-+#define _VP8_CTRLS_H_
-+
-+#include <linux/types.h>
-+
-+#define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F')
-+
-+#define V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER (V4L2_CID_MPEG_BASE + 2000)
-+#define V4L2_CTRL_TYPE_VP8_FRAME_HEADER 0x301
-+
-+#define V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED 0x01
-+#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP 0x02
-+#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA 0x04
-+#define V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE 0x08
-+
-+struct v4l2_vp8_segment_header {
-+    __s8 quant_update[4];
-+    __s8 lf_update[4];
-+    __u8 segment_probs[3];
-+    __u8 padding;
-+    __u32 flags;
-+};
-+
-+#define V4L2_VP8_LF_HEADER_ADJ_ENABLE 0x01
-+#define V4L2_VP8_LF_HEADER_DELTA_UPDATE 0x02
-+#define V4L2_VP8_LF_FILTER_TYPE_SIMPLE 0x04
-+struct v4l2_vp8_loopfilter_header {
-+    __s8 ref_frm_delta[4];
-+    __s8 mb_mode_delta[4];
-+    __u8 sharpness_level;
-+    __u8 level;
-+    __u16 padding;
-+    __u32 flags;
-+};
-+
-+struct v4l2_vp8_quantization_header {
-+    __u8 y_ac_qi;
-+    __s8 y_dc_delta;
-+    __s8 y2_dc_delta;
-+    __s8 y2_ac_delta;
-+    __s8 uv_dc_delta;
-+    __s8 uv_ac_delta;
-+    __u16 padding;
-+};
-+
-+struct v4l2_vp8_entropy_header {
-+    __u8 coeff_probs[4][8][3][11];
-+    __u8 y_mode_probs[4];
-+    __u8 uv_mode_probs[3];
-+    __u8 mv_probs[2][19];
-+    __u8 padding[3];
-+};
-+
-+struct v4l2_vp8_entropy_coder_state {
-+    __u8 range;
-+    __u8 value;
-+    __u8 bit_count;
-+    __u8 padding;
-+};
-+
-+#define V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME 0x01
-+#define V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL 0x02
-+#define V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME 0x04
-+#define V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF 0x08
-+#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN 0x10
-+#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT 0x20
-+
-+#define VP8_FRAME_IS_KEY_FRAME(hdr) \
-+    (!!((hdr)->flags & V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME))
-+
-+struct v4l2_ctrl_vp8_frame_header {
-+    struct v4l2_vp8_segment_header segment_header;
-+    struct v4l2_vp8_loopfilter_header lf_header;
-+    struct v4l2_vp8_quantization_header quant_header;
-+    struct v4l2_vp8_entropy_header entropy_header;
-+    struct v4l2_vp8_entropy_coder_state coder_state;
-+
-+    __u16 width;
-+    __u16 height;
-+
-+    __u8 horizontal_scale;
-+    __u8 vertical_scale;
-+
-+    __u8 version;
-+    __u8 prob_skip_false;
-+    __u8 prob_intra;
-+    __u8 prob_last;
-+    __u8 prob_gf;
-+    __u8 num_dct_parts;
-+
-+    __u32 first_part_size;
-+    __u32 first_part_header_bits;
-+    __u32 dct_part_sizes[8];
-+
-+    __u64 last_frame_ts;
-+    __u64 golden_frame_ts;
-+    __u64 alt_frame_ts;
-+
-+    __u64 flags;
-+};
-+
-+#endif
-diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
-index 3ddc349a4d..d4289320a9 100644
---- a/libavcodec/vp8.c
-+++ b/libavcodec/vp8.c
-@@ -175,6 +175,9 @@ static enum AVPixelFormat get_pixel_format(VP8Context *s)
- #endif
- #if CONFIG_VP8_NVDEC_HWACCEL
-         AV_PIX_FMT_CUDA,
-+#endif
-+#if CONFIG_VP8_V4L2REQUEST_HWACCEL
-+        AV_PIX_FMT_DRM_PRIME,
- #endif
-         AV_PIX_FMT_YUV420P,
-         AV_PIX_FMT_NONE,
-@@ -198,7 +201,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
-             return ret;
-     }
- 
--    if (!s->actually_webp && !is_vp7) {
-+    if (!s->actually_webp && !is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
-         s->pix_fmt = get_pixel_format(s);
-         if (s->pix_fmt < 0)
-             return AVERROR(EINVAL);
-@@ -2981,6 +2984,9 @@ AVCodec ff_vp8_decoder = {
- #endif
- #if CONFIG_VP8_NVDEC_HWACCEL
-         HWACCEL_NVDEC(vp8),
-+#endif
-+#if CONFIG_VP8_V4L2REQUEST_HWACCEL
-+        HWACCEL_V4L2REQUEST(vp8),
- #endif
-         NULL
-     },
-diff --git a/libavdevice/Makefile b/libavdevice/Makefile
-index 6ea62b914e..19f7f5353c 100644
---- a/libavdevice/Makefile
-+++ b/libavdevice/Makefile
-@@ -45,6 +45,8 @@ OBJS-$(CONFIG_SNDIO_INDEV)               += sndio_dec.o sndio.o
- OBJS-$(CONFIG_SNDIO_OUTDEV)              += sndio_enc.o sndio.o
- OBJS-$(CONFIG_V4L2_INDEV)                += v4l2.o v4l2-common.o timefilter.o
- OBJS-$(CONFIG_V4L2_OUTDEV)               += v4l2enc.o v4l2-common.o
-+OBJS-$(CONFIG_VOUT_DRM_OUTDEV)           += drm_vout.o
-+OBJS-$(CONFIG_VOUT_RPI_OUTDEV)           += rpi_vout.o
- OBJS-$(CONFIG_VFWCAP_INDEV)              += vfwcap.o
- OBJS-$(CONFIG_XCBGRAB_INDEV)             += xcbgrab.o
- OBJS-$(CONFIG_XV_OUTDEV)                 += xv.o
-diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
-index 8633433254..1df47be492 100644
---- a/libavdevice/alldevices.c
-+++ b/libavdevice/alldevices.c
-@@ -52,6 +52,8 @@ extern AVOutputFormat ff_sndio_muxer;
- extern AVInputFormat  ff_v4l2_demuxer;
- extern AVOutputFormat ff_v4l2_muxer;
- extern AVInputFormat  ff_vfwcap_demuxer;
-+extern AVOutputFormat ff_vout_drm_muxer;
-+extern AVOutputFormat ff_vout_rpi_muxer;
- extern AVInputFormat  ff_xcbgrab_demuxer;
- extern AVOutputFormat ff_xv_muxer;
- 
-diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
-new file mode 100644
-index 0000000000..db795d3825
---- /dev/null
-+++ b/libavdevice/drm_vout.c
-@@ -0,0 +1,508 @@
-+/*
-+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+// *** This module is a work in progress and its utility is strictly
-+// limited to testing.
-+// Amongst other issues it doesn't wait for the pic to be displayed before
-+// returning the buffer, so flickering does occur.
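-+//
-+// Operation: each incoming DRM_PRIME AVFrame is imported once per dma-buf
-+// fd (drmPrimeFDToHandle + drmModeAddFB2WithModifiers) and the resulting
-+// framebuffer is then put on the selected plane with drmModeSetPlane.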
-+ -+#include "libavutil/opt.h" -+#include "libavutil/avassert.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/imgutils.h" -+#include "libavutil/hwcontext_drm.h" -+#include "libavformat/internal.h" -+#include "avdevice.h" -+ -+#include -+ -+#include "drm_fourcc.h" -+#include -+#include -+#include -+#include -+ -+#include "libavutil/rpi_sand_fns.h" -+ -+#define TRACE_ALL 0 -+ -+#define NUM_BUFFERS 4 -+#define RPI_DISPLAY_ALL 0 -+ -+#define DRM_MODULE "vc4" -+ -+#define ERRSTR strerror(errno) -+ -+struct drm_setup { -+ int conId; -+ uint32_t crtcId; -+ int crtcIdx; -+ uint32_t planeId; -+ unsigned int out_fourcc; -+ struct { -+ int x, y, width, height; -+ } compose; -+}; -+ -+typedef struct drm_aux_s { -+ int fd; -+ uint32_t bo_handles[4]; -+ unsigned int fb_handle; -+} drm_aux_t; -+ -+typedef struct drm_display_env_s -+{ -+ AVClass *class; -+ -+ int drm_fd; -+ uint32_t con_id; -+ struct drm_setup setup; -+ enum AVPixelFormat avfmt; -+ -+ drm_aux_t aux[32]; -+ -+} drm_display_env_t; -+ -+ -+static int drm_vout_write_trailer(AVFormatContext *s) -+{ -+#if TRACE_ALL -+ drm_display_env_t * const de = s->priv_data; -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ -+ return 0; -+} -+ -+static int drm_vout_write_header(AVFormatContext *s) -+{ -+ const AVCodecParameters * const par = s->streams[0]->codecpar; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ if ( s->nb_streams > 1 -+ || par->codec_type != AVMEDIA_TYPE_VIDEO -+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { -+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ return 0; -+} -+ -+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt) -+{ -+ AVFrame * const frame = (AVFrame *)pkt->data; -+ drm_display_env_t * const de = s->priv_data; -+ int ret = 0; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ -+ if (frame->format != AV_PIX_FMT_DRM_PRIME) { -+ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", frame->format); -+ return AVERROR(EINVAL); -+ } -+ -+ { -+ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; -+ drm_aux_t * da = NULL; -+ unsigned int i; -+ -+ for (i = 0; i != 32; ++i) { -+ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { -+ da = de->aux + i; -+ break; -+ } -+ } -+ -+ if (da == NULL) { -+ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); -+ return AVERROR(EINVAL); -+ } -+ -+ if (da->fd == -1) { -+ uint32_t pitches[4] = {0}; -+ uint32_t offsets[4] = {0}; -+ uint64_t modifiers[4] = {0}; -+ uint32_t bo_plane_handles[4] = {0}; -+ int i, j, n; -+ -+ for (i = 0; i < desc->nb_objects; ++i) { -+ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) { -+ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle failed: %s\n", ERRSTR); -+ return -1; -+ } -+ } -+ -+ n = 0; -+ for (i = 0; i < desc->nb_layers; ++i) { -+ for (j = 0; j < desc->layers[i].nb_planes; ++j) { -+ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; -+ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; -+ pitches[n] = p->pitch; -+ offsets[n] = p->offset; -+ modifiers[n] = obj->format_modifier; -+ bo_plane_handles[n] = da->bo_handles[p->object_index]; -+ ++n; -+ } -+ } -+ -+#if 0 -+ av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," -+ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", -+ av_frame_cropped_width(frame), -+ av_frame_cropped_height(frame), -+ desc->layers[0].format, -+ 
-+                   bo_plane_handles[0],
-+                   bo_plane_handles[1],
-+                   bo_plane_handles[2],
-+                   bo_plane_handles[3],
-+                   pitches[0],
-+                   pitches[1],
-+                   pitches[2],
-+                   pitches[3],
-+                   offsets[0],
-+                   offsets[1],
-+                   offsets[2],
-+                   offsets[3],
-+                   (long long)modifiers[0],
-+                   (long long)modifiers[1],
-+                   (long long)modifiers[2],
-+                   (long long)modifiers[3]
-+                   );
-+#endif
-+
-+            if (drmModeAddFB2WithModifiers(de->drm_fd,
-+                                           av_frame_cropped_width(frame),
-+                                           av_frame_cropped_height(frame),
-+                                           desc->layers[0].format, bo_plane_handles,
-+                                           pitches, offsets, modifiers,
-+                                           &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
-+                av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
-+                return -1;
-+            }
-+
-+            da->fd = desc->objects[0].fd;
-+        }
-+
-+        ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
-+                              da->fb_handle, 0,
-+                              de->setup.compose.x, de->setup.compose.y,
-+                              de->setup.compose.width,
-+                              de->setup.compose.height,
-+                              0, 0,
-+                              av_frame_cropped_width(frame) << 16,
-+                              av_frame_cropped_height(frame) << 16);
-+
-+        if (ret != 0) {
-+            av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
-+                                unsigned flags)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
-+#endif
-+
-+    /* drm_vout_write_header() should have accepted only supported formats */
-+    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
-+        return 0;
-+
-+    return 0;
-+}
-+
-+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
-+#endif
-+    switch (type) {
-+    case AV_APP_TO_DEV_WINDOW_REPAINT:
-+        return 0;
-+    default:
-+        break;
-+    }
-+    return AVERROR(ENOSYS);
-+}
-+
-+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
-+{
-+    int ret = -1;
-+    int i;
-+    drmModeRes *res = drmModeGetResources(drmfd);
-+    drmModeConnector *c;
-+
-+    if (!res)
-+    {
-+        av_log(avctx, AV_LOG_ERROR, "drmModeGetResources failed: %s\n", ERRSTR);
-+        return -1;
-+    }
-+
-+    if (res->count_crtcs <= 0)
-+    {
-+        av_log(avctx, AV_LOG_ERROR, "drm: no crtcs\n");
-+        goto fail_res;
-+    }
-+
-+    if (!s->conId) {
-+        av_log(avctx, AV_LOG_INFO,
-+               "No connector ID specified. Choosing default from list:\n");
-+
-+        for (i = 0; i < res->count_connectors; i++) {
-+            drmModeConnector *con =
-+                drmModeGetConnector(drmfd, res->connectors[i]);
-+            drmModeEncoder *enc = NULL;
-+            drmModeCrtc *crtc = NULL;
-+
-+            if (con->encoder_id) {
-+                enc = drmModeGetEncoder(drmfd, con->encoder_id);
-+                if (enc->crtc_id) {
-+                    crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
-+                }
-+            }
-+
-+            if (!s->conId && crtc) {
-+                s->conId = con->connector_id;
-+                s->crtcId = crtc->crtc_id;
-+            }
-+
-+            av_log(avctx, AV_LOG_INFO, "Connector %d (crtc %d): type %d, %dx%d%s\n",
-+                   con->connector_id,
-+                   crtc ? crtc->crtc_id : 0,
-+                   con->connector_type,
-+                   crtc ? crtc->width : 0,
-+                   crtc ? crtc->height : 0,
-+                   (s->conId == (int)con->connector_id ?
-+ " (chosen)" : "")); -+ } -+ -+ if (!s->conId) { -+ av_log(avctx, AV_LOG_ERROR, -+ "No suitable enabled connector found.\n"); -+ return -1;; -+ } -+ } -+ -+ s->crtcIdx = -1; -+ -+ for (i = 0; i < res->count_crtcs; ++i) { -+ if (s->crtcId == res->crtcs[i]) { -+ s->crtcIdx = i; -+ break; -+ } -+ } -+ -+ if (s->crtcIdx == -1) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId); -+ goto fail_res; -+ } -+ -+ if (res->count_connectors <= 0) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n"); -+ goto fail_res; -+ } -+ -+ c = drmModeGetConnector(drmfd, s->conId); -+ if (!c) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR); -+ goto fail_res; -+ } -+ -+ if (!c->count_modes) -+ { -+ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n"); -+ goto fail_conn; -+ } -+ -+ { -+ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId); -+ s->compose.x = crtc->x; -+ s->compose.y = crtc->y; -+ s->compose.width = crtc->width; -+ s->compose.height = crtc->height; -+ drmModeFreeCrtc(crtc); -+ } -+ -+ if (pConId) -+ *pConId = c->connector_id; -+ ret = 0; -+ -+fail_conn: -+ drmModeFreeConnector(c); -+ -+fail_res: -+ drmModeFreeResources(res); -+ -+ return ret; -+} -+ -+static int find_plane(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s) -+{ -+ drmModePlaneResPtr planes; -+ drmModePlanePtr plane; -+ unsigned int i; -+ unsigned int j; -+ int ret = 0; -+ -+ planes = drmModeGetPlaneResources(drmfd); -+ if (!planes) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR); -+ return -1; -+ } -+ -+ for (i = 0; i < planes->count_planes; ++i) { -+ plane = drmModeGetPlane(drmfd, planes->planes[i]); -+ if (!planes) -+ { -+ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR); -+ break; -+ } -+ -+ if (!(plane->possible_crtcs & (1 << s->crtcIdx))) { -+ drmModeFreePlane(plane); -+ continue; -+ } -+ -+ for (j = 0; j < plane->count_formats; ++j) { -+ if (plane->formats[j] == s->out_fourcc) -+ break; -+ } -+ -+ if (j == plane->count_formats) { -+ drmModeFreePlane(plane); -+ continue; -+ } -+ -+ s->planeId = plane->plane_id; -+ drmModeFreePlane(plane); -+ break; -+ } -+ -+ if (i == planes->count_planes) -+ ret = -1; -+ -+ drmModeFreePlaneResources(planes); -+ return ret; -+} -+ -+// deinit is called if init fails so no need to clean up explicity here -+static int drm_vout_init(struct AVFormatContext * s) -+{ -+ drm_display_env_t * const de = s->priv_data; -+ unsigned int i; -+ -+ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); -+ -+ de->drm_fd = -1; -+ de->con_id = 0; -+ de->setup = (struct drm_setup){0}; -+ -+ de->setup.out_fourcc = DRM_FORMAT_NV12; // **** Need some sort of select -+ -+ for (i = 0; i != 32; ++i) { -+ de->aux[i].fd = -1; -+ } -+ -+ if ((de->drm_fd = drmOpen(DRM_MODULE, NULL)) < 0) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s\n", DRM_MODULE); -+ return -1; -+ } -+ -+ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) -+ { -+ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n"); -+ return -1; -+ } -+ -+ if (find_plane(s, de->drm_fd, &de->setup) != 0) -+ { -+ av_log(s, AV_LOG_ERROR, "failed to find compatible plane\n"); -+ return -1; -+ } -+ -+ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); -+ -+ return 0; -+} -+ -+static void drm_vout_deinit(struct AVFormatContext * s) -+{ -+ drm_display_env_t * const de = s->priv_data; -+ -+ if (de->drm_fd >= 0) { -+ close(de->drm_fd); -+ de->drm_fd = -1; -+ } -+ -+} -+ -+ -+#define OFFSET(x) offsetof(drm_display_env_t, x) 
-+static const AVOption options[] = {
-+#if 0
-+    { "display_name", "set display name",       OFFSET(display_name), AV_OPT_TYPE_STRING,     {.str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_id",    "set existing window id", OFFSET(window_id),    AV_OPT_TYPE_INT64,      {.i64 = 0 }, 0, INT64_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_title", "set window title",       OFFSET(window_title), AV_OPT_TYPE_STRING,     {.str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,        {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,        {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+#endif
-+    { NULL }
-+};
-+
-+static const AVClass drm_vout_class = {
-+    .class_name = "drm vid outdev",
-+    .item_name  = av_default_item_name,
-+    .option     = options,
-+    .version    = LIBAVUTIL_VERSION_INT,
-+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
-+};
-+
-+AVOutputFormat ff_vout_drm_muxer = {
-+    .name           = "vout_drm",
-+    .long_name      = NULL_IF_CONFIG_SMALL("DRM video output device"),
-+    .priv_data_size = sizeof(drm_display_env_t),
-+    .audio_codec    = AV_CODEC_ID_NONE,
-+    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
-+    .write_header   = drm_vout_write_header,
-+    .write_packet   = drm_vout_write_packet,
-+    .write_uncoded_frame = drm_vout_write_frame,
-+    .write_trailer  = drm_vout_write_trailer,
-+    .control_message = drm_vout_control_message,
-+    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
-+    .priv_class     = &drm_vout_class,
-+    .init           = drm_vout_init,
-+    .deinit         = drm_vout_deinit,
-+};
-diff --git a/libavdevice/rpi_vout.c b/libavdevice/rpi_vout.c
-new file mode 100644
-index 0000000000..60fe8a7075
---- /dev/null
-+++ b/libavdevice/rpi_vout.c
-@@ -0,0 +1,534 @@
-+/*
-+ * Copyright (c) 2013 Jeff Moguillansky
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * Raspberry Pi (MMAL) video output device, adapted from the XVideo output device
-+ *
-+ * TODO:
-+ * - add support for more formats
-+ */
-+
-+#include "libavutil/opt.h"
-+#include "libavutil/avassert.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/imgutils.h"
-+#include "libavformat/internal.h"
-+#include "avdevice.h"
-+
-+#include <stdatomic.h>
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include <bcm_host.h>
-+#include <interface/mmal/mmal.h>
-+#include <interface/mmal/mmal_parameters_camera.h>
-+#include <interface/mmal/mmal_buffer.h>
-+#include <interface/mmal/util/mmal_util.h>
-+#include <interface/mmal/util/mmal_util_params.h>
-+#include <interface/mmal/util/mmal_default_components.h>
-+#include <interface/mmal/util/mmal_connection.h>
-+#include <interface/mmal/vc/mmal_vc_api.h>
-+#pragma GCC diagnostic pop
-+#include "libavutil/rpi_sand_fns.h"
-+#include "libavcodec/rpi_zc.h"
-+
-+#define TRACE_ALL 0
-+
-+#define RPI_DISPLAY_ALL 0
-+#define DISPLAY_PORT_DEPTH 4
-+
-+typedef struct rpi_display_env_s
-+{
-+    AVClass *class;
-+
-+    MMAL_COMPONENT_T* display;
-+    MMAL_COMPONENT_T* isp;
-+    MMAL_PORT_T * port_in;  // Input port of either isp or display depending on pipe setup
-+    MMAL_CONNECTION_T * conn;
-+
-+    MMAL_POOL_T *rpi_pool;
-+    atomic_int rpi_display_count;
-+
-+    MMAL_FOURCC_T req_fmt;
-+    MMAL_VIDEO_FORMAT_T req_vfmt;
-+
-+    AVZcEnvPtr zc;
-+
-+    int window_width, window_height;
-+    int window_x, window_y;
-+    int layer, fullscreen;
-+} rpi_display_env_t;
-+
-+
-+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
-+    mmal_buffer_header_release(buffer);
-+}
-+
-+static void display_cb_control(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
-+    mmal_buffer_header_release(buffer);
-+}
-+
-+
-+static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt)
-+{
-+    switch (fmt) {
-+    case AV_PIX_FMT_SAND128:
-+    case AV_PIX_FMT_RPI4_8:
-+        return MMAL_ENCODING_YUVUV128;
-+    case AV_PIX_FMT_RPI4_10:
-+        return MMAL_ENCODING_YUV10_COL;
-+    case AV_PIX_FMT_SAND64_10:
-+        return MMAL_ENCODING_YUVUV64_10;
-+    case AV_PIX_FMT_SAND64_16:
-+        return MMAL_ENCODING_YUVUV64_16;
-+    case AV_PIX_FMT_YUV420P:
-+        return MMAL_ENCODING_I420;
-+
-+    default:
-+        break;
-+    }
-+    return 0;
-+}
-+
-+
-+static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt,
-+                                       const AVFrame * const frame, const AVRpiZcRefPtr fr_ref)
-+{
-+    MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video;
-+    const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref);
-+    if (av_rpi_is_sand_format(geo->format)) {
-+        // Sand formats are a bit "special"
-+        // stride1 implicit in format
-+        // width = stride2
-+        vfmt->width = geo->stripe_is_yc ?
-+ geo->height_y + geo->height_c : geo->height_y; -+// es->height = geo->video_height; //*** When we get the FLAG this will change -+ vfmt->height = geo->height_y; -+ es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE; -+ } -+ else { -+ vfmt->width = geo->stride_y / geo->bytes_per_pel; -+ vfmt->height = geo->height_y; -+ es_fmt->flags = 0; -+ } -+ -+ es_fmt->type = MMAL_ES_TYPE_VIDEO; -+ es_fmt->encoding = mmfmt_from_avfmt(geo->format); -+ es_fmt->encoding_variant = 0; -+ es_fmt->bitrate = 0; -+ -+ vfmt->crop.x = frame->crop_left; -+ vfmt->crop.y = frame->crop_top; -+ vfmt->crop.width = av_frame_cropped_width(frame); -+ vfmt->crop.height = av_frame_cropped_height(frame); -+ -+ vfmt->frame_rate.den = 0; // Don't think I know it here -+ vfmt->frame_rate.num = 0; -+ -+ vfmt->par.den = frame->sample_aspect_ratio.den; -+ vfmt->par.num = frame->sample_aspect_ratio.num; -+ -+ vfmt->color_space = 0; // Unknown currently -+} -+ -+static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata) -+{ -+ rpi_display_env_t * const de = userdata; -+ if (buf->user_data != NULL) { -+ av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data); -+ buf->user_data = NULL; -+ } -+ atomic_fetch_add(&de->rpi_display_count, -1); -+ return MMAL_FALSE; -+} -+ -+static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt) -+{ -+ return avfmt == AV_PIX_FMT_SAND64_10; -+} -+ -+static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de) -+{ -+ if (de->isp != NULL) -+ { -+ if (de->isp->input[0]->is_enabled) -+ mmal_port_disable(de->isp->input[0]); -+ if (de->isp->control->is_enabled) -+ mmal_port_disable(de->isp->control); -+ } -+ if (de->conn != NULL) { -+ mmal_connection_destroy(de->conn); -+ de->conn = NULL; -+ } -+ if (de->isp != NULL) { -+ mmal_component_destroy(de->isp); -+ de->isp = NULL; -+ } -+} -+ -+static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) -+{ -+ MMAL_BUFFER_HEADER_T* buf = NULL; -+ AVRpiZcRefPtr fr_buf = NULL; -+ -+ if (de == NULL) -+ return; -+ -+ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { -+ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); -+ return; -+ } -+ -+ if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) { -+ return; -+ } -+ -+ buf = mmal_queue_get(de->rpi_pool->queue); -+ if (!buf) { -+ // Running too fast so drop the frame (unexpected) -+ goto fail; -+ } -+ -+ buf->cmd = 0; -+ buf->offset = 0; -+ buf->flags = 0; -+ mmal_buffer_header_reset(buf); -+ -+ atomic_fetch_add(&de->rpi_display_count, 1); // Deced on release -+ mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de); -+ -+ buf->user_data = fr_buf; -+ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal -+ buf->offset = av_rpi_zc_offset(fr_buf); -+ buf->length = av_rpi_zc_length(fr_buf); -+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+ -+#if RPI_DISPLAY_ALL -+ while (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { -+ usleep(5000); -+ } -+#endif -+ -+ { -+ MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}}; -+ MMAL_ES_FORMAT_T new_es = {.es = &new_ess}; -+ MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video; -+ -+ video_format_from_zc_frame(&new_es, fr, fr_buf); -+ if (de->req_fmt != new_es.encoding || -+ de->req_vfmt.width != new_vfmt->width || -+ de->req_vfmt.height != new_vfmt->height || -+ de->req_vfmt.crop.x != new_vfmt->crop.x || -+ de->req_vfmt.crop.y != new_vfmt->crop.y || -+ de->req_vfmt.crop.width != new_vfmt->crop.width 
|| -+ de->req_vfmt.crop.height != new_vfmt->crop.height) { -+ // Something has changed -+ -+ // If we have an ISP tear it down -+ isp_remove(s, de); -+ de->port_in = de->display->input[0]; -+ -+ // If we still need an ISP create it now -+ if (avfmt_needs_isp(fr->format)) -+ { -+ if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "ISP creation failed\n"); -+ goto fail; -+ } -+ de->port_in = de->isp->input[0]; -+ } -+ -+ mmal_format_copy(de->port_in->format, &new_es); -+ -+ if (mmal_port_format_commit(de->port_in)) { -+ av_log(s, AV_LOG_ERROR, "Failed to commit input format\n"); -+ goto fail; -+ } -+ -+ // If we have an ISP then we must want to use it -+ if (de->isp != NULL) { -+ MMAL_PORT_T * const port_out = de->isp->output[0]; -+ MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video; -+ MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video; -+ -+ port_out->format->type = MMAL_ES_TYPE_VIDEO; -+ port_out->format->encoding = MMAL_ENCODING_YUVUV128; -+ port_out->format->encoding_variant = 0; -+ port_out->format->bitrate = 0; -+ port_out->format->flags = 0; -+ port_out->format->extradata = NULL; -+ port_out->format->extradata_size = 0; -+ -+ vfmt_out->width = (vfmt_in->crop.width + 31) & ~31; -+ vfmt_out->height = (vfmt_in->crop.height + 15) & ~15; -+ vfmt_out->crop.x = 0; -+ vfmt_out->crop.y = 0; -+ vfmt_out->crop.width = vfmt_in->crop.width; -+ vfmt_out->crop.height = vfmt_in->crop.height; -+ vfmt_out->frame_rate = vfmt_in->frame_rate; -+ vfmt_out->par = vfmt_in->par; -+ vfmt_out->color_space = vfmt_in->color_space; -+ -+ if (mmal_port_format_commit(port_out)) { -+ av_log(s, AV_LOG_ERROR, "Failed to commit output format\n"); -+ goto fail; -+ } -+ -+ if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) { -+ av_log(s, AV_LOG_ERROR, "Failed to create connection\n"); -+ goto fail; -+ } -+ if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) { -+ av_log(s, AV_LOG_ERROR, "Failed to enable connection\n"); -+ goto fail; -+ } -+ mmal_port_enable(de->isp->control,display_cb_control); -+ mmal_component_enable(de->isp); -+ } -+ -+ // Number of slots in my port Q -+ de->port_in->buffer_num = DISPLAY_PORT_DEPTH; -+ // Size to keep it happy - isn't used for anything other than error checking -+ de->port_in->buffer_size = buf->alloc_size; -+ if (!de->port_in->is_enabled) -+ { -+ mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? 
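Note on the ISP output sizing above: the width is rounded up to a multiple of 32 and the height to a multiple of 16 before the crop rectangle is reapplied, matching the stripe granularity the hardware expects. A minimal sketch of that power-of-two alignment arithmetic (the helper name is illustrative, not from the patch):

    #include <assert.h>

    /* Round x up to the next multiple of a power-of-two alignment;
     * equivalent to the open-coded (w + 31) & ~31 and (h + 15) & ~15. */
    static unsigned int align_up(unsigned int x, unsigned int align)
    {
        assert((align & (align - 1)) == 0);   /* power of two only */
        return (x + align - 1) & ~(align - 1);
    }

    /* e.g. align_up(1918, 32) == 1920, align_up(1080, 16) == 1088 */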
-+ if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) { -+ av_log(s, AV_LOG_ERROR, "Failed to enable input port\n"); -+ goto fail; -+ } -+ } -+ -+ de->req_fmt = new_es.encoding; -+ de->req_vfmt = *new_vfmt; -+ } -+ } -+ -+ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); -+ goto fail; -+ } -+ return; -+ -+fail: -+ // If we have a buf then fr_buf is held by that -+ if (buf != NULL) -+ mmal_buffer_header_release(buf); -+ else if (fr_buf != NULL) -+ av_rpi_zc_unref(fr_buf); -+} -+ -+ -+static int xv_write_trailer(AVFormatContext *s) -+{ -+ rpi_display_env_t * const de = s->priv_data; -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ if (de->port_in != NULL && de->port_in->is_enabled) { -+ mmal_port_disable(de->port_in); -+ } -+ -+ // The above disable should kick out all buffers - check that -+ if (atomic_load(&de->rpi_display_count) != 0) { -+ av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count)); -+ } -+ -+ isp_remove(s, de); -+ if (de->rpi_pool != NULL) { -+ mmal_pool_destroy(de->rpi_pool); -+ de->rpi_pool = NULL; -+ } -+ if (de->display != NULL) { -+ mmal_component_destroy(de->display); -+ de->display = NULL; -+ } -+ -+ return 0; -+} -+ -+static int xv_write_header(AVFormatContext *s) -+{ -+ rpi_display_env_t * const de = s->priv_data; -+ const AVCodecParameters * const par = s->streams[0]->codecpar; -+ const unsigned int w = de->window_width ? de->window_width : par->width; -+ const unsigned int h = de->window_height ? de->window_height : par->height; -+ const unsigned int x = de->window_x; -+ const unsigned int y = de->window_y; -+ const int layer = de->layer ? de->layer : 2; -+ const MMAL_BOOL_T fullscreen = de->fullscreen; -+ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h); -+#endif -+ if ( s->nb_streams > 1 -+ || par->codec_type != AVMEDIA_TYPE_VIDEO -+ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { -+ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ { -+ MMAL_DISPLAYREGION_T region = -+ { -+ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, -+ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | -+ MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA, -+ .layer = layer, -+ .fullscreen = fullscreen, -+ .dest_rect = {x, y, w, h}, -+ .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS, -+ }; -+ -+ bcm_host_init(); // Needs to be done by someone... 
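The display path above self-limits with an atomic in-flight counter: display_frame() drops a frame once DISPLAY_PORT_DEPTH - 1 buffers are queued, the pre-release callback decrements the count, and xv_write_trailer() warns if anything is still outstanding at exit. The pattern in isolation (a simplified sketch, not the patch code; as in the original, the check and the increment are separate operations, so the depth is a soft cap):

    #include <stdatomic.h>

    #define DEPTH 4                      /* mirrors DISPLAY_PORT_DEPTH */
    static atomic_int in_flight;

    /* Returns 1 if the caller may queue a buffer, 0 to drop the frame. */
    static int claim_slot(void)
    {
        if (atomic_load(&in_flight) >= DEPTH - 1)
            return 0;                    /* queue full: drop */
        atomic_fetch_add(&in_flight, 1); /* undone in the release callback */
        return 1;
    }

    /* Called from the buffer's pre-release callback. */
    static void release_slot(void)
    {
        atomic_fetch_add(&in_flight, -1);
    }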
-+ -+ if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to create display component\n"); -+ goto fail; -+ } -+ de->port_in = de->display->input[0]; -+ -+ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); -+ -+ if (mmal_component_enable(de->display) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to enable display component\n"); -+ goto fail; -+ } -+ if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n"); -+ goto fail; -+ } -+ -+ if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL) -+ { -+ av_log(s, AV_LOG_ERROR, "Failed to create pool\n"); -+ goto fail; -+ } -+ } -+ -+ return 0; -+ -+fail: -+ xv_write_trailer(s); -+ return AVERROR_UNKNOWN; -+} -+ -+static int xv_write_packet(AVFormatContext *s, AVPacket *pkt) -+{ -+ AVFrame * const frame = (AVFrame *)pkt->data; -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s\n", __func__); -+#endif -+ display_frame(s, s->priv_data, frame); -+ return 0; -+} -+ -+static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, -+ unsigned flags) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); -+#endif -+ -+ /* xv_write_header() should have accepted only supported formats */ -+ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) -+ return 0; -+// return write_picture(s, (*frame)->data, (*frame)->linesize); -+ -+ display_frame(s, s->priv_data, *ppframe); -+ return 0; -+} -+ -+static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size) -+{ -+#if TRACE_ALL -+ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); -+#endif -+ switch(type) { -+ case AV_APP_TO_DEV_WINDOW_REPAINT: -+ return 0; -+ default: -+ break; -+ } -+ return AVERROR(ENOSYS); -+} -+ -+// deinit is called if init fails so no need to clean up explicity here -+static int rpi_vout_init(struct AVFormatContext * s) -+{ -+ rpi_display_env_t * const de = s->priv_data; -+ -+ // Get a ZC context in case we need one - has little overhead if unused -+ if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL) -+ return 1; -+ -+ return 0; -+} -+ -+static void rpi_vout_deinit(struct AVFormatContext * s) -+{ -+ rpi_display_env_t * const de = s->priv_data; -+ -+ av_rpi_zc_int_env_freep(&de->zc); -+} -+ -+ -+#define OFFSET(x) offsetof(rpi_display_env_t, x) -+static const AVOption options[] = { -+ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, -+ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, -+ { NULL } -+ -+}; -+ -+static const AVClass xv_class = { -+ .class_name = "rpi vid outdev", -+ .item_name = av_default_item_name, -+ .option = options, -+ .version = LIBAVUTIL_VERSION_INT, -+ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, -+}; -+ -+AVOutputFormat ff_vout_rpi_muxer = { -+ .name = "vout_rpi", -+ .long_name = NULL_IF_CONFIG_SMALL("Rpi 
(mmal) video output device"), -+ .priv_data_size = sizeof(rpi_display_env_t), -+ .audio_codec = AV_CODEC_ID_NONE, -+ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, -+ .write_header = xv_write_header, -+ .write_packet = xv_write_packet, -+ .write_uncoded_frame = xv_write_frame, -+ .write_trailer = xv_write_trailer, -+ .control_message = xv_control_message, -+ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, -+ .priv_class = &xv_class, -+ .init = rpi_vout_init, -+ .deinit = rpi_vout_deinit, -+}; -diff --git a/libavfilter/Makefile b/libavfilter/Makefile -index 455c809b15..087cab98ee 100644 ---- a/libavfilter/Makefile -+++ b/libavfilter/Makefile -@@ -406,6 +406,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) += vf_transpose_opencl.o opencl.o o - OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o - OBJS-$(CONFIG_TRIM_FILTER) += trim.o - OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o -+OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o - OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o - OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ - opencl/unsharp.o -diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c -index 04a3df7d56..c27426c015 100644 ---- a/libavfilter/allfilters.c -+++ b/libavfilter/allfilters.c -@@ -386,6 +386,7 @@ extern AVFilter ff_vf_transpose_opencl; - extern AVFilter ff_vf_transpose_vaapi; - extern AVFilter ff_vf_trim; - extern AVFilter ff_vf_unpremultiply; -+extern AVFilter ff_vf_unsand; - extern AVFilter ff_vf_unsharp; - extern AVFilter ff_vf_unsharp_opencl; - extern AVFilter ff_vf_uspp; -diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c -index a149f8fb6d..776e3bb9ab 100644 ---- a/libavfilter/avfiltergraph.c -+++ b/libavfilter/avfiltergraph.c -@@ -32,6 +32,9 @@ - #include "libavutil/internal.h" - #include "libavutil/opt.h" - #include "libavutil/pixdesc.h" -+#if CONFIG_UNSAND_FILTER -+#include "libavutil/rpi_sand_fns.h" -+#endif - - #define FF_INTERNAL_FIELDS 1 - #include "framequeue.h" -@@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFormats *a_arg, - } - } - -+#if CONFIG_UNSAND_FILTER -+static int has_sand_format(const AVFilterFormats * const ff) -+{ -+ int i; -+ for (i = 0; i != ff->nb_formats; ++i) { -+ if (av_rpi_is_sand_format(ff->formats[i])) { -+ return 1; -+ } -+ } -+ return 0; -+} -+#endif -+ - /** - * Perform one round of query_formats() and merging formats lists on the - * filter graph. -@@ -467,6 +483,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - for (j = 0; j < filter->nb_inputs; j++) { - AVFilterLink *link = filter->inputs[j]; - int convert_needed = 0; -+ unsigned int extra_convert_tried = 0; - - if (!link) - continue; -@@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - ) - #undef MERGE_DISPATCH - -- if (convert_needed) { -+ while (convert_needed) { - AVFilterContext *convert; - const AVFilter *filter; - AVFilterLink *inlink, *outlink; - char inst_name[30]; -+ int can_retry = 0; -+ -+ convert_needed = 0; - - if (graph->disable_auto_convert) { - av_log(log_ctx, AV_LOG_ERROR, -@@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - /* couldn't merge format lists. 
auto-insert conversion filter */ - switch (link->type) { - case AVMEDIA_TYPE_VIDEO: -- if (!(filter = avfilter_get_by_name("scale"))) { -- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " -- "not present, cannot convert pixel formats.\n"); -- return AVERROR(EINVAL); -- } -- -- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", -- scaler_count++); -+#if CONFIG_UNSAND_FILTER -+ // Only try each extra conversion once -+ // The unsand output pad should never trigger has_sand_format -+ // but it is better to be safe -+ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) { -+ if (!(filter = avfilter_get_by_name("unsand"))) { -+ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter " -+ "not present, cannot convert pixel formats.\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d", -+ scaler_count++); -+ -+ if ((ret = avfilter_graph_create_filter(&convert, filter, -+ inst_name, "", NULL, -+ graph)) < 0) -+ return ret; - -- if ((ret = avfilter_graph_create_filter(&convert, filter, -- inst_name, graph->scale_sws_opts, NULL, -- graph)) < 0) -- return ret; -+ extra_convert_tried |= 1; -+ can_retry = 1; -+ } -+ else -+#endif -+ { -+ if (!(filter = avfilter_get_by_name("scale"))) { -+ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " -+ "not present, cannot convert pixel formats.\n"); -+ return AVERROR(EINVAL); -+ } -+ -+ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", -+ scaler_count++); -+ -+ if ((ret = avfilter_graph_create_filter(&convert, filter, -+ inst_name, graph->scale_sws_opts, NULL, -+ graph)) < 0) -+ return ret; -+ } - break; - case AVMEDIA_TYPE_AUDIO: - if (!(filter = avfilter_get_by_name("aresample"))) { -@@ -585,9 +631,19 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) - av_assert0(outlink-> in_channel_layouts->refcount > 0); - av_assert0(outlink->out_channel_layouts->refcount > 0); - } -- if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type) || -- !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) -+ // If we have added an extra filter we must merge the input -+ // side but we can have another go at the output -+ if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type)) -+ ret = AVERROR(ENOSYS); -+ else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) -+ { -+ if (can_retry) { -+ link = outlink; -+ convert_needed = 1; -+ continue; -+ } - ret = AVERROR(ENOSYS); -+ } - if (inlink->type == AVMEDIA_TYPE_AUDIO && - (!ff_merge_samplerates(inlink->in_samplerates, - inlink->out_samplerates) || -diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c -index e0ff7e4dd8..77bc3d83fe 100644 ---- a/libavfilter/buffersrc.c -+++ b/libavfilter/buffersrc.c -@@ -213,7 +213,7 @@ static int av_buffersrc_add_frame_internal(AVFilterContext *ctx, - - switch (ctx->outputs[0]->type) { - case AVMEDIA_TYPE_VIDEO: -- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, -+ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), - frame->format, frame->pts); - break; - case AVMEDIA_TYPE_AUDIO: -diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c -new file mode 100644 -index 0000000000..fbea56dd09 ---- /dev/null -+++ b/libavfilter/vf_unsand.c -@@ -0,0 +1,234 @@ -+/* -+ * Copyright (c) 2007 Bobby Bingham -+ * -+ * This file is part of FFmpeg. 
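The avfiltergraph change above turns the one-shot `if (convert_needed)` into a loop: a cheap "unsand" filter is auto-inserted first (once only, tracked by extra_convert_tried), and if the output side of the new link still fails to merge, the loop retries conversion on that outlink before falling back to the usual "scale" insertion. A toy model of that control flow with the merge step abstracted away (names here are illustrative, not FFmpeg APIs):

    /* Toy model of the retry loop added to query_formats(): try the
     * cheap sand-unpacking conversion once, then fall back to scale. */
    enum conv { CONV_UNSAND, CONV_SCALE };

    static int negotiate(int input_is_sand, int (*merge_output)(enum conv))
    {
        int tried_unsand = 0;
        for (;;) {
            enum conv c = (!tried_unsand && input_is_sand) ? CONV_UNSAND
                                                           : CONV_SCALE;
            if (c == CONV_UNSAND)
                tried_unsand = 1;
            if (merge_output(c))
                return 0;       /* formats merged across the new link */
            if (c == CONV_SCALE)
                return -1;      /* no further conversion to try */
            /* unsand inserted but outlink still mismatched: retry */
        }
    }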
-+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ -+ -+/** -+ * @file -+ * format and noformat video filters -+ */ -+ -+#include -+ -+#include "libavutil/internal.h" -+#include "libavutil/mem.h" -+#include "libavutil/pixdesc.h" -+#include "libavutil/opt.h" -+#include "libavutil/rpi_sand_fns.h" -+ -+#include "avfilter.h" -+#include "formats.h" -+#include "internal.h" -+#include "video.h" -+ -+typedef struct UnsandContext { -+ const AVClass *class; -+} UnsandContext; -+ -+static av_cold void uninit(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+} -+ -+static av_cold int init(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+ -+ return 0; -+} -+ -+ -+static int filter_frame(AVFilterLink *link, AVFrame *in) -+{ -+ AVFilterLink * const outlink = link->dst->outputs[0]; -+ AVFrame *out = NULL; -+ int rv = 0; -+ -+ if (outlink->format == in->format) { -+ // If nothing to do then do nothing -+ out = in; -+ } -+ else -+ { -+ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) -+ { -+ rv = AVERROR(ENOMEM); -+ goto fail; -+ } -+ if (av_rpi_sand_to_planar_frame(out, in) != 0) -+ { -+ rv = -1; -+ goto fail; -+ } -+ -+ av_frame_free(&in); -+ } -+ -+ return ff_filter_frame(outlink, out); -+ -+fail: -+ av_frame_free(&out); -+ av_frame_free(&in); -+ return rv; -+} -+ -+#if 0 -+static void dump_fmts(const AVFilterFormats * fmts) -+{ -+ int i; -+ if (fmts== NULL) { -+ printf("NULL\n"); -+ return; -+ } -+ for (i = 0; i < fmts->nb_formats; ++i) { -+ printf(" %d", fmts->formats[i]); -+ } -+ printf("\n"); -+} -+#endif -+ -+static int query_formats(AVFilterContext *ctx) -+{ -+// UnsandContext *s = ctx->priv; -+ int ret; -+ -+ // If we aren't connected at both ends then just do nothing -+ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) -+ return 0; -+ -+// printf("Unsand: %s in: ", __func__); -+// dump_fmts(ctx->inputs[0]->in_formats); -+// printf("Unsand: %s out: ", __func__); -+// dump_fmts(ctx->outputs[0]->out_formats); -+ -+ // Our output formats depend on our input formats and we can't/don't -+ // want to convert between bit depths so we need to wait for the source -+ // to have an opinion before we do -+ if (ctx->inputs[0]->in_formats == NULL) -+ return AVERROR(EAGAIN); -+ -+ // Accept anything -+ if (ctx->inputs[0]->out_formats == NULL && -+ (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0) -+ return ret; -+ -+ // Filter out sand formats -+ -+ // Generate a container if we don't already have one -+ if (ctx->outputs[0]->in_formats == NULL) -+ { -+ // Somewhat rubbish way of ensuring we have a good structure -+ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; -+ AVFilterFormats *formats = ff_make_format_list(out_fmts); -+ -+ if (formats 
== NULL) -+ return AVERROR(ENOMEM); -+ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0) -+ return ret; -+ } -+ -+ // Replace old format list with new filtered list derived from what our -+ // input says it can do -+ { -+ const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats; -+ AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats; -+ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); -+ int i; -+ int n = 0; -+ int seen_420p = 0; -+ int seen_420p10 = 0; -+ -+ for (i = 0; i < src_ff->nb_formats; ++i) { -+ const enum AVPixelFormat f = src_ff->formats[i]; -+ -+ switch (f){ -+ case AV_PIX_FMT_YUV420P: -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ if (!seen_420p) { -+ seen_420p = 1; -+ dst_fmts[n++] = AV_PIX_FMT_YUV420P; -+ } -+ break; -+ case AV_PIX_FMT_SAND64_10: -+ case AV_PIX_FMT_YUV420P10: -+ case AV_PIX_FMT_RPI4_10: -+ if (!seen_420p10) { -+ seen_420p10 = 1; -+ dst_fmts[n++] = AV_PIX_FMT_YUV420P10; -+ } -+ break; -+ default: -+ dst_fmts[n++] = f; -+ break; -+ } -+ } -+ -+ av_freep(&dst_ff->formats); -+ dst_ff->formats = dst_fmts; -+ dst_ff->nb_formats = n; -+ } -+ -+// printf("Unsand: %s calc: ", __func__); -+// dump_fmts(ctx->outputs[0]->in_formats); -+ -+ return 0; -+} -+ -+ -+#define OFFSET(x) offsetof(UnsandContext, x) -+static const AVOption unsand_options[] = { -+ { NULL } -+}; -+ -+ -+AVFILTER_DEFINE_CLASS(unsand); -+ -+static const AVFilterPad avfilter_vf_unsand_inputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO, -+ .filter_frame = filter_frame, -+ }, -+ { NULL } -+}; -+ -+static const AVFilterPad avfilter_vf_unsand_outputs[] = { -+ { -+ .name = "default", -+ .type = AVMEDIA_TYPE_VIDEO -+ }, -+ { NULL } -+}; -+ -+AVFilter ff_vf_unsand = { -+ .name = "unsand", -+ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), -+ -+ .init = init, -+ .uninit = uninit, -+ -+ .query_formats = query_formats, -+ -+ .priv_size = sizeof(UnsandContext), -+ .priv_class = &unsand_class, -+ -+ .inputs = avfilter_vf_unsand_inputs, -+ .outputs = avfilter_vf_unsand_outputs, -+}; -+ -diff --git a/libavformat/utils.c b/libavformat/utils.c -index 6c6f4e1bd1..c6332d3e46 100644 ---- a/libavformat/utils.c -+++ b/libavformat/utils.c -@@ -3013,6 +3013,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) - return 1; - } - -+#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER -+// This should be quite general purpose but avoid possible conflicts -+// by limiting usage to cases wehere we know it works. -+static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts) -+{ -+ // Only try fallback if we know it is supported (HEVC only) -+ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL : -+ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE); -+ int err; -+ -+ // Failed to find fallback or we are already at the fallback -+ if (new_codec == NULL || new_codec == old_codec) -+ { -+ return AVERROR_DECODER_NOT_FOUND; -+ } -+ -+ // * This may be dodgy - header says to not use this fn, -+ // especially if we are going to reopen the context... 
-+ // (but it does seem to work for our cases) -+ if (avcodec_is_open(avctx)) { -+ avcodec_close(avctx); -+ } -+ -+ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0) -+ { -+ return err; -+ } -+ -+ return 0; -+} -+#else -+#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND) -+#endif -+ - /* returns 1 or 0 if or if not decoded data was returned, or a negative error */ - static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, - AVDictionary **options) -@@ -3047,7 +3081,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, - av_dict_set(options ? options : &thread_opt, "threads", "1", 0); - if (s->codec_whitelist) - av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0); -- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt); -+ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND) -+ { -+ // Try fallback if if looks worth a try -+ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt); -+ } - if (!options) - av_dict_free(&thread_opt); - if (ret < 0) { -@@ -3078,6 +3116,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, - if (avctx->codec_type == AVMEDIA_TYPE_VIDEO || - avctx->codec_type == AVMEDIA_TYPE_AUDIO) { - ret = avcodec_send_packet(avctx, &pkt); -+ -+ // If we are going to want to fall back we should know here -+ if (ret == AVERROR_DECODER_NOT_FOUND) { -+ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0) -+ break; -+ continue; -+ } -+ - if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) - break; - if (ret >= 0) -@@ -3671,9 +3717,20 @@ FF_ENABLE_DEPRECATION_WARNINGS - // Try to just open decoders, in case this is enough to get parameters. - if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) { - if (codec && !avctx->codec) -- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0) -- av_log(ic, AV_LOG_WARNING, -- "Failed to open codec in %s\n",__FUNCTION__); -+ { -+ int err; -+ -+ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0) -+ { -+ if (err == AVERROR_DECODER_NOT_FOUND) { -+ err = try_fallback_decoder(avctx, codec, options ? 
&options[i] : &thread_opt); -+ } -+ if (err < 0) { -+ av_log(ic, AV_LOG_WARNING, -+ "Failed to open codec in %s\n",__FUNCTION__); -+ } -+ } -+ } - } - if (!options) - av_dict_free(&thread_opt); -diff --git a/libavutil/Makefile b/libavutil/Makefile -index 8a7a44e4b5..4b4b27a28a 100644 ---- a/libavutil/Makefile -+++ b/libavutil/Makefile -@@ -65,6 +65,7 @@ HEADERS = adler32.h \ - rational.h \ - replaygain.h \ - ripemd.h \ -+ rpi_sand_fns.h \ - samplefmt.h \ - sha.h \ - sha512.h \ -@@ -82,6 +83,7 @@ HEADERS = adler32.h \ - tx.h \ - - HEADERS-$(CONFIG_LZO) += lzo.h -+HEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h - - ARCH_HEADERS = bswap.h \ - intmath.h \ -@@ -170,9 +172,11 @@ OBJS-$(CONFIG_LZO) += lzo.o - OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o - OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o - OBJS-$(CONFIG_QSV) += hwcontext_qsv.o -+OBJS-$(CONFIG_SAND) += rpi_sand_fns.o - OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o - OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o - OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o -+OBJS-$(CONFIG_RPI) += rpi_sand_fns.o - - OBJS += $(COMPAT_OBJS:%=../compat/%) - -diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile -index 5da44b0542..b74b7c4e2f 100644 ---- a/libavutil/arm/Makefile -+++ b/libavutil/arm/Makefile -@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ - - NEON-OBJS += arm/float_dsp_init_neon.o \ - arm/float_dsp_neon.o \ -+ arm/rpi_sand_neon.o \ -diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S -new file mode 100644 -index 0000000000..750af9064f ---- /dev/null -+++ b/libavutil/arm/rpi_sand_neon.S -@@ -0,0 +1,69 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
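The libavformat changes above all follow one pattern: open the preferred decoder, and if that fails with AVERROR_DECODER_NOT_FOUND (the rpi HEVC decoder declining the stream), reopen with the plain software decoder located via avcodec_find_decoder_by_id_and_fmt(). Condensed into a single helper (a sketch; the real patch repeats this inline at each call site):

    #include "libavcodec/avcodec.h"

    /* Sketch of the open-with-fallback pattern used in try_decode_frame()
     * and avformat_find_stream_info(); try_fallback_decoder() is the
     * helper added by the patch above. */
    static int open_with_fallback(AVCodecContext *avctx, const AVCodec *codec,
                                  AVDictionary **opts)
    {
        int ret = avcodec_open2(avctx, codec, opts);
        if (ret != AVERROR_DECODER_NOT_FOUND)
            return ret;
        return try_fallback_decoder(avctx, codec, opts);
    }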
-+ -+Authors: John Cox -+*/ -+ -+#include "libavutil/arm/asm.S" -+ -+@ void rpi_sand128b_stripe_to_8_10( -+@ uint8_t * dest, [r0] -+@ const uint8_t * src1, [r1] -+@ const uint8_t * src2, [r2] -+@ unsigned int lines); [r3] -+ -+.macro stripe2_to_8, bit_depth -+ vpush {q4-q7} -+1: -+ vldm r1!, {q0-q7} -+ subs r3, #1 -+ vldm r2!, {q8-q15} -+ vqrshrn.u16 d0, q0, #\bit_depth - 8 -+ vqrshrn.u16 d1, q1, #\bit_depth - 8 -+ vqrshrn.u16 d2, q2, #\bit_depth - 8 -+ vqrshrn.u16 d3, q3, #\bit_depth - 8 -+ vqrshrn.u16 d4, q4, #\bit_depth - 8 -+ vqrshrn.u16 d5, q5, #\bit_depth - 8 -+ vqrshrn.u16 d6, q6, #\bit_depth - 8 -+ vqrshrn.u16 d7, q7, #\bit_depth - 8 -+ vqrshrn.u16 d8, q8, #\bit_depth - 8 -+ vqrshrn.u16 d9, q9, #\bit_depth - 8 -+ vqrshrn.u16 d10, q10, #\bit_depth - 8 -+ vqrshrn.u16 d11, q11, #\bit_depth - 8 -+ vqrshrn.u16 d12, q12, #\bit_depth - 8 -+ vqrshrn.u16 d13, q13, #\bit_depth - 8 -+ vqrshrn.u16 d14, q14, #\bit_depth - 8 -+ vqrshrn.u16 d15, q15, #\bit_depth - 8 -+ vstm r0!, {q0-q7} -+ bne 1b -+ vpop {q4-q7} -+ bx lr -+.endm -+ -+function rpi_sand128b_stripe_to_8_10, export=1 -+ stripe2_to_8 10 -+endfunc -+ -diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 8d1aa5fa84..937fb29a47 100644 ---- a/libavutil/buffer.c -+++ b/libavutil/buffer.c -@@ -272,6 +272,19 @@ static void buffer_pool_free(AVBufferPool *pool) - av_freep(&pool); - } - -+void av_buffer_pool_flush(AVBufferPool *pool) -+{ -+ ff_mutex_lock(&pool->mutex); -+ while (pool->pool) { -+ BufferPoolEntry *buf = pool->pool; -+ pool->pool = buf->next; -+ -+ buf->free(buf->opaque, buf->data); -+ av_freep(&buf); -+ } -+ ff_mutex_unlock(&pool->mutex); -+} -+ - void av_buffer_pool_uninit(AVBufferPool **ppool) - { - AVBufferPool *pool; -@@ -355,3 +368,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) - - return ret; - } -+ -+// Return the opaque for the underlying frame (gives us a GPU_MEM_PTR_T) -+void *av_buffer_pool_opaque(AVBufferRef *ref) { -+ BufferPoolEntry *buf = av_buffer_get_opaque(ref); -+ return buf->opaque; -+} -diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 73b6bd0b14..f0210cae47 100644 ---- a/libavutil/buffer.h -+++ b/libavutil/buffer.h -@@ -266,6 +266,11 @@ AVBufferPool *av_buffer_pool_init2(int size, void *opaque, - AVBufferRef* (*alloc)(void *opaque, int size), - void (*pool_free)(void *opaque)); - -+/** -+ * Free all available buffers in a buffer pool. -+ */ -+ void av_buffer_pool_flush(AVBufferPool *pool); -+ - /** - * Mark the pool as being available for freeing. It will actually be freed only - * once all the allocated buffers associated with the pool are released. 
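The NEON routine above narrows two 10-bit sand stripes to 8 bits in bulk; vqrshrn.u16 #2 is a rounding right shift with unsigned saturation on the narrow. A scalar model of the per-sample operation (for reference; the plain C path uses the non-saturating cpy16_to_8() defined later in rpi_sand_fns.c):

    #include <stdint.h>

    /* Scalar equivalent of vqrshrn.u16 #(bit_depth - 8) at bit_depth 10:
     * add half the divisor, shift, then saturate to the 8-bit range. */
    static uint8_t narrow_10_to_8(uint16_t v)
    {
        uint16_t r = (v + 2) >> 2;            /* rounding shift by 2 */
        return r > 255 ? 255 : (uint8_t)r;    /* saturation from the 'q' */
    }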
Thus it -@@ -284,6 +289,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); - */ - AVBufferRef *av_buffer_pool_get(AVBufferPool *pool); - -+// Return the opaque for the underlying frame -+void *av_buffer_pool_opaque(AVBufferRef *ref); -+ - /** - * @} - */ -diff --git a/libavutil/frame.c b/libavutil/frame.c -index dcf1fc3d17..dd0876f5a9 100644 ---- a/libavutil/frame.c -+++ b/libavutil/frame.c -@@ -16,6 +16,8 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include "config.h" -+ - #include "channel_layout.h" - #include "avassert.h" - #include "buffer.h" -@@ -25,6 +27,9 @@ - #include "imgutils.h" - #include "mem.h" - #include "samplefmt.h" -+#if CONFIG_SAND -+#include "rpi_sand_fns.h" -+#endif - - #if FF_API_FRAME_GET_SET - MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) -@@ -893,6 +898,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) - (frame->crop_top + frame->crop_bottom) >= frame->height) - return AVERROR(ERANGE); - -+#if CONFIG_SAND -+ // Sand cannot be cropped - do not try -+ if (av_rpi_is_sand_format(frame->format)) -+ return 0; -+#endif -+ - desc = av_pix_fmt_desc_get(frame->format); - if (!desc) - return AVERROR_BUG; -diff --git a/libavutil/frame.h b/libavutil/frame.h -index 5d3231e7bb..e250f420a2 100644 ---- a/libavutil/frame.h -+++ b/libavutil/frame.h -@@ -964,6 +964,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); - */ - const char *av_frame_side_data_name(enum AVFrameSideDataType type); - -+ -+static inline int av_frame_cropped_width(const AVFrame * const frame) -+{ -+ return frame->width - (frame->crop_left + frame->crop_right); -+} -+static inline int av_frame_cropped_height(const AVFrame * const frame) -+{ -+ return frame->height - (frame->crop_top + frame->crop_bottom); -+} -+ - /** - * @} - */ -diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c -index 32cbde82eb..6f46fd7b6e 100644 ---- a/libavutil/hwcontext_drm.c -+++ b/libavutil/hwcontext_drm.c -@@ -21,6 +21,7 @@ - #include - - #include -+#include - #include - - #include "avassert.h" -@@ -28,7 +29,7 @@ - #include "hwcontext_drm.h" - #include "hwcontext_internal.h" - #include "imgutils.h" -- -+#include "libavutil/rpi_sand_fns.h" - - static void drm_device_free(AVHWDeviceContext *hwdev) - { -@@ -43,6 +44,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, - AVDRMDeviceContext *hwctx = hwdev->hwctx; - drmVersionPtr version; - -+ if (device == NULL) { -+ hwctx->fd = -1; -+ return 0; -+ } -+ - hwctx->fd = open(device, O_RDWR); - if (hwctx->fd < 0) - return AVERROR(errno); -@@ -120,6 +126,9 @@ static int drm_map_frame(AVHWFramesContext *hwfc, - if (flags & AV_HWFRAME_MAP_WRITE) - mmap_prot |= PROT_WRITE; - -+ if (dst->format == AV_PIX_FMT_NONE) -+ dst->format = hwfc->sw_format; -+ - av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES); - for (i = 0; i < desc->nb_objects; i++) { - addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED, -@@ -151,6 +160,22 @@ static int drm_map_frame(AVHWFramesContext *hwfc, - - dst->width = src->width; - dst->height = src->height; -+ dst->crop_top = src->crop_top; -+ dst->crop_bottom = src->crop_bottom; -+ dst->crop_left = src->crop_left; -+ dst->crop_right = src->crop_right; -+ -+ // Rework for sand frames -+ if (av_rpi_is_sand_frame(dst)) { -+ // As it stands the sand formats hold stride2 in linesize[3] -+ // linesize[0] & [1] contain stride1 which is always 128 for everything we do -+ // * Arguably this should be reworked s.t. 
stride2 is in linesize[0] & [1] -+ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); -+ dst->linesize[0] = 128; -+ dst->linesize[1] = 128; -+ // *** Are we sure src->height is actually what we want ??? -+ } -+ - - err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, - &drm_unmap_frame, map); -@@ -178,7 +203,12 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, - if (!pix_fmts) - return AVERROR(ENOMEM); - -- pix_fmts[0] = ctx->sw_format; -+ // **** Offer native sand too ???? -+ pix_fmts[0] = ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? -+ AV_PIX_FMT_YUV420P : -+ ctx->sw_format == AV_PIX_FMT_RPI4_10 ? -+ AV_PIX_FMT_YUV420P10LE : -+ ctx->sw_format; - pix_fmts[1] = AV_PIX_FMT_NONE; - - *formats = pix_fmts; -@@ -197,18 +227,79 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, - map = av_frame_alloc(); - if (!map) - return AVERROR(ENOMEM); -- map->format = dst->format; - -+ // Map to default -+ map->format = AV_PIX_FMT_NONE; - err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); - if (err) - goto fail; - -- map->width = dst->width; -- map->height = dst->height; -+#if 0 -+ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, -+ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, -+ map->width, map->height, -+ map->linesize[0], -+ map->linesize[1], -+ map->linesize[2], -+ map->linesize[3], -+ dst->width, dst->height, -+ dst->linesize[0], -+ dst->linesize[1], -+ dst->linesize[2]); -+#endif -+ if (av_rpi_is_sand_frame(map)) { -+ unsigned int stride2 = map->linesize[3]; -+ const unsigned int w = FFMIN(dst->width, av_frame_cropped_width(map)); -+ const unsigned int h = FFMIN(dst->height, av_frame_cropped_height(map)); -+ -+ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ map->crop_left, map->crop_top, -+ w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ map->crop_left / 2, map->crop_top / 2, -+ w / 2, h / 2); -+ } -+ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { -+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -+ map->data[0], -+ 128, stride2, -+ map->crop_left, map->crop_top, -+ w, h); // *** ??? crop -+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ map->data[1], -+ 128, stride2, -+ map->crop_left / 2, map->crop_top / 2, -+ w / 2, h / 2); -+ } -+ else -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); -+ err = AVERROR(EINVAL); -+ goto fail; -+ } -+ -+ dst->width = w; -+ dst->height = h; -+ } -+ else { -+ // Kludge mapped h/w s.t. 
frame_copy works -+ map->width = dst->width; -+ map->height = dst->height; -+ err = av_frame_copy(dst, map); -+ } - -- err = av_frame_copy(dst, map); - if (err) -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); - goto fail; -+ } - - err = 0; - fail: -@@ -222,8 +313,13 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, - AVFrame *map; - int err; - -+ av_log(hwfc, AV_LOG_INFO, "<<< %s\n", __func__); -+ - if (src->width > hwfc->width || src->height > hwfc->height) -+ { -+ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); - return AVERROR(EINVAL); -+ } - - map = av_frame_alloc(); - if (!map) -diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index b97b0665b0..bacb571a40 100644 ---- a/libavutil/pixdesc.c -+++ b/libavutil/pixdesc.c -@@ -2050,6 +2050,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { - .name = "cuda", - .flags = AV_PIX_FMT_FLAG_HWACCEL, - }, -+ [AV_PIX_FMT_RPI] = { -+ .name = "rpi", -+ .flags = AV_PIX_FMT_FLAG_HWACCEL, -+ }, -+ [AV_PIX_FMT_RPI4_10] = { -+ .name = "rpi", -+ .flags = AV_PIX_FMT_FLAG_HWACCEL, -+ }, -+ [AV_PIX_FMT_RPI4_8] = { -+ .name = "rpi", -+ .flags = AV_PIX_FMT_FLAG_HWACCEL, -+ }, - [AV_PIX_FMT_AYUV64LE] = { - .name = "ayuv64le", - .nb_components = 4, -@@ -2344,6 +2356,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { - }, - .flags = AV_PIX_FMT_FLAG_PLANAR, - }, -+ [AV_PIX_FMT_SAND128] = { -+ .name = "sand128", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ -+ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ -+ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ -+ }, -+ .flags = 0, -+ }, -+ [AV_PIX_FMT_SAND64_10] = { -+ .name = "sand64_10", -+ .nb_components = 3, -+ .log2_chroma_w = 1, -+ .log2_chroma_h = 1, -+ .comp = { -+ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ -+ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ -+ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ -+ }, -+ .flags = 0, -+ }, - }; - #if FF_API_PLUS1_MINUS1 - FF_ENABLE_DEPRECATION_WARNINGS -diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 8b54c9415b..891ea65f03 100644 ---- a/libavutil/pixfmt.h -+++ b/libavutil/pixfmt.h -@@ -234,6 +234,11 @@ enum AVPixelFormat { - */ - AV_PIX_FMT_CUDA, - -+ /** -+ * HW acceleration through RPI. -+ */ -+ AV_PIX_FMT_RPI, -+ - AV_PIX_FMT_0RGB, ///< packed RGB 8:8:8, 32bpp, XRGBXRGB... X=unused/undefined - AV_PIX_FMT_RGB0, ///< packed RGB 8:8:8, 32bpp, RGBXRGBX... X=unused/undefined - AV_PIX_FMT_0BGR, ///< packed BGR 8:8:8, 32bpp, XBGRXBGR... 
X=unused/undefined -@@ -348,6 +353,13 @@ enum AVPixelFormat { - AV_PIX_FMT_NV24, ///< planar YUV 4:4:4, 24bpp, 1 plane for Y and 1 plane for the UV components, which are interleaved (first byte U and the following byte V) - AV_PIX_FMT_NV42, ///< as above, but U and V bytes are swapped - -+// RPI - not on ifdef so can be got at by calling progs -+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding -+ AV_PIX_FMT_RPI4_8, -+ AV_PIX_FMT_RPI4_10, -+ - AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions - }; - -diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h -new file mode 100644 -index 0000000000..3133fe41ac ---- /dev/null -+++ b/libavutil/rpi_sand_fn_pw.h -@@ -0,0 +1,211 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
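The sand formats declared above keep the image as fixed-width vertical stripes laid end to end: stride1 is the stripe width in bytes (128 for sand128) and stride2, carried in linesize[3], is the number of rows in a stripe. A pixel address therefore splits into an in-stripe column and a whole-stripe offset, as in this sketch of the luma addressing (matching the av_rpi_sand_frame_off_y() inline defined later in rpi_sand_fns.h):

    /* Byte offset of luma byte x in row y of a sand buffer.
     * Assumes stride1 is a power of two. */
    static unsigned int sand_off_y(unsigned int x, unsigned int y,
                                   unsigned int stride1, unsigned int stride2)
    {
        unsigned int x1 = x & (stride1 - 1);    /* column within the stripe */
        unsigned int x2 = x & ~(stride1 - 1);   /* stripe start, in bytes */
        return x1 + stride1 * y + stride2 * x2; /* each stripe occupies
                                                   stride1 * stride2 bytes */
    }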
-+ -+Authors: John Cox -+*/ -+ -+// * Included twice from rpi_sand_fn with different PW -+ -+#define STRCAT(x,y) x##y -+ -+#if PW == 1 -+#define pixel uint8_t -+#define FUNC(f) STRCAT(f, 8) -+#elif PW == 2 -+#define pixel uint16_t -+#define FUNC(f) STRCAT(f, 16) -+#else -+#error Unexpected PW -+#endif -+ -+// Fetches a single patch - offscreen fixup not done here -+// w <= stride1 -+// unclipped -+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x; -+ const unsigned int w = _w; -+ const unsigned int mask = stride1 - 1; -+ -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { -+ memcpy(dst, p, w); -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const uint8_t * p = p2; -+ uint8_t * d = dst; -+ memcpy(d, p1, w1); -+ d += w1; -+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { -+ memcpy(d, p, stride1); -+ } -+ memcpy(d, p, w3); -+ } -+ } -+} -+ -+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) -+ -+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x * 2; -+ const unsigned int w = _w * 2; -+ const unsigned int mask = stride1 - 1; -+ -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { -+ pixel * du = (pixel *)dst_u; -+ pixel * dv = (pixel *)dst_v; -+ const pixel * p = (const pixel *)p1; -+ for (unsigned int k = 0; k < w; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const unsigned int sstride_p = (sstride - stride1) / PW; -+ -+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const pixel * p = (const pixel *)p1; -+ pixel * du = (pixel *)dst_u; -+ pixel * dv = (pixel *)dst_v; -+ for (unsigned int k = 0; k < w1; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { -+ for (unsigned int k = 0; k < stride1; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ for (unsigned int k = 
0; k < w3; k += 2 * PW) { -+ *du++ = *p++; -+ *dv++ = *p++; -+ } -+ } -+ } -+} -+ -+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x = _x * 2; -+ const unsigned int w = _w * 2; -+ const unsigned int mask = stride1 - 1; -+ if ((x & ~mask) == ((x + w) & ~mask)) { -+ // All in one sand stripe -+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { -+ const pixel * su = (const pixel *)src_u; -+ const pixel * sv = (const pixel *)src_v; -+ pixel * p = (pixel *)p1; -+ for (unsigned int k = 0; k < w; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ } -+ else -+ { -+ // Two+ stripe -+ const unsigned int sstride = stride1 * stride2; -+ const unsigned int sstride_p = (sstride - stride1) / PW; -+ -+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; -+ const uint8_t * p2 = p1 + sstride - (x & mask); -+ const unsigned int w1 = stride1 - (x & mask); -+ const unsigned int w3 = (x + w) & mask; -+ const unsigned int w2 = w - (w1 + w3); -+ -+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { -+ unsigned int j; -+ const pixel * su = (const pixel *)src_u; -+ const pixel * sv = (const pixel *)src_v; -+ pixel * p = (pixel *)p1; -+ for (unsigned int k = 0; k < w1; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { -+ for (unsigned int k = 0; k < stride1; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ for (unsigned int k = 0; k < w3; k += 2 * PW) { -+ *p++ = *su++; -+ *p++ = *sv++; -+ } -+ } -+ } -+} -+ -+ -+#undef pixel -+#undef STRCAT -+#undef FUNC -+ -diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c -new file mode 100644 -index 0000000000..5651727fb3 ---- /dev/null -+++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,335 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
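Every multi-stripe copy loop in rpi_sand_fn_pw.h above decomposes a horizontal run the same way: a head filling the rest of the first stripe (w1), zero or more whole stripes (w2), and a tail into the last stripe (w3). That decomposition in isolation (assuming the run crosses at least one stripe boundary, as the enclosing single-stripe check guarantees):

    /* Split a byte run [x, x + w) over stripes of width stride1
     * (a power of two), per the w1/w2/w3 variables used above. */
    static void split_run(unsigned int x, unsigned int w, unsigned int stride1,
                          unsigned int *w1, unsigned int *w2, unsigned int *w3)
    {
        const unsigned int mask = stride1 - 1;
        *w1 = stride1 - (x & mask);   /* to the end of the first stripe */
        *w3 = (x + w) & mask;         /* into the final stripe */
        *w2 = w - (*w1 + *w3);        /* whole stripes in between */
    }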
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+#include "config.h" -+#include -+#include -+#include "rpi_sand_fns.h" -+#include "avassert.h" -+#include "frame.h" -+ -+#define PW 1 -+#include "rpi_sand_fn_pw.h" -+#undef PW -+ -+#define PW 2 -+#include "rpi_sand_fn_pw.h" -+#undef PW -+ -+#if HAVE_NEON -+void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines); -+#endif -+ -+#if 1 -+// Simple round -+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) -+{ -+ const unsigned int rnd = (1 << shr) >> 1; -+ const uint16_t * src = (const uint16_t *)_src; -+ -+ for (; n != 0; --n) { -+ *dst++ = (*src++ + rnd) >> shr; -+ } -+} -+#else -+// Dithered variation -+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) -+{ -+ unsigned int rnd = (1 << shr) >> 1; -+ const unsigned int mask = ((1 << shr) - 1); -+ const uint16_t * src = (const uint16_t *)_src; -+ -+ for (; n != 0; --n) { -+ rnd = *src++ + (rnd & mask); -+ *dst++ = rnd >> shr; -+ } -+} -+#endif -+ -+// Fetches a single patch - offscreen fixup not done here -+// w <= stride1 -+// unclipped -+// _x & _w in pixels, strides in bytes -+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word -+ const unsigned int xskip0 = _x - (x0 >> 2) * 3; -+ const unsigned int x1 = ((_x + _w) / 3) * 4; -+ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; -+ const unsigned int mask = stride1 - 1; -+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; -+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -+ -+ if (x0 == x1) { -+ // ******************* -+ // Partial single word xfer -+ return; -+ } -+ -+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) -+ { -+ unsigned int x = x0; -+ const uint32_t * p = (const uint32_t *)p0; -+ uint16_t * d = (uint16_t *)dst; -+ -+ if (xskip0 != 0) { -+ const uint32_t p3 = *p++; -+ -+ if (xskip0 == 1) -+ *d++ = (p3 >> 10) & 0x3ff; -+ *d++ = (p3 >> 20) & 0x3ff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ while (x != x1) { -+ const uint32_t p3 = *p++; -+ *d++ = p3 & 0x3ff; -+ *d++ = (p3 >> 10) & 0x3ff; -+ *d++ = (p3 >> 20) & 0x3ff; -+ -+ if (((x += 4) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ if (xrem1 != 0) { -+ const uint32_t p3 = *p; -+ -+ *d++ = p3 & 0x3ff; -+ if (xrem1 == 2) -+ *d++ = (p3 >> 10) & 0x3ff; -+ } -+ } -+} -+ -+ -+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h) -+{ -+ const 
unsigned int x0 = (_x / 3) * 8; // Byte offset of the word -+ const unsigned int xskip0 = _x - (x0 >> 3) * 3; -+ const unsigned int x1 = ((_x + _w) / 3) * 8; -+ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; -+ const unsigned int mask = stride1 - 1; -+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; -+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words -+ -+ if (x0 == x1) { -+ // ******************* -+ // Partial single word xfer -+ return; -+ } -+ -+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) -+ { -+ unsigned int x = x0; -+ const uint32_t * p = (const uint32_t *)p0; -+ uint16_t * du = (uint16_t *)dst_u; -+ uint16_t * dv = (uint16_t *)dst_v; -+ -+ if (xskip0 != 0) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ if (xskip0 == 1) -+ { -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = (p3b >> 0) & 0x3ff; -+ } -+ *du++ = (p3b >> 10) & 0x3ff; -+ *dv++ = (p3b >> 20) & 0x3ff; -+ -+ if (((x += 8) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ while (x != x1) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ *du++ = p3a & 0x3ff; -+ *dv++ = (p3a >> 10) & 0x3ff; -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = p3b & 0x3ff; -+ *du++ = (p3b >> 10) & 0x3ff; -+ *dv++ = (p3b >> 20) & 0x3ff; -+ -+ if (((x += 8) & mask) == 0) -+ p += slice_inc; -+ } -+ -+ if (xrem1 != 0) { -+ const uint32_t p3a = *p++; -+ const uint32_t p3b = *p++; -+ -+ *du++ = p3a & 0x3ff; -+ *dv++ = (p3a >> 10) & 0x3ff; -+ if (xrem1 == 2) -+ { -+ *du++ = (p3a >> 20) & 0x3ff; -+ *dv++ = p3b & 0x3ff; -+ } -+ } -+ } -+} -+ -+ -+// w/h in pixels -+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, -+ unsigned int w, unsigned int h, const unsigned int shr) -+{ -+ const unsigned int n = dst_stride1 / 2; -+ unsigned int j; -+ -+ // This is true for our current layouts -+ av_assert0(dst_stride1 == src_stride1); -+ -+ // As we have the same stride1 for src & dest and src is wider than dest -+ // then if we loop on src we can always write contiguously to dest -+ // We make no effort to copy an exact width - round up to nearest src stripe -+ // as we will always have storage in dest for that -+ -+#if HAVE_NEON -+ if (shr == 3 && src_stride1 == 128) { -+ for (j = 0; j + n < w; j += dst_stride1) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ const uint8_t * s2 = s1 + src_stride1 * src_stride2; -+ -+ rpi_sand128b_stripe_to_8_10(d, s1, s2, h); -+ } -+ } -+ else -+#endif -+ { -+ for (j = 0; j + n < w; j += dst_stride1) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ const uint8_t * s2 = s1 + src_stride1 * src_stride2; -+ -+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { -+ cpy16_to_8(d, s1, n, shr); -+ cpy16_to_8(d + n, s2, n, shr); -+ } -+ } -+ } -+ -+ // Fix up a trailing dest half stripe -+ if (j < w) { -+ uint8_t * d = dst + j * dst_stride2; -+ const uint8_t * s1 = src + j * 2 * src_stride2; -+ -+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { -+ cpy16_to_8(d, s1, n, shr); -+ } -+ } -+} -+ -+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) -+{ -+ const int w = av_frame_cropped_width(src); -+ const int h = av_frame_cropped_height(src); -+ 
const int x = src->crop_left; -+ const int y = src->crop_top; -+ -+ // We will crop as part of the conversion -+ dst->crop_top = 0; -+ dst->crop_left = 0; -+ dst->crop_bottom = 0; -+ dst->crop_right = 0; -+ -+ switch (src->format){ -+ case AV_PIX_FMT_SAND128: -+ case AV_PIX_FMT_RPI4_8: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P: -+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w/2, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ case AV_PIX_FMT_SAND64_10: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P10: -+ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x*2, y, w*2, h); -+ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y/2, w, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ case AV_PIX_FMT_RPI4_10: -+ switch (dst->format){ -+ case AV_PIX_FMT_YUV420P10: -+ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], -+ src->data[0], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x, y, w, h); -+ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], -+ dst->data[2], dst->linesize[2], -+ src->data[1], -+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), -+ x/2, y/2, w/2, h/2); -+ break; -+ default: -+ return -1; -+ } -+ break; -+ default: -+ return -1; -+ } -+ -+ return av_frame_copy_props(dst, src); -+} -diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h -new file mode 100644 -index 0000000000..634b55e800 ---- /dev/null -+++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,183 @@ -+/* -+Copyright (c) 2018 Raspberry Pi (Trading) Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in the -+ documentation and/or other materials provided with the distribution. -+ * Neither the name of the copyright holder nor the -+ names of its contributors may be used to endorse or promote products -+ derived from this software without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. 
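av_rpi_sand_to_planar_frame() above requires the destination to arrive with its format, dimensions and buffers already set; cropping is consumed during the copy, so the destination crop fields come back zeroed. A minimal usage sketch (the caller is hypothetical, error handling abbreviated):

    #include "libavutil/frame.h"
    #include "libavutil/rpi_sand_fns.h"

    /* Convert an 8-bit sand frame to planar YUV420P. */
    static AVFrame *unsand_copy(const AVFrame *src)
    {
        AVFrame *dst = av_frame_alloc();
        if (!dst)
            return NULL;
        dst->format = AV_PIX_FMT_YUV420P;
        dst->width  = av_frame_cropped_width(src);
        dst->height = av_frame_cropped_height(src);
        if (av_frame_get_buffer(dst, 0) < 0 ||
            av_rpi_sand_to_planar_frame(dst, src) < 0)
            av_frame_free(&dst);
        return dst;    /* NULL on failure */
    }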
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY -+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+ -+Authors: John Cox -+*/ -+ -+#ifndef AVUTIL_RPI_SAND_FNS -+#define AVUTIL_RPI_SAND_FNS -+ -+#include "libavutil/frame.h" -+ -+// For all these fns _x & _w are measured as coord * PW -+// For the C fns coords are in chroma pels (so luma / 2) -+// Strides are in bytes -+ -+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_planar_to_sand_c8(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_planar_to_sand_c16(uint8_t * dst_c, -+ unsigned int stride1, unsigned int stride2, -+ const uint8_t * src_u, const unsigned int src_stride_u, -+ const uint8_t * src_v, const unsigned int src_stride_v, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, -+ uint8_t * dst_v, const unsigned int dst_stride_v, -+ const uint8_t * src, -+ unsigned int stride1, unsigned int stride2, -+ unsigned int _x, unsigned int y, -+ unsigned int _w, unsigned int h); -+ -+ -+// w/h in pixels -+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, -+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, -+ unsigned int w, unsigned int h, const unsigned int shr); -+ -+ -+// dst must contain required pixel format & allocated data buffers -+// Cropping on the src buffer will be honoured and dst crop will be set to zero -+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); -+ -+ -+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) -+{ -+#ifdef RPI_ZC_SAND128_ONLY -+ // If we 
are sure we only support 128 byte sand formats, replace the -+ // var with a constant which should allow for better optimisation -+ return 128; -+#else -+ return frame->linesize[0]; -+#endif -+} -+ -+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+ -+static inline int av_rpi_is_sand_format(const int format) -+{ -+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); -+} -+ -+static inline int av_rpi_is_sand_frame(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand_format(frame->format); -+} -+ -+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) -+{ -+ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8); -+} -+ -+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) -+{ -+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); -+} -+ -+static inline int av_rpi_is_sand30_frame(const AVFrame * const frame) -+{ -+ return (frame->format == AV_PIX_FMT_RPI4_10); -+} -+ -+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) -+{ -+ return av_rpi_is_sand8_frame(frame) ? 0 : 1; -+} -+ -+// If x is measured in bytes (not pixels) then this works for sand64_16 as -+// well as sand128 - but in the general case we work that out -+ -+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) -+{ -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); -+} -+ -+#endif -+ -diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c -index d0df061e4d..6fb32fac77 100644 ---- a/libswscale/yuv2rgb.c -+++ b/libswscale/yuv2rgb.c -@@ -687,10 +687,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c) - if (t) - return t; - -- av_log(c, AV_LOG_WARNING, -- "No accelerated colorspace conversion found from %s to %s.\n", -- av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat)); -- - switch (c->dstFormat) { - case AV_PIX_FMT_BGR48BE: - case AV_PIX_FMT_BGR48LE: -diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt -new file mode 100644 -index 0000000000..7f16dff6a2 ---- /dev/null -+++ b/pi-util/BUILD.txt -@@ -0,0 +1,29 @@ -+Building Pi FFmpeg -+================== -+ -+Configuration: -+============== -+ -+These instructions work for cross compiles from Ubuntu 16.04 & Ubuntu -+18.04. 
I would expect most other Linux environments to work but I haven't -+tried them. -+ -+pi-util/conf_pi2.sh -+ -+contains suitable options to build the code for Pi2/3. It expects to find -+git clones of -+ -+https://github.com/raspberrypi/tools -+https://github.com/raspberrypi/firmware -+ -+in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a -+lot of history you don't want. -+ -+If you have a copy of qasm.py in ../local/bin then the .qasm sources will be -+rebuilt. Otherwise the prebuilt .c & .h files will be used. -+Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild. -+ -+pi-util/conf_pi1.sh should configure for Pi1. Beware that as of this time -+H265 QPU acceleration is broken on Pi1 and so it is disabled. -+ -+ -diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt -new file mode 100644 -index 0000000000..fcce72226a ---- /dev/null -+++ b/pi-util/NOTES.txt -@@ -0,0 +1,69 @@ -+Notes on the hevc_rpi decoder & associated support code -+------------------------------------------------------- -+ -+There are 3 main parts to the existing code: -+ -+1) The decoder - this is all in libavcodec as rpi_hevc*. -+ -+2) A few filters to deal with Sand frames and a small patch to -+automatically select the sand->i420 converter when required. -+ -+3) A kludge in ffmpeg.c to display the decoded video. This could & should -+be converted into a proper ffmpeg display module. -+ -+ -+Decoder -+------- -+ -+The decoder is a modified version of the existing ffmpeg hevc decoder. -+Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder. -+More complex bitstreams can be up to ~200% faster but particularly easy -+streams can cut its advantage down to ~50%. This means that a Pi3+ can -+display nearly all 8-bit 1080p30 streams and with some overclocking it can -+display most lower bitrate 10-bit 1080p30 streams - this latter case is -+not helped by the requirement to downsample to 8-bit before display on a -+Pi. -+ -+It has had co-processor offload added for inter-pred and large block -+residual transform. Various parts have had optimized ARM NEON assembler -+added and the existing ARM asm sections have been profiled and -+re-optimized for A53. The main C code has been substantially reworked at -+its lower levels in an attempt to optimize it and minimize memory -+bandwidth. To some extent code paths that deal with frame types that it -+doesn't support have been pruned. -+ -+It outputs frames in Broadcom Sand format. This is a somewhat annoying -+layout that doesn't fit into ffmpeg's standard frame descriptions. It has -+vertical stripes of 128 horizontal pixels (64 in 10 bit forms) with Y for -+the stripe followed by interleaved U & V, which is then followed by the Y -+for the next stripe, etc. The final stripe is always padded to -+stripe-width (a short sketch of the addressing arithmetic is given at the -+end of this section). This is used in an attempt to help with cache -+locality and cut down on the number of DRAM bank switches. It is annoying -+to use for inter-pred with conventional processing, but the way the Pi QPU -+(which is used for inter-pred) works means that it has negligible -+downsides here and the improved memory performance exceeds the overhead -+of the increased complexity in the rest of the code. -+ -+Frames must be allocated out of GPU memory (as otherwise they can't be -+accessed by the co-processors). Utility functions (in rpi_zc.c) have been -+written to make this easier. As the frames are already in GPU memory they -+can be displayed by the Pi h/w without any further copying. 
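-+
-+As a worked illustration of the stripe addressing (a minimal sketch only -
-+the real code is av_rpi_sand_frame_off_y() in libavutil/rpi_sand_fns.h and
-+the helper name here is made up), the byte offset of an 8-bit luma sample
-+at (x, y), where stride1 is the stripe width in bytes (a power of two, 128
-+here) and stride2 is the stripe height in rows, works out as:
-+
-+    static unsigned int sand_off_y8(unsigned int x, unsigned int y,
-+                                    unsigned int stride1, unsigned int stride2)
-+    {
-+        const unsigned int x1 = x & (stride1 - 1); // column within this stripe
-+        const unsigned int x2 = x ^ x1;            // byte column of stripe start
-+        // Each stripe occupies stride1 * stride2 bytes, so the stripe base is
-+        // x2 * stride2; add the row offset within the stripe and the column.
-+        return x1 + stride1 * y + stride2 * x2;
-+    }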
-+ -+ -+Known non-features -+------------------ -+ -+Frame allocation should probably be done in some other way in order to fit -+into the standard framework better. -+ -+Sand frames are currently declared as software frames; there is an -+argument that they should be hardware frames, but they aren't really. -+ -+There must be a better way of auto-selecting the hevc_rpi decoder over the -+normal s/w hevc decoder, but I became confused by the existing h/w -+acceleration framework and what I wanted to do didn't seem to fit in -+neatly. -+ -+Display should be a proper device rather than a kludge in ffmpeg.c. -+ -+ -diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv -new file mode 100644 -index 0000000000..3e90f6893f ---- /dev/null -+++ b/pi-util/conf_h265.2016.csv -@@ -0,0 +1,195 @@ -+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 -+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 -+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 -+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 -+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 -+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 -+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 -+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 -+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 -+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 -+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 -+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 -+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 -+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 -+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 -+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 -+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 -+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 -+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 -+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 -+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 -+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 -+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 -+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 -+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 -+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 -+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 -+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 -+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 -+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 -+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 -+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 
-+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 -+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 -+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 -+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 -+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 -+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 -+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 -+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 -+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 -+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 -+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 -+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 -+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 -+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 -+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 -+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 -+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 -+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 -+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 -+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 -+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 -+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 -+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 -+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 -+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 -+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 -+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 -+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 -+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 -+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 -+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 -+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 -+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 -+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 -+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 -+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 -+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 -+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 -+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 -+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 -+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 -+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 -+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 -+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 -+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 -+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 -+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 
-+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 -+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 -+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 -+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 -+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 -+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 -+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 -+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 -+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 -+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 -+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 -+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 -+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 -+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 -+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 -+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 -+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 -+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 -+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 -+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 -+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 -+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 -+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt -+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt -+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 -+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 -+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 -+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 -+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 -+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 -+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 -+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 -+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 -+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 -+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 -+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 -+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 -+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth -+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 -+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
-+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 -+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 -+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 -+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 -+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 -+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 -+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 -+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt -+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt -+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt -+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt -+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt -+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt -+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 -+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 -+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 
-+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 -+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 -+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 -+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 -+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 -+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 -+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 -+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 -+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 -+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 -+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 -+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 -+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 -+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 -+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt -+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt -+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 -+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 -+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed -+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 -+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 -+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 -+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 -+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 -+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 -+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 -+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 -+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 -+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 -+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5 -+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5 -diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv -new file mode 100644 -index 0000000000..6082641271 ---- /dev/null -+++ b/pi-util/conf_h265.2016_HEVC_v1.csv -@@ -0,0 +1,147 @@ -+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 -+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 -+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 -+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 -+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 -+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 
-+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 -+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 -+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 -+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 -+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 -+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 -+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 -+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 -+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 -+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 -+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 -+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 -+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 -+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 -+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 -+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 -+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 -+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 -+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 -+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 -+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 -+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 -+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 -+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 -+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 -+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 -+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 -+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 -+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 -+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 -+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 -+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 -+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 -+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 -+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 -+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 -+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 -+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 -+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 -+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 -+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 -+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 -+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 -+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 -+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 -+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 -+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 -+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 -+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 -+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 -+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 -+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 -+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 -+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 -+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 -+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 
-+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 -+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 -+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 -+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 -+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 -+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 -+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 -+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 -+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 -+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 -+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 -+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 -+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 -+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 -+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 -+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 -+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 -+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 -+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 -+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 -+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 -+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 -+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 -+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 -+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 -+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 -+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 -+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 -+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 -+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 -+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 -+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 -+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 -+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 -+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 -+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 -+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 -+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 -+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 -+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 -+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 -+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 -+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 -+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt -+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt -+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 -+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 -+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 -+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 -+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 -+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 -+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 
-+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 -+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 -+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 -+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 -+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 -+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 -+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth -+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 -+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? -+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 -+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 -+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 -+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 -+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 -+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 -+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 -+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 -+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 -+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 -+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 -+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 -+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 -+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 -+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 -+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv -new file mode 100644 -index 0000000000..fc14f2a3c2 ---- /dev/null -+++ b/pi-util/conf_h265.csv -@@ -0,0 +1,144 @@ -+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 -+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 -+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 -+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 -+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 -+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 -+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 -+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 -+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 -+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 -+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 -+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 -+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 -+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 -+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 -+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 -+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 -+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 
-+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 -+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 -+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 -+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5 -+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 -+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 -+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 -+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 -+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 -+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 -+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 -+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 -+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 -+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 -+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 -+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 -+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 -+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 -+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 -+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 -+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 -+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 -+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 -+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 -+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 -+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 -+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 -+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 -+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 -+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 -+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 -+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 -+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 -+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 -+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 -+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5 -+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5 -+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5 -+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 -+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 -+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 -+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 -+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 -+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 -+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 -+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 -+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 -+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 -+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 -+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 -+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 -+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 -+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 -+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 
-+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 -+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 -+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 -+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 -+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 -+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 -+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 -+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 -+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 -+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 -+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 -+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 -+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 -+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 -+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 -+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 -+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 -+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 -+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 -+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 -+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 -+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 -+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 -+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 -+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 -+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 -+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 -+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 -+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 -+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 -+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 -+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 -+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 -+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 -+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 -+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5 -+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 -+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 -+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 -+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 -+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 -+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 -+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 -+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 -+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 -+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 -+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched -+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 -+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 -+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 -+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 -+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 -+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 
-+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 -+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 -+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 -+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 -+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 -+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 -+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 -+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 -+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 -+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 -+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh -new file mode 100755 -index 0000000000..681f001fa7 ---- /dev/null -+++ b/pi-util/conf_native.sh -@@ -0,0 +1,41 @@ -+echo "Configure for native build" -+ -+RPI_OPT_VC=/opt/vc -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_OPT_VC/lib" -+RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+USR_PREFIX=`pwd`/install -+LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf -+INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf -+ -+./configure \ -+ --prefix=$USR_PREFIX\ -+ --libdir=$LIB_PREFIX\ -+ --incdir=$INC_PREFIX\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --enable-rpi\ -+ --enable-v4l2-request\ -+ --enable-libdrm\ -+ --enable-libudev\ -+ --enable-vout-drm-kludge\ -+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ -+# --enable-shared\ -+ -+# --enable-decoder=hevc_rpi\ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh -new file mode 100755 -index 0000000000..400e7adcbf ---- /dev/null -+++ b/pi-util/conf_pi1.sh -@@ -0,0 +1,31 @@ -+echo "Configure for Pi1" -+ -+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc -+ -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+./configure --enable-cross-compile\ -+ --cpu=arm1176jzf-s\ -+ --arch=arm\ -+ --disable-neon\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --enable-mmal\ -+ --enable-shared\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# 
--enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh -new file mode 100755 -index 0000000000..70acedd5cb ---- /dev/null -+++ b/pi-util/conf_pi2.sh -@@ -0,0 +1,44 @@ -+echo "Configure for Pi2/3" -+ -+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc -+ -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" -+RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+USR_PREFIX=`pwd`/install -+LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf -+INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf -+ -+./configure --enable-cross-compile\ -+ --prefix=$USR_PREFIX\ -+ --libdir=$LIB_PREFIX\ -+ --incdir=$INC_PREFIX\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --enable-rpi\ -+ --enable-v4l2-request\ -+ --enable-libdrm\ -+ --enable-libudev\ -+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+# --enable-shared\ -+ -+# --enable-decoder=hevc_rpi\ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py -new file mode 100755 -index 0000000000..e6963fbb4a ---- /dev/null -+++ b/pi-util/ffconf.py -@@ -0,0 +1,195 @@ -+#!/usr/bin/env python -+ -+import string -+import os -+import subprocess -+import re -+import argparse -+import sys -+import csv -+from stat import * -+ -+ffmpeg_exec = "./ffmpeg" -+ -+CODEC_HEVC_RPI = 1 -+HWACCEL_RPI = 2 -+HWACCEL_DRM = 3 -+ -+def testone(fileroot, srcname, es_file, md5_file, dectype, vcodec): -+ tmp_root = "/tmp" -+ -+ names = srcname.split('/') -+ while len(names) > 1: -+ tmp_root = os.path.join(tmp_root, names[0]) -+ del names[0] -+ name = names[0] -+ -+ if not os.path.exists(tmp_root): -+ os.makedirs(tmp_root) -+ -+ dec_file = os.path.join(tmp_root, name + ".dec.md5") -+ try: -+ os.remove(dec_file) -+ except: -+ pass -+ -+ flog = open(os.path.join(tmp_root, name + ".log"), "wt") -+ -+ # Unaligned needed for cropping conformance -+ if dectype == HWACCEL_RPI: -+ rstr = subprocess.call( -+ [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", "rpi", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file], -+ stdout=flog, stderr=subprocess.STDOUT) -+ elif dectype == HWACCEL_DRM: -+ rstr = subprocess.call( -+ [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", "drm", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file], -+ stdout=flog, stderr=subprocess.STDOUT) -+ else: -+ rstr = subprocess.call( -+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file], -+ stdout=flog, stderr=subprocess.STDOUT) -+ -+ try: -+ m1 = None -+ m2 = None -+ with open(os.path.join(fileroot, md5_file)) as f: -+ for line in f: -+ m1 = re.search("[0-9a-f]{32}", line.lower()) -+ if 
m1: -+ break -+ -+ with open(dec_file) as f: -+ m2 = re.search("[0-9a-f]{32}", f.readline()) -+ except: -+ pass -+ -+ if m1 and m2 and m1.group() == m2.group(): -+ print >> flog, "Match: " + m1.group() -+ rv = 0 -+ elif not m1: -+ print >> flog, "****** Cannot find m1" -+ rv = 3 -+ elif not m2: -+ print >> flog, "****** Cannot find m2" -+ rv = 2 -+ else: -+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group() -+ rv = 1 -+ flog.close() -+ return rv -+ -+def scandir(root): -+ aconf = [] -+ ents = os.listdir(root) -+ ents.sort(key=str.lower) -+ for name in ents: -+ test_path = os.path.join(root, name) -+ if S_ISDIR(os.stat(test_path).st_mode): -+ files = os.listdir(test_path) -+ es_file = "?" -+ md5_file = "?" -+ for f in files: -+ (base, ext) = os.path.splitext(f) -+ if base[0] == '.': -+ pass -+ elif ext == ".bit" or ext == ".bin": -+ es_file = f -+ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): -+ if md5_file == "?": -+ md5_file = f -+ elif base[-3:] == "yuv": -+ md5_file = f -+ aconf.append((1, name, es_file, md5_file)) -+ return aconf -+ -+def runtest(name, tests): -+ if not tests: -+ return True -+ for t in tests: -+ if name[0:len(t)] == t or name.find("/" + t) != -1: -+ return True -+ return False -+ -+def doconf(csva, tests, test_root, vcodec, dectype): -+ unx_failures = [] -+ unx_success = [] -+ failures = 0 -+ successes = 0 -+ for a in csva: -+ exp_test = int(a[0]) -+ if (exp_test and runtest(a[1], tests)): -+ name = a[1] -+ print "==== ", name, -+ sys.stdout.flush() -+ -+ rv = testone(os.path.join(test_root, name), name, a[2], a[3], dectype=dectype, vcodec=vcodec) -+ if (rv == 0): -+ successes += 1 -+ else: -+ failures += 1 -+ -+ if (rv == 0): -+ if exp_test == 2: -+ print ": * OK *" -+ unx_success.append(name) -+ else: -+ print ": ok" -+ elif exp_test == 2 and rv == 1: -+ print ": fail" -+ elif exp_test == 3 and rv == 2: -+ # Call an expected "crash" an abort -+ print ": abort" -+ else: -+ unx_failures.append(name) -+ if rv == 1: -+ print ": * FAIL *" -+ elif (rv == 2) : -+ print ": * CRASH *" -+ elif (rv == 3) : -+ print ": * MD5 MISSING *" -+ else : -+ print ": * BANG *" -+ -+ if unx_failures or unx_success: -+ print "Unexpected Failures:", unx_failures -+ print "Unexpected Success: ", unx_success -+ else: -+ print "All tests normal:", successes, "ok,", failures, "failed" -+ -+ -+class ConfCSVDialect(csv.Dialect): -+ delimiter = ',' -+ doublequote = True -+ lineterminator = '\n' -+ quotechar='"' -+ quoting = csv.QUOTE_MINIMAL -+ skipinitialspace = True -+ strict = True -+ -+if __name__ == '__main__': -+ -+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") -+ argp.add_argument("tests", nargs='*') -+ argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line") -+ argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line") -+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") -+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") -+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") -+ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use") -+ args = argp.parse_args() -+ -+ if args.csvgen: -+ csv.writer(sys.stdout).writerows(scandir(args.test_root)) -+ exit(0) -+ -+ with open(args.csv, 'rt') as csvfile: -+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] -+ -+ dectype = CODEC_HEVC_RPI -+ if args.pi4 or 
os.path.exists("/dev/rpivid-hevcmem"): -+ dectype = HWACCEL_RPI -+ elif args.drm or os.path.exists("/sys/module/rpivid_v4l2"): -+ dectype = HWACCEL_DRM -+ -+ doconf(csva, args.tests, args.test_root, args.vcodec, dectype) -+ -diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py -new file mode 100755 -index 0000000000..2fabe98c32 ---- /dev/null -+++ b/pi-util/ffperf.py -@@ -0,0 +1,127 @@ -+#!/usr/bin/env python3 -+ -+import time -+import string -+import os -+import tempfile -+import subprocess -+import re -+import argparse -+import sys -+import csv -+from stat import * -+ -+class tstats: -+ close_threshold = 0.01 -+ -+ def __init__(self, stats_dict=None): -+ if stats_dict != None: -+ self.name = stats_dict["name"] -+ self.elapsed = float(stats_dict["elapsed"]) -+ self.user = float(stats_dict["user"]) -+ self.sys = float(stats_dict["sys"]) -+ -+ def times_str(self): -+ ctime = self.sys + self.user -+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) -+ -+ def dict(self): -+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} -+ -+ def is_close(self, other): -+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold -+ -+ def __lt__(self, other): -+ return self.elapsed < other.elapsed -+ def __gt__(self, other): -+ return self.elapsed > other.elapsed -+ -+ def time_file(name, prefix): -+ stats = tstats() -+ stats.name = name -+ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); -+ cproc = subprocess.Popen(["./ffmpeg", -+ "-hwaccel", "rpi", -+ "-t", "30", "-i", prefix + name, -+ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); -+ pinfo = os.wait4(cproc.pid, 0) -+ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); -+ stats.elapsed = end_time - start_time -+ stats.user = pinfo[2].ru_utime -+ stats.sys = pinfo[2].ru_stime -+ return stats -+ -+ -+def common_prefix(s1, s2): -+ for i in range(min(len(s1),len(s2))): -+ if s1[i] != s2[i]: -+ return s1[:i] -+ return s1[:i+1] -+ -+def main(): -+ global flog -+ -+ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog=""" -+To blank the screen before starting use "xdg-screensaver activate" -+(For some reason this doesn't seem to work from within python). 
-+""") -+ -+ argp.add_argument("streams", nargs='*') -+ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") -+ argp.add_argument("--csv_in", help="CSV input filename") -+ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") -+ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count") -+ -+ args = argp.parse_args() -+ -+ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) -+ csv_out.writeheader() -+ -+ stats_in = {} -+ if args.csv_in != None: -+ with open(args.csv_in, 'r', newline='') as f_in: -+ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} -+ -+ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") -+ -+ streams = args.streams -+ if not streams: -+ if not stats_in: -+ print ("No source streams specified") -+ return 1 -+ prefix = "" if args.prefix == None else args.prefix -+ streams = [k for k in stats_in] -+ elif args.prefix != None: -+ prefix = args.prefix -+ else: -+ prefix = streams[0] -+ for f in streams[1:]: -+ prefix = common_prefix(prefix, f) -+ pp = prefix.rpartition(os.sep) -+ prefix = pp[0] + pp[1] -+ streams = [s[len(prefix):] for s in streams] -+ -+ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): -+ print ("====", f) -+ -+ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) -+ for i in range(args.repeat): -+ t = tstats.time_file(f, prefix) -+ print ("...", t.times_str()) -+ if t0 > t: -+ t0 = t -+ -+ if t0.name in stats_in: -+ pstat = stats_in[t0.name] -+ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) -+ -+ csv_out.writerow(t0.dict()) -+ -+ print () -+ -+ return 0 -+ -+ -+if __name__ == '__main__': -+ exit(main()) -+ -diff --git a/pi-util/genpatch.sh b/pi-util/genpatch.sh -new file mode 100644 -index 0000000000..0948a68a7a ---- /dev/null -+++ b/pi-util/genpatch.sh -@@ -0,0 +1,35 @@ -+set -e -+ -+NOPATCH= -+if [ "$1" == "--notag" ]; then -+ shift -+ NOPATCH=1 -+fi -+ -+if [ "$1" == "" ]; then -+ echo Usage: $0 [--notag] \ -+ echo e.g.: $0 mmal_4 -+ exit 1 -+fi -+ -+VERSION=`cat RELEASE` -+if [ "$VERSION" == "" ]; then -+ echo Can\'t find version RELEASE -+ exit 1 -+fi -+ -+PATCHFILE=../ffmpeg-$VERSION-$1.patch -+ -+if [ $NOPATCH ]; then -+ echo Not tagged -+else -+ # Only continue if we are all comitted -+ git diff --name-status --exit-code -+ -+ PATCHTAG=pi/$VERSION/$1 -+ echo Tagging: $PATCHTAG -+ -+ git tag $PATCHTAG -+fi -+echo Generating patch: $PATCHFILE -+git diff n$VERSION -- > $PATCHFILE -diff --git a/pi-util/make_array.py b/pi-util/make_array.py -new file mode 100755 -index 0000000000..67b22d2d51 ---- /dev/null -+++ b/pi-util/make_array.py -@@ -0,0 +1,23 @@ -+#!/usr/bin/env python -+ -+# Usage -+# make_array file.bin -+# Produces file.h with array of bytes. 
-+# -+import sys -+for file in sys.argv[1:]: -+ prefix,suffix = file.split('.') -+ assert suffix=='bin' -+ name=prefix.split('/')[-1] -+ print 'Converting',file -+ with open(prefix+'.h','wb') as out: -+ print >>out, 'static const unsigned char',name,'[] = {' -+ with open(file,'rb') as fd: -+ i = 0 -+ for byte in fd.read(): -+ print >>out, '0x%02x, ' % ord(byte), -+ i = i + 1 -+ if i % 8 == 0: -+ print >>out, ' // %04x' % (i - 8) -+ print >>out,'};' -+ -diff --git a/pi-util/mkinst.sh b/pi-util/mkinst.sh -new file mode 100755 -index 0000000000..271a39e846 ---- /dev/null -+++ b/pi-util/mkinst.sh -@@ -0,0 +1,5 @@ -+set -e -+ -+make install -+ -+cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr -diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py -new file mode 100755 -index 0000000000..e44cfa0c3c ---- /dev/null -+++ b/pi-util/perfcmp.py -@@ -0,0 +1,101 @@ -+#!/usr/bin/env python3 -+ -+import time -+import string -+import os -+import tempfile -+import subprocess -+import re -+import argparse -+import sys -+import csv -+from stat import * -+ -+class tstats: -+ close_threshold = 0.01 -+ -+ def __init__(self, stats_dict=None): -+ if stats_dict != None: -+ self.name = stats_dict["name"] -+ self.elapsed = float(stats_dict["elapsed"]) -+ self.user = float(stats_dict["user"]) -+ self.sys = float(stats_dict["sys"]) -+ -+ def times_str(self): -+ ctime = self.sys + self.user -+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) -+ -+ def dict(self): -+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} -+ -+ def is_close(self, other): -+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold -+ -+ def __lt__(self, other): -+ return self.elapsed < other.elapsed -+ def __gt__(self, other): -+ return self.elapsed > other.elapsed -+ -+ def time_file(name, prefix): -+ stats = tstats() -+ stats.name = name -+ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); -+ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, -+ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); -+ pinfo = os.wait4(cproc.pid, 0) -+ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); -+ stats.elapsed = end_time - start_time -+ stats.user = pinfo[2].ru_utime -+ stats.sys = pinfo[2].ru_stime -+ return stats -+ -+ -+def common_prefix(s1, s2): -+ for i in range(min(len(s1),len(s2))): -+ if s1[i] != s2[i]: -+ return s1[:i] -+ return s1[:i+1] -+ -+def main(): -+ argp = argparse.ArgumentParser(description="FFmpeg performance compare") -+ -+ argp.add_argument("stream0", help="CSV to compare") -+ argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare") -+ -+ args = argp.parse_args() -+ -+ with open(args.stream0, 'r', newline='') as f_in: -+ stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} -+ with open(args.stream1, 'r', newline='') as f_in: -+ stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} -+ -+ print (args.stream0, "<<-->>", args.stream1) -+ print () -+ -+ for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()): -+ if not (f in stats0) : -+ print (" XX :", f) -+ continue -+ if not (f in stats1) : -+ print (" XX :", f) -+ continue -+ -+ s0 = stats0[f] -+ s1 = stats1[f] -+ -+ pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0 -+ thresh = 0.3 -+ tc = 6 -+ -+ nchar = min(tc - 1, int(abs(pcent) / thresh)) -+ cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * 
(tc - nchar) + ">" * nchar -+ -+ print ("%6.2f %s%6.2f (%+5.2f) : %s" % -+ (s0.elapsed, cc, s1.elapsed, pcent, f)) -+ -+ return 0 -+ -+ -+if __name__ == '__main__': -+ exit(main()) -+ -diff --git a/pi-util/qem.sh b/pi-util/qem.sh -new file mode 100755 -index 0000000000..a4dbb6eacd ---- /dev/null -+++ b/pi-util/qem.sh -@@ -0,0 +1,9 @@ -+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex -+QASM=python\ ../local/bin/qasm.py -+SRC_FILE=libavcodec/rpi_hevc_shader.qasm -+DST_BASE=shader -+ -+cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR -+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c -+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h -+ -diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py -new file mode 100755 -index 0000000000..5935a11ca5 ---- /dev/null -+++ b/pi-util/v3dusage.py -@@ -0,0 +1,128 @@ -+#!/usr/bin/env python -+ -+import sys -+import argparse -+import re -+ -+def do_logparse(logname): -+ -+ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ') -+ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$') -+ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$') -+ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$') -+ -+ ttotal = {'idle':0.0} -+ tstart = {} -+ qctotal = {} -+ qtstotal = {} -+ l2hits = {} -+ l2total = {} -+ time0 = None -+ idle_start = None -+ qpu_op_no = 0 -+ op_count = 0 -+ -+ with open(logname, "rt") as infile: -+ for line in infile: -+ match = rmatch.match(line) -+ if match: -+# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":" -+ time = float(match.group(1)) -+ unit = match.group(3) -+ opstart = not match.group(2) -+ optype = match.group(7) -+ hascb = match.group(8) != "0" -+ -+ if unit == 'qpu1': -+ unit = unit + "." + str(qpu_op_no) -+ if not opstart: -+ if hascb or optype == 'EXECUTE_SYNC': -+ qpu_op_no = 0 -+ else: -+ qpu_op_no += 1 -+ -+ # Ignore sync type -+ if optype == 'EXECUTE_SYNC': -+ continue -+ -+ if not time0: -+ time0 = time -+ -+ if opstart: -+ tstart[unit] = time; -+ elif unit in tstart: -+ op_count += 1 -+ if not unit in ttotal: -+ ttotal[unit] = 0.0 -+ ttotal[unit] += time - tstart[unit] -+ del tstart[unit] -+ -+ if not idle_start and not tstart: -+ idle_start = time -+ elif idle_start and tstart: -+ ttotal['idle'] += time - idle_start -+ idle_start = None -+ -+ match = rqcycle.match(line) -+ if match: -+ unit = "qpu1." + str(qpu_op_no) -+ if not unit in qctotal: -+ qctotal[unit] = 0 -+ qctotal[unit] += int(match.group(2)) -+ -+ match = rqtscycle.match(line) -+ if match: -+ unit = "qpu1." + str(qpu_op_no) -+ if not unit in qtstotal: -+ qtstotal[unit] = 0 -+ qtstotal[unit] += int(match.group(2)) -+ -+ match = rl2hits.match(line) -+ if match: -+ unit = "qpu1." 
+ str(qpu_op_no) -+ if not unit in l2total: -+ l2total[unit] = 0 -+ l2hits[unit] = 0 -+ l2total[unit] += int(match.group(3)) -+ if match.group(2) == "hits": -+ l2hits[unit] += int(match.group(3)) -+ -+ -+ if not time0: -+ print "No v3d profile records found" -+ else: -+ tlogged = time - time0 -+ -+ print "Logged time:", tlogged, " Op count:", op_count -+ for unit in sorted(ttotal): -+ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged) -+ print -+ for unit in sorted(qctotal): -+ if not unit in qtstotal: -+ qtstotal[unit] = 0; -+ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit]) -+ if unit in l2total: -+ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit]) -+ -+ -+ -+if __name__ == '__main__': -+ argp = argparse.ArgumentParser( -+ formatter_class=argparse.RawDescriptionHelpFormatter, -+ description="QPU/VPU perf summary from VC logging", -+ epilog = """ -+Will also summarise TMU stalls if logging requests set in qpu noflush param -+in the profiled code. -+ -+Example use: -+ vcgencmd set_logging level=0xc0 -+ -+ sudo vcdbg log msg >& t.log -+ v3dusage.py t.log -+""") -+ -+ argp.add_argument("logfile") -+ args = argp.parse_args() -+ -+ do_logparse(args.logfile) -+ diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0001-libavcodec-v4l2m2m-fix-indentation-and-add-M2MDEC_CL.patch b/packages/multimedia/ffmpeg/patches/v4l2/0001-libavcodec-v4l2m2m-fix-indentation-and-add-M2MDEC_CL.patch deleted file mode 100644 index 4ef44cdcac..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0001-libavcodec-v4l2m2m-fix-indentation-and-add-M2MDEC_CL.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 7134be3260ca6b885aa20447a06d35cab380a09e Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Tue, 24 Apr 2018 22:48:23 -0700 -Subject: [PATCH 01/14] libavcodec: v4l2m2m: fix indentation and add - M2MDEC_CLASS - -This just makes the M2MDEC_CLASS similar to how it is done in rkmpp. 
It looks -clean and has proper indentation ---- - libavcodec/v4l2_m2m_dec.c | 46 ++++++++++++++++++++------------------- - 1 file changed, 24 insertions(+), 22 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index d0601f0e2f..e1b6925771 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -209,29 +209,31 @@ static const AVOption options[] = { - { NULL}, - }; - -+#define M2MDEC_CLASS(NAME) \ -+ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ -+ .class_name = #NAME "_v4l2_m2m_decoder", \ -+ .item_name = av_default_item_name, \ -+ .option = options, \ -+ .version = LIBAVUTIL_VERSION_INT, \ -+ }; -+ - #define M2MDEC(NAME, LONGNAME, CODEC, bsf_name) \ --static const AVClass v4l2_m2m_ ## NAME ## _dec_class = {\ -- .class_name = #NAME "_v4l2_m2m_decoder",\ -- .item_name = av_default_item_name,\ -- .option = options,\ -- .version = LIBAVUTIL_VERSION_INT,\ --};\ --\ --AVCodec ff_ ## NAME ## _v4l2m2m_decoder = { \ -- .name = #NAME "_v4l2m2m" ,\ -- .long_name = NULL_IF_CONFIG_SMALL("V4L2 mem2mem " LONGNAME " decoder wrapper"),\ -- .type = AVMEDIA_TYPE_VIDEO,\ -- .id = CODEC ,\ -- .priv_data_size = sizeof(V4L2m2mPriv),\ -- .priv_class = &v4l2_m2m_ ## NAME ## _dec_class,\ -- .init = v4l2_decode_init,\ -- .receive_frame = v4l2_receive_frame,\ -- .close = ff_v4l2_m2m_codec_end,\ -- .bsfs = bsf_name, \ -- .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | \ -- AV_CODEC_CAP_AVOID_PROBING, \ -- .wrapper_name = "v4l2m2m", \ --}; -+ M2MDEC_CLASS(NAME) \ -+ AVCodec ff_ ## NAME ## _v4l2m2m_decoder = { \ -+ .name = #NAME "_v4l2m2m" , \ -+ .long_name = NULL_IF_CONFIG_SMALL("V4L2 mem2mem " LONGNAME " decoder wrapper"), \ -+ .type = AVMEDIA_TYPE_VIDEO, \ -+ .id = CODEC , \ -+ .priv_data_size = sizeof(V4L2m2mPriv), \ -+ .priv_class = &v4l2_m2m_ ## NAME ## _dec_class, \ -+ .init = v4l2_decode_init, \ -+ .receive_frame = v4l2_receive_frame, \ -+ .close = ff_v4l2_m2m_codec_end, \ -+ .bsfs = bsf_name, \ -+ .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY, \ -+ AV_CODEC_CAP_AVOID_PROBING, \ -+ .wrapper_name = "v4l2m2m", \ -+ }; - - M2MDEC(h264, "H.264", AV_CODEC_ID_H264, "h264_mp4toannexb"); - M2MDEC(hevc, "HEVC", AV_CODEC_ID_HEVC, "hevc_mp4toannexb"); --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0002-libavcodec-v4l2m2m-output-AVDRMFrameDescriptor.patch b/packages/multimedia/ffmpeg/patches/v4l2/0002-libavcodec-v4l2m2m-output-AVDRMFrameDescriptor.patch deleted file mode 100644 index 01aa1da7e8..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0002-libavcodec-v4l2m2m-output-AVDRMFrameDescriptor.patch +++ /dev/null @@ -1,511 +0,0 @@ -From bf9fb2d576488ba08832e7cb7b10fe05a08665a5 Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Tue, 24 Apr 2018 23:00:23 -0700 -Subject: [PATCH 02/14] libavcodec: v4l2m2m: output AVDRMFrameDescriptor - -This allows for a zero-copy output by exporting the v4l2 buffer then wrapping that buffer -in the AVDRMFrameDescriptor like it is done in rkmpp. - -This has been in use for quite some time with great success on many platforms including: - - Amlogic S905 - - Raspberry Pi - - i.MX6 - - Dragonboard 410c - -This was developed in conjunction with Kodi to allow handling the zero-copy buffer rendering. 
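-(Illustrative aside, assuming the standard AVDRMFrameDescriptor layout: a
-client consuming these frames reads the descriptor from frame->data[0], e.g.
-
-    AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0];
-    int fd = desc->objects[0].fd; /* dma-buf fd to import for display */
-
-and imports that dma-buf fd for zero-copy rendering.)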
-A simply utility for testing is also available here: https://github.com/BayLibre/ffmpeg-drm - -todo: - - allow selecting pixel format output from decoder - - allow configuring amount of output and capture buffers - -V2: - - allow selecting AV_PIX_FMT_DRM_PRIME - -V3: - - use get_format to select AV_PIX_FMT_DRM_PRIME - - use hw_configs - - add handling of AV_PIX_FMT_YUV420P format (for raspberry pi) - - add handling of AV_PIX_FMT_YUYV422 format (for i.MX6 coda decoder) ---- - libavcodec/v4l2_buffers.c | 216 ++++++++++++++++++++++++++++++++------ - libavcodec/v4l2_buffers.h | 4 + - libavcodec/v4l2_context.c | 40 ++++++- - libavcodec/v4l2_m2m.c | 4 +- - libavcodec/v4l2_m2m.h | 3 + - libavcodec/v4l2_m2m_dec.c | 23 ++++ - 6 files changed, 253 insertions(+), 37 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index aef911f3bb..e5c46ac81e 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -21,6 +21,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#include - #include - #include - #include -@@ -29,6 +30,7 @@ - #include - #include "libavcodec/avcodec.h" - #include "libavcodec/internal.h" -+#include "libavutil/hwcontext.h" - #include "v4l2_context.h" - #include "v4l2_buffers.h" - #include "v4l2_m2m.h" -@@ -203,7 +205,79 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) - return AVCOL_TRC_UNSPECIFIED; - } - --static void v4l2_free_buffer(void *opaque, uint8_t *unused) -+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) -+{ -+ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; -+ AVDRMLayerDescriptor *layer; -+ -+ /* fill the DRM frame descriptor */ -+ drm_desc->nb_objects = avbuf->num_planes; -+ drm_desc->nb_layers = 1; -+ -+ layer = &drm_desc->layers[0]; -+ layer->nb_planes = avbuf->num_planes; -+ -+ for (int i = 0; i < avbuf->num_planes; i++) { -+ layer->planes[i].object_index = i; -+ layer->planes[i].offset = 0; -+ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; -+ } -+ -+ switch (avbuf->context->av_pix_fmt) { -+ case AV_PIX_FMT_YUYV422: -+ -+ layer->format = DRM_FORMAT_YUYV; -+ layer->nb_planes = 1; -+ -+ break; -+ -+ case AV_PIX_FMT_NV12: -+ case AV_PIX_FMT_NV21: -+ -+ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? 
-+ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 2; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; -+ break; -+ -+ case AV_PIX_FMT_YUV420P: -+ -+ layer->format = DRM_FORMAT_YUV420; -+ -+ if (avbuf->num_planes > 1) -+ break; -+ -+ layer->nb_planes = 3; -+ -+ layer->planes[1].object_index = 0; -+ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; -+ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ -+ layer->planes[2].object_index = 0; -+ layer->planes[2].offset = layer->planes[1].offset + -+ ((avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height) >> 2); -+ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; -+ break; -+ -+ default: -+ drm_desc->nb_layers = 0; -+ break; -+ } -+ -+ return (uint8_t *) drm_desc; -+} -+ -+static void v4l2_free_buffer(void *opaque, uint8_t *data) - { - V4L2Buffer* avbuf = opaque; - V4L2m2mContext *s = buf_to_m2mctx(avbuf); -@@ -227,27 +301,47 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) - } - } - --static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) -+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) - { -- V4L2m2mContext *s = buf_to_m2mctx(in); -+ struct v4l2_exportbuffer expbuf; -+ int i, ret; - -- if (plane >= in->num_planes) -- return AVERROR(EINVAL); -+ for (i = 0; i < avbuf->num_planes; i++) { -+ memset(&expbuf, 0, sizeof(expbuf)); - -- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */ -- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, -- in->plane_info[plane].length, v4l2_free_buffer, in, 0); -- if (!*buf) -- return AVERROR(ENOMEM); -+ expbuf.index = avbuf->buf.index; -+ expbuf.type = avbuf->buf.type; -+ expbuf.plane = i; -+ -+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); -+ if (ret < 0) -+ return AVERROR(errno); -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { -+ /* drm frame */ -+ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; -+ avbuf->drm_frame.objects[i].fd = expbuf.fd; -+ } else { -+ /* drm frame */ -+ avbuf->drm_frame.objects[0].size = avbuf->buf.length; -+ avbuf->drm_frame.objects[0].fd = expbuf.fd; -+ } -+ } -+ -+ return 0; -+} -+ -+static int v4l2_buf_increase_ref(V4L2Buffer *in) -+{ -+ V4L2m2mContext *s = buf_to_m2mctx(in); - - if (in->context_ref) - atomic_fetch_add(&in->context_refcount, 1); - else { - in->context_ref = av_buffer_ref(s->self_ref); -- if (!in->context_ref) { -- av_buffer_unref(buf); -+ if (!in->context_ref) - return AVERROR(ENOMEM); -- } -+ - in->context_refcount = 1; - } - -@@ -257,6 +351,46 @@ static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) - return 0; - } - -+static int v4l2_buf_to_bufref_drm(V4L2Buffer *in, AVBufferRef **buf) -+{ -+ int ret; -+ -+ *buf = av_buffer_create((uint8_t *) &in->drm_frame, -+ sizeof(in->drm_frame), -+ v4l2_free_buffer, -+ in, AV_BUFFER_FLAG_READONLY); -+ if (!*buf) -+ return AVERROR(ENOMEM); -+ -+ ret = v4l2_buf_increase_ref(in); -+ if (ret) -+ av_buffer_unref(buf); -+ -+ return ret; -+} -+ -+static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) -+{ -+ int ret; -+ -+ if (plane >= in->num_planes) -+ return AVERROR(EINVAL); -+ -+ /* most encoders return 
0 in data_offset but vp8 does require this value */ -+ *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset, -+ in->plane_info[plane].length, -+ v4l2_free_buffer, -+ in, 0); -+ if (!*buf) -+ return AVERROR(ENOMEM); -+ -+ ret = v4l2_buf_increase_ref(in); -+ if (ret) -+ av_buffer_unref(buf); -+ -+ return ret; -+} -+ - static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, AVBufferRef* bref) - { - unsigned int bytesused, length; -@@ -308,31 +442,43 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - - av_frame_unref(frame); - -- /* 1. get references to the actual data */ -- for (i = 0; i < avbuf->num_planes; i++) { -- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ /* 1. get references to the actual data */ -+ ret = v4l2_buf_to_bufref_drm(avbuf, &frame->buf[0]); - if (ret) - return ret; - -- frame->linesize[i] = avbuf->plane_info[i].bytesperline; -- frame->data[i] = frame->buf[i]->data; -- } -+ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); -+ frame->format = AV_PIX_FMT_DRM_PRIME; -+ } else { -+ /* 1. get references to the actual data */ -+ for (i = 0; i < avbuf->num_planes; i++) { -+ ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); -+ if (ret) -+ return ret; -+ -+ frame->linesize[i] = avbuf->plane_info[i].bytesperline; -+ frame->data[i] = frame->buf[i]->data; -+ } - -- /* 1.1 fixup special cases */ -- switch (avbuf->context->av_pix_fmt) { -- case AV_PIX_FMT_NV12: -- if (avbuf->num_planes > 1) -+ /* 1.1 fixup special cases */ -+ switch (avbuf->context->av_pix_fmt) { -+ case AV_PIX_FMT_NV12: -+ if (avbuf->num_planes > 1) -+ break; -+ frame->linesize[1] = avbuf->plane_info[0].bytesperline; -+ frame->data[1] = frame->buf[0]->data + -+ avbuf->plane_info[0].bytesperline * -+ avbuf->context->format.fmt.pix.height; - break; -- frame->linesize[1] = avbuf->plane_info[0].bytesperline; -- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height; -- break; -- default: -- break; -+ default: -+ break; -+ } -+ frame->format = avbuf->context->av_pix_fmt; - } - - /* 2. 
get frame information */ - frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); -- frame->format = avbuf->context->av_pix_fmt; - frame->color_primaries = v4l2_get_color_primaries(avbuf); - frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_range = v4l2_get_color_range(avbuf); -@@ -447,9 +593,6 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - - avbuf->status = V4L2BUF_AVAILABLE; - -- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -- return 0; -- - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->buf.m.planes = avbuf->planes; - avbuf->buf.length = avbuf->num_planes; -@@ -459,6 +602,15 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - avbuf->buf.length = avbuf->planes[0].length; - } - -+ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) -+ return 0; -+ -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ ret = v4l2_buffer_export_drm(avbuf); -+ if (ret) -+ return ret; -+ } -+ - return ff_v4l2_buffer_enqueue(avbuf); - } - -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 7a57caf949..19324541d0 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -27,6 +27,7 @@ - #include - #include - -+#include "libavutil/hwcontext_drm.h" - #include "avcodec.h" - - enum V4L2Buffer_status { -@@ -42,6 +43,9 @@ typedef struct V4L2Buffer { - /* each buffer needs to have a reference to its context */ - struct V4L2Context *context; - -+ /* DRM descriptor */ -+ AVDRMFrameDescriptor drm_frame; -+ - /* This object is refcounted per-plane, so we need to keep track - * of how many context-refs we are holding. */ - AVBufferRef *context_ref; -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index efcb0426e4..9457fadb1e 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -393,22 +393,54 @@ static int v4l2_release_buffers(V4L2Context* ctx) - struct v4l2_requestbuffers req = { - .memory = V4L2_MEMORY_MMAP, - .type = ctx->type, -- .count = 0, /* 0 -> unmaps buffers from the driver */ -+ .count = 0, /* 0 -> unmap all buffers from the driver */ - }; -- int i, j; -+ int ret, i, j; - - for (i = 0; i < ctx->num_buffers; i++) { - V4L2Buffer *buffer = &ctx->buffers[i]; - - for (j = 0; j < buffer->num_planes; j++) { - struct V4L2Plane_info *p = &buffer->plane_info[j]; -+ -+ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { -+ /* output buffers are not EXPORTED */ -+ goto unmap; -+ } -+ -+ if (ctx_to_m2mctx(ctx)->output_drm) { -+ /* use the DRM frame to close */ -+ if (buffer->drm_frame.objects[j].fd >= 0) { -+ if (close(buffer->drm_frame.objects[j].fd) < 0) { -+ av_log(logger(ctx), AV_LOG_ERROR, "%s close drm fd " -+ "[buffer=%2d, plane=%d, fd=%2d] - %s \n", -+ ctx->name, i, j, buffer->drm_frame.objects[j].fd, -+ av_err2str(AVERROR(errno))); -+ } -+ } -+ } -+unmap: - if (p->mm_addr && p->length) - if (munmap(p->mm_addr, p->length) < 0) -- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); -+ av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", -+ ctx->name, av_err2str(AVERROR(errno))); - } - } - -- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); -+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); -+ if (ret < 0) { -+ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", -+ ctx->name, av_err2str(AVERROR(errno))); -+ -+ if (ctx_to_m2mctx(ctx)->output_drm) -+ av_log(logger(ctx), AV_LOG_ERROR, -+ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" -+ "for all buffers: \n" -+ " 1. drmModeRmFB(..)\n" -+ " 2. 
drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n"); -+ } -+ -+ return ret; - } - - static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) -diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c -index 427e165f58..7896326e80 100644 ---- a/libavcodec/v4l2_m2m.c -+++ b/libavcodec/v4l2_m2m.c -@@ -159,7 +159,9 @@ static int v4l2_configure_contexts(V4L2m2mContext* s) - goto error; - } - -- /* decoder's buffers need to be updated at a later stage */ -+ /* decoder's capture buffers are updated during v4l2_try_start once we find -+ * the valid format. -+ */ - if (!av_codec_is_decoder(s->avctx->codec)) { - ret = ff_v4l2_context_init(&s->capture); - if (ret) { -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 0d4671beb1..043a81a86a 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -59,6 +59,9 @@ typedef struct V4L2m2mContext { - - /* Reference to self; only valid while codec is active. */ - AVBufferRef *self_ref; -+ -+ /* generate DRM frames */ -+ int output_drm; - } V4L2m2mContext; - - typedef struct V4L2m2mPriv -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index e1b6925771..b28f4e236a 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -23,12 +23,18 @@ - - #include - #include -+ -+#include "libavutil/hwcontext.h" -+#include "libavutil/hwcontext_drm.h" - #include "libavutil/pixfmt.h" - #include "libavutil/pixdesc.h" - #include "libavutil/opt.h" - #include "libavcodec/avcodec.h" - #include "libavcodec/decode.h" - -+#include "libavcodec/hwaccel.h" -+#include "libavcodec/internal.h" -+ - #include "v4l2_context.h" - #include "v4l2_m2m.h" - #include "v4l2_fmt.h" -@@ -186,6 +192,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; - capture->av_pix_fmt = avctx->pix_fmt; - -+ /* the client requests the codec to generate DRM frames: -+ * - data[0] will therefore point to the returned AVDRMFrameDescriptor -+ * check the ff_v4l2_buffer_to_avframe conversion function. -+ * - the DRM frame format is passed in the DRM frame descriptor layer. -+ * check the v4l2_get_drm_frame function. 
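-+     * - (illustrative note: the client opts in by returning
-+     *   AV_PIX_FMT_DRM_PRIME from its get_format callback, which the
-+     *   ff_get_format call below checks)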
-+ */ -+ if (ff_get_format(avctx, avctx->codec->pix_fmts) == AV_PIX_FMT_DRM_PRIME) -+ s->output_drm = 1; -+ - ret = ff_v4l2_m2m_codec_init(avctx); - if (ret) { - V4L2m2mPriv *priv = avctx->priv_data; -@@ -209,6 +224,11 @@ static const AVOption options[] = { - { NULL}, - }; - -+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { -+ HW_CONFIG_INTERNAL(DRM_PRIME), -+ NULL -+}; -+ - #define M2MDEC_CLASS(NAME) \ - static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ - .class_name = #NAME "_v4l2_m2m_decoder", \ -@@ -229,7 +249,10 @@ static const AVOption options[] = { - .init = v4l2_decode_init, \ - .receive_frame = v4l2_receive_frame, \ - .close = ff_v4l2_m2m_codec_end, \ -+ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ -+ AV_PIX_FMT_NONE}, \ - .bsfs = bsf_name, \ -+ .hw_configs = v4l2_m2m_hw_configs, \ - .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY, \ - AV_CODEC_CAP_AVOID_PROBING, \ - .wrapper_name = "v4l2m2m", \ --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0003-libavcodec-v4l2m2m-adjust-formatting.patch b/packages/multimedia/ffmpeg/patches/v4l2/0003-libavcodec-v4l2m2m-adjust-formatting.patch deleted file mode 100644 index a40bb79848..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0003-libavcodec-v4l2m2m-adjust-formatting.patch +++ /dev/null @@ -1,106 +0,0 @@ -From ebcfd47d8411fcc91d8058643b151068b5a7fedc Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Tue, 8 May 2018 22:40:23 -0700 -Subject: [PATCH 03/14] libavcodec: v4l2m2m: adjust formatting - -just some simple formatting fixes that unify the code quality ---- - libavcodec/v4l2_buffers.c | 23 +++++++++++++++-------- - libavcodec/v4l2_buffers.h | 1 - - 2 files changed, 15 insertions(+), 9 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index e5c46ac81e..897c3c4636 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -401,7 +401,8 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i - bytesused = FFMIN(size, out->plane_info[plane].length); - length = out->plane_info[plane].length; - -- memcpy(out->plane_info[plane].mm_addr, data, FFMIN(size, out->plane_info[plane].length)); -+ memcpy(out->plane_info[plane].mm_addr, data, -+ FFMIN(size, out->plane_info[plane].length)); - - if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { - out->planes[plane].bytesused = bytesused; -@@ -425,7 +426,10 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer* out) - int i, ret; - - for(i = 0; i < out->num_planes; i++) { -- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, frame->buf[i]); -+ ret = v4l2_bufref_to_buf(out, i, -+ frame->buf[i]->data, -+ frame->buf[i]->size, -+ frame->buf[i]); - if (ret) - return ret; - } -@@ -480,8 +484,8 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf) - /* 2. get frame information */ - frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME); - frame->color_primaries = v4l2_get_color_primaries(avbuf); -- frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_range = v4l2_get_color_range(avbuf); -+ frame->colorspace = v4l2_get_color_space(avbuf); - frame->color_trc = v4l2_get_color_trc(avbuf); - frame->pts = v4l2_get_pts(avbuf); - -@@ -507,7 +511,8 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) - if (ret) - return ret; - -- pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? 
avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; -+ pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? -+ avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused; - pkt->data = pkt->buf->data; - - if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME) -@@ -563,6 +568,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - /* in MP, the V4L2 API states that buf.length means num_planes */ - if (avbuf->num_planes >= avbuf->buf.length) - break; -+ - if (avbuf->buf.m.planes[avbuf->num_planes].length) - avbuf->num_planes++; - } -@@ -579,12 +585,14 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, - PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); -+ buf_to_m2mctx(avbuf)->fd, -+ avbuf->buf.m.planes[i].m.mem_offset); - } else { - avbuf->plane_info[i].length = avbuf->buf.length; - avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, - PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); -+ buf_to_m2mctx(avbuf)->fd, -+ avbuf->buf.m.offset); - } - - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) -@@ -594,9 +602,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - avbuf->status = V4L2BUF_AVAILABLE; - - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { -- avbuf->buf.m.planes = avbuf->planes; - avbuf->buf.length = avbuf->num_planes; -- -+ avbuf->buf.m.planes = avbuf->planes; - } else { - avbuf->buf.bytesused = avbuf->planes[0].bytesused; - avbuf->buf.length = avbuf->planes[0].length; -diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h -index 19324541d0..b6072baec8 100644 ---- a/libavcodec/v4l2_buffers.h -+++ b/libavcodec/v4l2_buffers.h -@@ -131,5 +131,4 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index); - */ - int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf); - -- - #endif // AVCODEC_V4L2_BUFFERS_H --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0004-libavcodec-v4l2m2m-fix-error-handling-during-buffer-.patch b/packages/multimedia/ffmpeg/patches/v4l2/0004-libavcodec-v4l2m2m-fix-error-handling-during-buffer-.patch deleted file mode 100644 index 3ed7a62fad..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0004-libavcodec-v4l2m2m-fix-error-handling-during-buffer-.patch +++ /dev/null @@ -1,84 +0,0 @@ -From db39d4579d36266a3f3b87312303d5097543633d Mon Sep 17 00:00:00 2001 -From: Jorge Ramirez-Ortiz -Date: Sun, 6 May 2018 19:56:30 +0200 -Subject: [PATCH 04/14] libavcodec: v4l2m2m: fix error handling during buffer - init - -Signed-off-by: Jorge Ramirez-Ortiz ---- - libavcodec/v4l2_context.c | 19 ++++++++++++++++--- - libavcodec/v4l2_m2m_dec.c | 9 +++++++-- - 2 files changed, 23 insertions(+), 5 deletions(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 9457fadb1e..12d40d597e 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -263,6 +263,12 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) - /* if we are draining and there are no more capture buffers queued in the driver we are done */ - if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) { - for (i = 0; i < ctx->num_buffers; i++) { -+ /* capture buffer initialization happens during decode hence -+ * detection happens at runtime -+ */ -+ if (!ctx->buffers) -+ break; -+ - if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) - goto start; - } -@@ -724,9 
+730,8 @@ int ff_v4l2_context_init(V4L2Context* ctx) - ctx->buffers[i].context = ctx; - ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i); - if (ret < 0) { -- av_log(logger(ctx), AV_LOG_ERROR, "%s buffer initialization (%s)\n", ctx->name, av_err2str(ret)); -- av_free(ctx->buffers); -- return ret; -+ av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret)); -+ goto error; - } - } - -@@ -739,4 +744,12 @@ int ff_v4l2_context_init(V4L2Context* ctx) - V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline); - - return 0; -+ -+error: -+ v4l2_release_buffers(ctx); -+ -+ av_free(ctx->buffers); -+ ctx->buffers = NULL; -+ -+ return ret; - } -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index b28f4e236a..fb5406a74e 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -92,8 +92,8 @@ static int v4l2_try_start(AVCodecContext *avctx) - if (!capture->buffers) { - ret = ff_v4l2_context_init(capture); - if (ret) { -- av_log(avctx, AV_LOG_DEBUG, "can't request output buffers\n"); -- return ret; -+ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n"); -+ return AVERROR(ENOMEM); - } - } - -@@ -157,6 +157,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) - ret = v4l2_try_start(avctx); - if (ret) { - av_packet_unref(&avpkt); -+ -+ /* cant recover */ -+ if (ret == AVERROR(ENOMEM)) -+ return ret; -+ - return 0; - } - } --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0005-libavcodec-v4l2m2m-depends-on-libdrm.patch b/packages/multimedia/ffmpeg/patches/v4l2/0005-libavcodec-v4l2m2m-depends-on-libdrm.patch deleted file mode 100644 index 365d1f51e4..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0005-libavcodec-v4l2m2m-depends-on-libdrm.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 86a709f752d430166b9c1e26fa639886710b79ad Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Thu, 16 Aug 2018 21:09:40 -0700 -Subject: [PATCH 05/14] libavcodec: v4l2m2m: depends on libdrm - ---- - configure | 1 + - libavcodec/v4l2_buffers.c | 2 +- - 2 files changed, 2 insertions(+), 1 deletion(-) - -diff --git a/configure b/configure -index 34c2adb4a4..b72f1d6270 100755 ---- a/configure -+++ b/configure -@@ -3343,6 +3343,7 @@ sndio_indev_deps="sndio" - sndio_outdev_deps="sndio" - v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_indev_suggest="libv4l2" -+v4l2_outdev_deps="libdrm" - v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" - v4l2_outdev_suggest="libv4l2" - vfwcap_indev_deps="vfw32 vfwcap_defines" -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index 897c3c4636..d6838866b7 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -21,7 +21,7 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - --#include -+#include - #include - #include - #include --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0006-libavcodec-v4l2m2m-set-format_modifier-to-DRM_FORMAT.patch b/packages/multimedia/ffmpeg/patches/v4l2/0006-libavcodec-v4l2m2m-set-format_modifier-to-DRM_FORMAT.patch deleted file mode 100644 index ddbd61d602..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0006-libavcodec-v4l2m2m-set-format_modifier-to-DRM_FORMAT.patch +++ /dev/null @@ -1,30 +0,0 @@ -From e657b496a21056d51f17f187dfdf0c62d1da13f2 Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Thu, 16 Aug 2018 21:10:13 -0700 -Subject: [PATCH 06/14] 
libavcodec: v4l2m2m: set format_modifier to - DRM_FORMAT_MOD_LINEAR - ---- - libavcodec/v4l2_buffers.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index d6838866b7..d879aab7b1 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -321,10 +321,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) - /* drm frame */ - avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; - avbuf->drm_frame.objects[i].fd = expbuf.fd; -+ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; - } else { - /* drm frame */ - avbuf->drm_frame.objects[0].size = avbuf->buf.length; - avbuf->drm_frame.objects[0].fd = expbuf.fd; -+ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; - } - } - --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0007-libavcodec-v4l2m2m-only-mmap-the-buffer-when-it-is-o.patch b/packages/multimedia/ffmpeg/patches/v4l2/0007-libavcodec-v4l2m2m-only-mmap-the-buffer-when-it-is-o.patch deleted file mode 100644 index 30a7484304..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0007-libavcodec-v4l2m2m-only-mmap-the-buffer-when-it-is-o.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 013503bba6eefa10caffe2451fe375b31ed1584a Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Thu, 16 Aug 2018 21:10:53 -0700 -Subject: [PATCH 07/14] libavcodec: v4l2m2m: only mmap the buffer when it is - output type and drm prime is used - ---- - libavcodec/v4l2_buffers.c | 24 ++++++++++++++++-------- - 1 file changed, 16 insertions(+), 8 deletions(-) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index d879aab7b1..ee19bb5b6f 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -585,16 +585,24 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; -- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -- PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, -- avbuf->buf.m.planes[i].m.mem_offset); -+ -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { -+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, -+ PROT_READ | PROT_WRITE, MAP_SHARED, -+ buf_to_m2mctx(avbuf)->fd, -+ avbuf->buf.m.planes[i].m.mem_offset); -+ } - } else { - avbuf->plane_info[i].length = avbuf->buf.length; -- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, -- PROT_READ | PROT_WRITE, MAP_SHARED, -- buf_to_m2mctx(avbuf)->fd, -- avbuf->buf.m.offset); -+ -+ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || -+ !buf_to_m2mctx(avbuf)->output_drm) { -+ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, -+ PROT_READ | PROT_WRITE, MAP_SHARED, -+ buf_to_m2mctx(avbuf)->fd, -+ avbuf->buf.m.offset); -+ } - } - - if (avbuf->plane_info[i].mm_addr == MAP_FAILED) --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0008-libavcodec-v4l2m2m-allow-using-software-pixel-format.patch b/packages/multimedia/ffmpeg/patches/v4l2/0008-libavcodec-v4l2m2m-allow-using-software-pixel-format.patch deleted file mode 100644 index 9f6149077f..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0008-libavcodec-v4l2m2m-allow-using-software-pixel-format.patch +++ /dev/null @@ -1,42 +0,0 @@ -From f3df44dbdf5e7ca66064c0ac9ae0f84bbcc245e9 Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: 
Thu, 16 Aug 2018 21:11:38 -0700 -Subject: [PATCH 08/14] libavcodec: v4l2m2m: allow using software pixel formats - ---- - libavcodec/v4l2_m2m_dec.c | 11 ++++++++++- - 1 file changed, 10 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index fb5406a74e..8834b3d6fc 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -203,8 +203,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - * - the DRM frame format is passed in the DRM frame descriptor layer. - * check the v4l2_get_drm_frame function. - */ -- if (ff_get_format(avctx, avctx->codec->pix_fmts) == AV_PIX_FMT_DRM_PRIME) -+ switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { -+ case AV_PIX_FMT_DRM_PRIME: - s->output_drm = 1; -+ break; -+ case AV_PIX_FMT_NONE: -+ return 0; -+ break; -+ default: -+ break; -+ } - - ret = ff_v4l2_m2m_codec_init(avctx); - if (ret) { -@@ -255,6 +263,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { - .receive_frame = v4l2_receive_frame, \ - .close = ff_v4l2_m2m_codec_end, \ - .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ -+ AV_PIX_FMT_NV12, \ - AV_PIX_FMT_NONE}, \ - .bsfs = bsf_name, \ - .hw_configs = v4l2_m2m_hw_configs, \ --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0009-libavcodec-v4l2m2m-fix-decoder-capabilities.patch b/packages/multimedia/ffmpeg/patches/v4l2/0009-libavcodec-v4l2m2m-fix-decoder-capabilities.patch deleted file mode 100644 index f359d9c87a..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0009-libavcodec-v4l2m2m-fix-decoder-capabilities.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 3af2830856f4356acb9bf1655adf632759782aac Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Mon, 24 Sep 2018 13:38:46 -0700 -Subject: [PATCH 09/14] libavcodec: v4l2m2m: fix decoder capabilities - ---- - libavcodec/v4l2_m2m_dec.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 8834b3d6fc..0087de35d0 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -267,8 +267,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { - AV_PIX_FMT_NONE}, \ - .bsfs = bsf_name, \ - .hw_configs = v4l2_m2m_hw_configs, \ -- .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY, \ -- AV_CODEC_CAP_AVOID_PROBING, \ -+ .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ - .wrapper_name = "v4l2m2m", \ - }; - --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0010-libavcodec-v4l2m2m-implement-hwcontext.patch b/packages/multimedia/ffmpeg/patches/v4l2/0010-libavcodec-v4l2m2m-implement-hwcontext.patch deleted file mode 100644 index 40ef615117..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0010-libavcodec-v4l2m2m-implement-hwcontext.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 8b7ec4eae175835a806619b0bba98eb6d7252a13 Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Mon, 24 Sep 2018 13:39:31 -0700 -Subject: [PATCH 10/14] libavcodec: v4l2m2m: implement hwcontext - ---- - libavcodec/v4l2_buffers.c | 22 ++++++++++++++++++++++ - libavcodec/v4l2_context.h | 2 ++ - libavcodec/v4l2_m2m.h | 2 ++ - libavcodec/v4l2_m2m_dec.c | 9 +++++++++ - 4 files changed, 35 insertions(+) - -diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c -index ee19bb5b6f..27820905b2 100644 ---- a/libavcodec/v4l2_buffers.c -+++ b/libavcodec/v4l2_buffers.c -@@ -456,6 +456,7 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, 
V4L2Buffer *avbuf) - - frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); - frame->format = AV_PIX_FMT_DRM_PRIME; -+ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); - } else { - /* 1. get references to the actual data */ - for (i = 0; i < avbuf->num_planes; i++) { -@@ -555,6 +556,27 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) - avbuf->buf.type = ctx->type; - avbuf->buf.index = index; - -+ if (buf_to_m2mctx(avbuf)->output_drm) { -+ AVHWFramesContext *hwframes; -+ -+ av_buffer_unref(&ctx->frames_ref); -+ -+ ctx->frames_ref = av_hwframe_ctx_alloc(buf_to_m2mctx(avbuf)->device_ref); -+ if (!ctx->frames_ref) { -+ ret = AVERROR(ENOMEM); -+ return ret; -+ } -+ -+ hwframes = (AVHWFramesContext*)ctx->frames_ref->data; -+ hwframes->format = AV_PIX_FMT_DRM_PRIME; -+ hwframes->sw_format = ctx->av_pix_fmt; -+ hwframes->width = ctx->width; -+ hwframes->height = ctx->height; -+ ret = av_hwframe_ctx_init(ctx->frames_ref); -+ if (ret < 0) -+ return ret; -+ } -+ - if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { - avbuf->buf.length = VIDEO_MAX_PLANES; - avbuf->buf.m.planes = avbuf->planes; -diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h -index 632f1d0aac..9a1dbbea60 100644 ---- a/libavcodec/v4l2_context.h -+++ b/libavcodec/v4l2_context.h -@@ -91,6 +91,8 @@ typedef struct V4L2Context { - */ - int done; - -+ AVBufferRef *frames_ref; -+ - } V4L2Context; - - /** -diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h -index 043a81a86a..4475618392 100644 ---- a/libavcodec/v4l2_m2m.h -+++ b/libavcodec/v4l2_m2m.h -@@ -60,6 +60,8 @@ typedef struct V4L2m2mContext { - /* Reference to self; only valid while codec is active. */ - AVBufferRef *self_ref; - -+ AVBufferRef *device_ref; -+ - /* generate DRM frames */ - int output_drm; - } V4L2m2mContext; -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index 0087de35d0..fffeb49092 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -214,6 +214,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - break; - } - -+ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); -+ if (!s->device_ref) { -+ ret = AVERROR(ENOMEM); -+ return ret; -+ } -+ ret = av_hwdevice_ctx_init(s->device_ref); -+ if (ret < 0) -+ return ret; -+ - ret = ff_v4l2_m2m_codec_init(avctx); - if (ret) { - V4L2m2mPriv *priv = avctx->priv_data; --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0011-libavcodec-v4l2m2m-implement-flush.patch b/packages/multimedia/ffmpeg/patches/v4l2/0011-libavcodec-v4l2m2m-implement-flush.patch deleted file mode 100644 index 1b29060cfd..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0011-libavcodec-v4l2m2m-implement-flush.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 799aa17fbaa5a5dda7fd8a04c71df905e363f6f3 Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Mon, 24 Sep 2018 13:39:56 -0700 -Subject: [PATCH 11/14] libavcodec: v4l2m2m: implement flush - ---- - libavcodec/v4l2_m2m_dec.c | 20 ++++++++++++++++++++ - 1 file changed, 20 insertions(+) - -diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c -index fffeb49092..47ac53df00 100644 ---- a/libavcodec/v4l2_m2m_dec.c -+++ b/libavcodec/v4l2_m2m_dec.c -@@ -236,6 +236,25 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) - return v4l2_prepare_decoder(s); - } - -+static void v4l2_flush(AVCodecContext *avctx) -+{ -+ V4L2m2mPriv *priv = avctx->priv_data; -+ V4L2m2mContext* s = priv->context; -+ int ret; -+ -+ /* wait for pending buffer references */ -+ if 
(atomic_load(&s->refcount)) -+ while(sem_wait(&s->refsync) == -1 && errno == EINTR); -+ -+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); -+ if (ret) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name); -+ -+ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); -+ if (ret) -+ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name); -+} -+ - #define OFFSET(x) offsetof(V4L2m2mPriv, x) - #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM - -@@ -271,6 +290,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { - .init = v4l2_decode_init, \ - .receive_frame = v4l2_receive_frame, \ - .close = ff_v4l2_m2m_codec_end, \ -+ .flush = v4l2_flush, \ - .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ - AV_PIX_FMT_NV12, \ - AV_PIX_FMT_NONE}, \ --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0012-libavcodec-v4l2m2m-aspect-ratio.patch b/packages/multimedia/ffmpeg/patches/v4l2/0012-libavcodec-v4l2m2m-aspect-ratio.patch deleted file mode 100644 index ba62a0397c..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0012-libavcodec-v4l2m2m-aspect-ratio.patch +++ /dev/null @@ -1,63 +0,0 @@ -From f91c0f01a4c563129acd388272e0ac4795b83435 Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Fri, 31 Jan 2020 08:33:02 -0800 -Subject: [PATCH 12/14] libavcodec: v4l2m2m: aspect ratio - ---- - libavcodec/v4l2_context.c | 26 +++++++++++++++++++++++++- - 1 file changed, 25 insertions(+), 1 deletion(-) - -diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c -index 12d40d597e..5c51399a4c 100644 ---- a/libavcodec/v4l2_context.c -+++ b/libavcodec/v4l2_context.c -@@ -539,6 +539,24 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p) - return 0; - } - -+static AVRational v4l2_get_sar(V4L2Context* ctx) -+{ -+ struct AVRational sar = { 1, 1 }; -+ struct v4l2_cropcap cropcap; -+ int ret; -+ -+ memset(&cropcap, 0, sizeof(cropcap)); -+ cropcap.type = ctx->type; -+ -+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_CROPCAP, &cropcap); -+ if (ret) -+ return sar; -+ -+ sar.num = cropcap.pixelaspect.numerator; -+ sar.den = cropcap.pixelaspect.denominator; -+ return sar; -+} -+ - /***************************************************************************** - * - * V4L2 Context Interface -@@ -612,6 +630,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) - int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame) - { - V4L2Buffer* avbuf = NULL; -+ int ret; - - /* - * blocks until: -@@ -626,7 +645,12 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame) - return AVERROR(EAGAIN); - } - -- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ ret = ff_v4l2_buffer_buf_to_avframe(frame, avbuf); -+ if (ret) -+ return ret; -+ -+ frame->sample_aspect_ratio = v4l2_get_sar(ctx); -+ return 0; - } - - int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt) --- -2.24.1 - diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0013-libavcodec-v4l2m2m-save-pkt.patch b/packages/multimedia/ffmpeg/patches/v4l2/0013-libavcodec-v4l2m2m-save-pkt.patch deleted file mode 100644 index 95e0c03633..0000000000 --- a/packages/multimedia/ffmpeg/patches/v4l2/0013-libavcodec-v4l2m2m-save-pkt.patch +++ /dev/null @@ -1,62 +0,0 @@ -From da24c4a08dd820804d9bd0709815d954836277d8 Mon Sep 17 00:00:00 2001 -From: Lukas Rusak -Date: Fri, 31 Jan 2020 08:35:37 -0800 -Subject: [PATCH 13/14] libavcodec: v4l2m2m: save pkt - ---- - libavcodec/v4l2_m2m_dec.c | 18 
++++++++++++++----
- 1 file changed, 14 insertions(+), 4 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 47ac53df00..cf48ff4b22 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -131,6 +131,8 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s)
-     return 0;
- }
- 
-+static AVPacket saved_avpkt = { 0 };
-+
- static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- {
-     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-@@ -139,9 +141,14 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-     AVPacket avpkt = {0};
-     int ret;
- 
--    ret = ff_decode_get_packet(avctx, &avpkt);
--    if (ret < 0 && ret != AVERROR_EOF)
--        return ret;
-+    if (saved_avpkt.size) {
-+        avpkt = saved_avpkt;
-+        memset(&saved_avpkt, 0, sizeof(saved_avpkt));
-+    } else {
-+        ret = ff_decode_get_packet(avctx, &avpkt);
-+        if (ret < 0 && ret != AVERROR_EOF)
-+            return ret;
-+    }
- 
-     if (s->draining)
-         goto dequeue;
-@@ -150,6 +157,8 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-     if (ret < 0) {
-         if (ret != AVERROR(ENOMEM))
-             return ret;
-+
-+        saved_avpkt = avpkt;
-         /* no input buffers available, continue dequeing */
-     }
- 
-@@ -167,7 +176,8 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-     }
- 
- dequeue:
--    av_packet_unref(&avpkt);
-+    if (!saved_avpkt.size)
-+        av_packet_unref(&avpkt);
-     return ff_v4l2_context_dequeue_frame(capture, frame);
- }
- 
---
-2.24.1
-
diff --git a/packages/multimedia/ffmpeg/patches/v4l2/0014-libavcodec-v4l2m2m-only-use-a-few-output-buffers.patch b/packages/multimedia/ffmpeg/patches/v4l2/0014-libavcodec-v4l2m2m-only-use-a-few-output-buffers.patch
deleted file mode 100644
index 24cfd7469c..0000000000
--- a/packages/multimedia/ffmpeg/patches/v4l2/0014-libavcodec-v4l2m2m-only-use-a-few-output-buffers.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From d317cac9ec2470bcdb44f92a82b64e4d132aeb51 Mon Sep 17 00:00:00 2001
-From: Lukas Rusak
-Date: Fri, 31 Jan 2020 09:00:51 -0800
-Subject: [PATCH 14/14] libavcodec: v4l2m2m: only use a few output buffers
-
----
- libavcodec/v4l2_m2m.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 4475618392..7e075e7e80 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -38,7 +38,7 @@
- 
- #define V4L_M2M_DEFAULT_OPTS \
-     { "num_output_buffers", "Number of buffers in the output context",\
--        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS }
-+        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 2 }, 1, INT_MAX, FLAGS }
- 
- typedef struct V4L2m2mContext {
-     char devname[PATH_MAX];
---
-2.24.1
-
From 1ff43295455671cfd1d6453f534042f57a29d4ee Mon Sep 17 00:00:00 2001
From: Matthias Reichl
Date: Thu, 9 Jul 2020 12:33:50 +0200
Subject: [PATCH 03/10] tools/ffmpeg: add gen-patches.sh script

This script creates or updates the ffmpeg patch files from the various
feature branches and records patch info (repo, branch, gitrevs) about
where the changes came from in the commit message.

The feature branch to process can be specified via a command line
argument; "all" will create/update all known branches.

By default the patches are updated to HEAD of the branches. If only
a single branch is processed the full githash to use instead of HEAD
can be specified as an optional second argument. This can be used to
drop some of the top-most commits in the branch, eg if later changes
cause issues.
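For example, updating just the rpi patch pinned to a specific revision
might look like this (the githash argument is a placeholder):

$ ~/libreelec-git/tools/ffmpeg/gen-patches.sh rpi <full-githash>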
The script has to be run in a local ffmpeg git tree. eg: $ cd ~/ffmpeg-git $ ~/libreelec-git/tools/ffmpeg/gen-patches.sh all Additional git commit options (eg -s) can be set via the GIT_COMMIT_ARGS environment variable Signed-off-by: Matthias Reichl --- tools/ffmpeg/gen-patches.sh | 98 +++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100755 tools/ffmpeg/gen-patches.sh diff --git a/tools/ffmpeg/gen-patches.sh b/tools/ffmpeg/gen-patches.sh new file mode 100755 index 0000000000..48ace62349 --- /dev/null +++ b/tools/ffmpeg/gen-patches.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# base ffmpeg version +KODI_FFMPEG_REPO="https://github.com/xbmc/FFmpeg" +KODI_FFMPEG_VERSION="4.3-Matrix-Alpha1" + +ALL_FEATURE_SETS="v4l2-drmprime v4l2-request libreelec rpi" + +if [ $# -eq 0 ]; then + echo "usage: $0 all|featureset [githash]" + echo "available feature sets: ${ALL_FEATURE_SETS}" + exit 1 +fi + +FFMPEG_ROOT="$(pwd)" +LE_ROOT="$(cd $(dirname $0)/../.. ; pwd)" + +# get kodi's ffmpeg version +git fetch "${KODI_FFMPEG_REPO}" "${KODI_FFMPEG_VERSION}" +KODI_REV=$(git rev-parse FETCH_HEAD) + +create_patch() { + FEATURE_SET="$1" + REFTYPE="branch" + case "${FEATURE_SET}" in + v4l2-drmprime) + REPO="https://github.com/lrusak/FFmpeg" + REFSPEC="v4l2-drmprime-v5" + ;; + v4l2-request) + REPO="https://github.com/Kwiboo/FFmpeg" + REFSPEC="v4l2-request-hwaccel-4.3" + ;; + libreelec) + REPO="https://github.com/LibreELEC/FFmpeg" + REFSPEC="4.3-libreelec-misc" + ;; + rpi) + REPO="https://github.com/jc-kynesim/rpi-ffmpeg" + REFSPEC="test/4.3/kodi_main" + ;; + *) + echo "illegal feature set ${FEATURE_SET}" + exit 1 + ;; + esac + + PATCH_DIR="packages/multimedia/ffmpeg/patches/${FEATURE_SET}" + PATCH_FILE="${PATCH_DIR}/ffmpeg-001-${FEATURE_SET}.patch" + mkdir -p "${LE_ROOT}/${PATCH_DIR}" + + git fetch "${REPO}" "${REFSPEC}" + if [ $# -ge 2 ]; then + REV="$2" + else + REV=$(git rev-parse FETCH_HEAD) + fi + BASE_REV=$(git merge-base "${KODI_REV}" "${REV}") + + if [ -f "${LE_ROOT}/${PATCH_FILE}" ]; then + ACTION="update" + else + ACTION="create" + fi + + if [ "${FEATURE_SET}" = "rpi" ]; then + # branch has non-linear history, format-patch doesn't work + git diff "${BASE_REV}..${REV}" > "${LE_ROOT}/${PATCH_FILE}" + else + git format-patch --stdout --no-signature "${BASE_REV}..${REV}" > "${LE_ROOT}/${PATCH_FILE}" + fi + + MSG=$(mktemp) + + cat << EOF > "${MSG}" +ffmpeg: ${ACTION} ${FEATURE_SET} patch + +Patch created using revisions ${BASE_REV:0:7}..${REV:0:7} +from ${REFTYPE} ${REFSPEC} of ${REPO} +EOF + + cd "${LE_ROOT}" + git add "${PATCH_FILE}" + git commit -F "${MSG}" ${GIT_COMMIT_ARGS} + cd "${FFMPEG_ROOT}" + rm "${MSG}" +} + +if [ "$1" = "all" ]; then + for SET in ${ALL_FEATURE_SETS}; do + create_patch "${SET}" + done +else + create_patch "$@" +fi + + + From 924d3e6a0ed7d64301ffdb1e4cf5c402b65dbd6e Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 9 Jul 2020 13:24:24 +0200 Subject: [PATCH 04/10] ffmpeg: create v4l2-drmprime patch Patch created using revisions 8e12af2..8595d06 from branch v4l2-drmprime-v5 of https://github.com/lrusak/FFmpeg Signed-off-by: Matthias Reichl --- .../ffmpeg-001-v4l2-drmprime.patch | 970 ++++++++++++++++++ 1 file changed, 970 insertions(+) create mode 100644 packages/multimedia/ffmpeg/patches/v4l2-drmprime/ffmpeg-001-v4l2-drmprime.patch diff --git a/packages/multimedia/ffmpeg/patches/v4l2-drmprime/ffmpeg-001-v4l2-drmprime.patch b/packages/multimedia/ffmpeg/patches/v4l2-drmprime/ffmpeg-001-v4l2-drmprime.patch new file mode 100644 index 0000000000..9313375a37 --- 
/dev/null +++ b/packages/multimedia/ffmpeg/patches/v4l2-drmprime/ffmpeg-001-v4l2-drmprime.patch @@ -0,0 +1,970 @@ +From b29b5b9f529bbb10cd9880adebf0fb287dcf233b Mon Sep 17 00:00:00 2001 +From: Andriy Gelman +Date: Tue, 28 Apr 2020 22:54:21 -0400 +Subject: [PATCH 01/11] avcodec/v4l2_m2m: Adapt to call close() on init fail + +This fixes several mem leaks when init of encoder/decoder failed. + +Fixes ticket #8285 + +Signed-off-by: Andriy Gelman +--- + libavcodec/v4l2_m2m.c | 8 ++++++++ + libavcodec/v4l2_m2m_dec.c | 10 ++-------- + libavcodec/v4l2_m2m_enc.c | 1 + + 3 files changed, 11 insertions(+), 8 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index e48b3a8ccf..bfea70ff0c 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -338,6 +338,13 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + V4L2m2mContext *s = priv->context; + int ret; + ++ if (!s) ++ return 0; ++ ++ if (av_codec_is_decoder(s->avctx->codec)) ++ av_packet_unref(&s->buf_pkt); ++ ++ if (s->fd >= 0) { + ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); + if (ret) + av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name); +@@ -345,6 +352,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + if (ret) + av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name); ++ } + + ff_v4l2_context_release(&s->output); + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 3e17e0fcac..a2ea0ff73a 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -212,9 +212,6 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + ret = ff_v4l2_m2m_codec_init(priv); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "can't configure decoder\n"); +- s->self_ref = NULL; +- av_buffer_unref(&priv->context_ref); +- + return ret; + } + +@@ -223,10 +220,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + + static av_cold int v4l2_decode_close(AVCodecContext *avctx) + { +- V4L2m2mPriv *priv = avctx->priv_data; +- V4L2m2mContext *s = priv->context; +- av_packet_unref(&s->buf_pkt); +- return ff_v4l2_m2m_codec_end(priv); ++ return ff_v4l2_m2m_codec_end(avctx->priv_data); + } + + #define OFFSET(x) offsetof(V4L2m2mPriv, x) +@@ -261,7 +255,7 @@ static const AVOption options[] = { + .close = v4l2_decode_close, \ + .bsfs = bsf_name, \ + .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ +- .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS, \ ++ .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ + .wrapper_name = "v4l2m2m", \ + } + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index 32321f392f..9f1b2c2ffc 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -416,6 +416,7 @@ static const AVCodecDefault v4l2_m2m_defaults[] = { + .close = v4l2_encode_close, \ + .defaults = v4l2_m2m_defaults, \ + .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY, \ ++ .caps_internal = FF_CODEC_CAP_INIT_CLEANUP, \ + .wrapper_name = "v4l2m2m", \ + } + + +From 7aaac68934c0e03a78c9f477ec64e522729a64b7 Mon Sep 17 00:00:00 2001 +From: Andriy Gelman +Date: Tue, 5 May 2020 01:54:54 -0400 +Subject: [PATCH 02/11] avcodec/v4l2_m2m_dec: Use av_packet_move_ref() + +Signed-off-by: Andriy Gelman +--- + libavcodec/v4l2_m2m_dec.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index a2ea0ff73a..45e9a8e9fe 100644 
+--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -142,8 +142,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + int ret; + + if (s->buf_pkt.size) { +- avpkt = s->buf_pkt; +- memset(&s->buf_pkt, 0, sizeof(AVPacket)); ++ av_packet_move_ref(&avpkt, &s->buf_pkt); + } else { + ret = ff_decode_get_packet(avctx, &avpkt); + if (ret < 0 && ret != AVERROR_EOF) + +From c6b85ed30f06ea99513b13cc768a922ebe4d68c2 Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Tue, 24 Apr 2018 23:00:23 -0700 +Subject: [PATCH 03/11] libavcodec: v4l2m2m: output AVDRMFrameDescriptor + +This allows for a zero-copy output by exporting the v4l2 buffer then wrapping that buffer +in the AVDRMFrameDescriptor like it is done in rkmpp. + +This has been in use for quite some time with great success on many platforms including: + - Amlogic S905 + - Raspberry Pi + - i.MX6 + - Dragonboard 410c + +This was developed in conjunction with Kodi to allow handling the zero-copy buffer rendering. +A simply utility for testing is also available here: https://github.com/BayLibre/ffmpeg-drm + +todo: + - allow selecting pixel format output from decoder + - allow configuring amount of output and capture buffers + +V2: + - allow selecting AV_PIX_FMT_DRM_PRIME + +V3: + - use get_format to select AV_PIX_FMT_DRM_PRIME + - use hw_configs + - add handling of AV_PIX_FMT_YUV420P format (for raspberry pi) + - add handling of AV_PIX_FMT_YUYV422 format (for i.MX6 coda decoder) + +V4: + - rebased on 4.2.2 + +V5: + - rebased on 4.3 +--- + libavcodec/v4l2_buffers.c | 155 ++++++++++++++++++++++++++++++++++++-- + libavcodec/v4l2_buffers.h | 4 + + libavcodec/v4l2_context.c | 40 +++++++++- + libavcodec/v4l2_m2m.h | 3 + + libavcodec/v4l2_m2m_dec.c | 23 ++++++ + 5 files changed, 213 insertions(+), 12 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 02f23d954b..4bb2bf6f87 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -21,6 +21,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include + #include + #include + #include +@@ -30,6 +31,7 @@ + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/hwcontext.h" + #include "v4l2_context.h" + #include "v4l2_buffers.h" + #include "v4l2_m2m.h" +@@ -210,7 +212,79 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) + return AVCOL_TRC_UNSPECIFIED; + } + +-static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) ++{ ++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor *layer; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_objects = avbuf->num_planes; ++ drm_desc->nb_layers = 1; ++ ++ layer = &drm_desc->layers[0]; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; ++ } ++ ++ switch (avbuf->context->av_pix_fmt) { ++ case AV_PIX_FMT_YUYV422: ++ ++ layer->format = DRM_FORMAT_YUYV; ++ layer->nb_planes = 1; ++ ++ break; ++ ++ case AV_PIX_FMT_NV12: ++ case AV_PIX_FMT_NV21: ++ ++ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? 
++ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; ++ break; ++ ++ case AV_PIX_FMT_YUV420P: ++ ++ layer->format = DRM_FORMAT_YUV420; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset = layer->planes[1].offset + ++ ((avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height) >> 2); ++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ break; ++ ++ default: ++ drm_desc->nb_layers = 0; ++ break; ++ } ++ ++ return (uint8_t *) drm_desc; ++} ++ ++static void v4l2_free_buffer(void *opaque, uint8_t *data) + { + V4L2Buffer* avbuf = opaque; + V4L2m2mContext *s = buf_to_m2mctx(avbuf); +@@ -234,6 +308,36 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) + } + } + ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) ++{ ++ struct v4l2_exportbuffer expbuf; ++ int i, ret; ++ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ memset(&expbuf, 0, sizeof(expbuf)); ++ ++ expbuf.index = avbuf->buf.index; ++ expbuf.type = avbuf->buf.type; ++ expbuf.plane = i; ++ ++ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { ++ /* drm frame */ ++ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; ++ avbuf->drm_frame.objects[i].fd = expbuf.fd; ++ } else { ++ /* drm frame */ ++ avbuf->drm_frame.objects[0].size = avbuf->buf.length; ++ avbuf->drm_frame.objects[0].fd = expbuf.fd; ++ } ++ } ++ ++ return 0; ++} ++ + static int v4l2_buf_increase_ref(V4L2Buffer *in) + { + V4L2m2mContext *s = buf_to_m2mctx(in); +@@ -254,6 +358,24 @@ static int v4l2_buf_increase_ref(V4L2Buffer *in) + return 0; + } + ++static int v4l2_buf_to_bufref_drm(V4L2Buffer *in, AVBufferRef **buf) ++{ ++ int ret; ++ ++ *buf = av_buffer_create((uint8_t *) &in->drm_frame, ++ sizeof(in->drm_frame), ++ v4l2_free_buffer, ++ in, AV_BUFFER_FLAG_READONLY); ++ if (!*buf) ++ return AVERROR(ENOMEM); ++ ++ ret = v4l2_buf_increase_ref(in); ++ if (ret) ++ av_buffer_unref(buf); ++ ++ return ret; ++} ++ + static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) + { + int ret; +@@ -303,13 +425,24 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + + frame->format = avbuf->context->av_pix_fmt; + +- for (i = 0; i < avbuf->num_planes; i++) { +- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ /* 1. get references to the actual data */ ++ ret = v4l2_buf_to_bufref_drm(avbuf, &frame->buf[0]); + if (ret) + return ret; + +- frame->linesize[i] = avbuf->plane_info[i].bytesperline; +- frame->data[i] = frame->buf[i]->data; ++ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ } else { ++ /* 1. 
get references to the actual data */ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); ++ if (ret) ++ return ret; ++ ++ frame->linesize[i] = avbuf->plane_info[i].bytesperline; ++ frame->data[i] = frame->buf[i]->data; ++ } + } + + /* fixup special cases */ +@@ -543,9 +676,6 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + + avbuf->status = V4L2BUF_AVAILABLE; + +- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) +- return 0; +- + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->buf.m.planes = avbuf->planes; + avbuf->buf.length = avbuf->num_planes; +@@ -555,6 +685,15 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + avbuf->buf.length = avbuf->planes[0].length; + } + ++ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ return 0; ++ ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ ret = v4l2_buffer_export_drm(avbuf); ++ if (ret) ++ return ret; ++ } ++ + return ff_v4l2_buffer_enqueue(avbuf); + } + +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index 8dbc7fc104..037e667997 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -27,6 +27,7 @@ + #include + #include + ++#include "libavutil/hwcontext_drm.h" + #include "avcodec.h" + + enum V4L2Buffer_status { +@@ -42,6 +43,9 @@ typedef struct V4L2Buffer { + /* each buffer needs to have a reference to its context */ + struct V4L2Context *context; + ++ /* DRM descriptor */ ++ AVDRMFrameDescriptor drm_frame; ++ + /* This object is refcounted per-plane, so we need to keep track + * of how many context-refs we are holding. */ + AVBufferRef *context_ref; +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 29b144ed73..7a92df2c3e 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -455,22 +455,54 @@ static int v4l2_release_buffers(V4L2Context* ctx) + struct v4l2_requestbuffers req = { + .memory = V4L2_MEMORY_MMAP, + .type = ctx->type, +- .count = 0, /* 0 -> unmaps buffers from the driver */ ++ .count = 0, /* 0 -> unmap all buffers from the driver */ + }; +- int i, j; ++ int ret, i, j; + + for (i = 0; i < ctx->num_buffers; i++) { + V4L2Buffer *buffer = &ctx->buffers[i]; + + for (j = 0; j < buffer->num_planes; j++) { + struct V4L2Plane_info *p = &buffer->plane_info[j]; ++ ++ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { ++ /* output buffers are not EXPORTED */ ++ goto unmap; ++ } ++ ++ if (ctx_to_m2mctx(ctx)->output_drm) { ++ /* use the DRM frame to close */ ++ if (buffer->drm_frame.objects[j].fd >= 0) { ++ if (close(buffer->drm_frame.objects[j].fd) < 0) { ++ av_log(logger(ctx), AV_LOG_ERROR, "%s close drm fd " ++ "[buffer=%2d, plane=%d, fd=%2d] - %s \n", ++ ctx->name, i, j, buffer->drm_frame.objects[j].fd, ++ av_err2str(AVERROR(errno))); ++ } ++ } ++ } ++unmap: + if (p->mm_addr && p->length) + if (munmap(p->mm_addr, p->length) < 0) +- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); ++ av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ++ ctx->name, av_err2str(AVERROR(errno))); + } + } + +- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); ++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); ++ if (ret < 0) { ++ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", ++ ctx->name, av_err2str(AVERROR(errno))); ++ ++ if (ctx_to_m2mctx(ctx)->output_drm) ++ av_log(logger(ctx), AV_LOG_ERROR, ++ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" ++ "for all buffers: \n" ++ " 1. 
drmModeRmFB(..)\n" ++ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n"); ++ } ++ ++ return ret; + } + + static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 456281f48c..4ee0be653b 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -63,6 +63,9 @@ typedef struct V4L2m2mContext { + + /* reference back to V4L2m2mPriv */ + void *priv; ++ ++ /* generate DRM frames */ ++ int output_drm; + } V4L2m2mContext; + + typedef struct V4L2m2mPriv { +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 45e9a8e9fe..eb6ecc8ed5 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -23,6 +23,9 @@ + + #include + #include ++ ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" + #include "libavutil/pixfmt.h" + #include "libavutil/pixdesc.h" + #include "libavutil/opt.h" +@@ -30,6 +33,9 @@ + #include "libavcodec/decode.h" + #include "libavcodec/internal.h" + ++#include "libavcodec/hwaccels.h" ++#include "libavcodec/internal.h" ++ + #include "v4l2_context.h" + #include "v4l2_m2m.h" + #include "v4l2_fmt.h" +@@ -207,6 +213,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; + capture->av_pix_fmt = avctx->pix_fmt; + ++ /* the client requests the codec to generate DRM frames: ++ * - data[0] will therefore point to the returned AVDRMFrameDescriptor ++ * check the ff_v4l2_buffer_to_avframe conversion function. ++ * - the DRM frame format is passed in the DRM frame descriptor layer. ++ * check the v4l2_get_drm_frame function. ++ */ ++ if (ff_get_format(avctx, avctx->codec->pix_fmts) == AV_PIX_FMT_DRM_PRIME) ++ s->output_drm = 1; ++ + s->avctx = avctx; + ret = ff_v4l2_m2m_codec_init(priv); + if (ret) { +@@ -232,6 +247,11 @@ static const AVOption options[] = { + { NULL}, + }; + ++static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { ++ HW_CONFIG_INTERNAL(DRM_PRIME), ++ NULL ++}; ++ + #define M2MDEC_CLASS(NAME) \ + static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ + .class_name = #NAME "_v4l2m2m_decoder", \ +@@ -255,6 +275,9 @@ static const AVOption options[] = { + .bsfs = bsf_name, \ + .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ + .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ ++ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ ++ AV_PIX_FMT_NONE}, \ ++ .hw_configs = v4l2_m2m_hw_configs, \ + .wrapper_name = "v4l2m2m", \ + } + + +From 2b5cd753892dceba3b211053a6266c40aab38c55 Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Thu, 16 Aug 2018 21:09:40 -0700 +Subject: [PATCH 04/11] libavcodec: v4l2m2m: depends on libdrm + +--- + configure | 1 + + libavcodec/v4l2_buffers.c | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/configure b/configure +index 8569a60bf8..a049707dd6 100755 +--- a/configure ++++ b/configure +@@ -3401,6 +3401,7 @@ sndio_indev_deps="sndio" + sndio_outdev_deps="sndio" + v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_indev_suggest="libv4l2" ++v4l2_outdev_deps="libdrm" + v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_outdev_suggest="libv4l2" + vfwcap_indev_deps="vfw32 vfwcap_defines" +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 4bb2bf6f87..c36a73d1fa 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -21,7 +21,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth 
Floor, Boston, MA 02110-1301 USA + */ + +-#include ++#include + #include + #include + #include + +From 09a0f1b99548a249991891ee4e02ae6613b545d7 Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Thu, 16 Aug 2018 21:10:13 -0700 +Subject: [PATCH 05/11] libavcodec: v4l2m2m: set format_modifier to + DRM_FORMAT_MOD_LINEAR + +--- + libavcodec/v4l2_buffers.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index c36a73d1fa..072b77bbda 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -328,10 +328,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) + /* drm frame */ + avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; + avbuf->drm_frame.objects[i].fd = expbuf.fd; ++ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; + } else { + /* drm frame */ + avbuf->drm_frame.objects[0].size = avbuf->buf.length; + avbuf->drm_frame.objects[0].fd = expbuf.fd; ++ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; + } + } + + +From e8df5a982f705aaba1e03aef653169bc17a0d464 Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Thu, 16 Aug 2018 21:10:53 -0700 +Subject: [PATCH 06/11] libavcodec: v4l2m2m: only mmap the buffer when it is + output type and drm prime is used + +--- + libavcodec/v4l2_buffers.c | 20 ++++++++++++++------ + 1 file changed, 14 insertions(+), 6 deletions(-) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 072b77bbda..8162531973 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -662,14 +662,22 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length; +- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, +- PROT_READ | PROT_WRITE, MAP_SHARED, +- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); ++ ++ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || ++ !buf_to_m2mctx(avbuf)->output_drm) { ++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); ++ } + } else { + avbuf->plane_info[i].length = avbuf->buf.length; +- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, +- PROT_READ | PROT_WRITE, MAP_SHARED, +- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); ++ ++ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || ++ !buf_to_m2mctx(avbuf)->output_drm) { ++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); ++ } + } + + if (avbuf->plane_info[i].mm_addr == MAP_FAILED) + +From 4fb7664bb6be542b691323a03050cdd024585afc Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Thu, 16 Aug 2018 21:11:38 -0700 +Subject: [PATCH 07/11] libavcodec: v4l2m2m: allow using software pixel formats + +--- + libavcodec/v4l2_m2m_dec.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index eb6ecc8ed5..3b2449ae6c 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -219,8 +219,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + * - the DRM frame format is passed in the DRM frame descriptor layer. + * check the v4l2_get_drm_frame function. 
+ */ +- if (ff_get_format(avctx, avctx->codec->pix_fmts) == AV_PIX_FMT_DRM_PRIME) ++ switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { ++ case AV_PIX_FMT_DRM_PRIME: + s->output_drm = 1; ++ break; ++ case AV_PIX_FMT_NONE: ++ return 0; ++ break; ++ default: ++ break; ++ } + + s->avctx = avctx; + ret = ff_v4l2_m2m_codec_init(priv); +@@ -276,6 +284,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { + .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ + .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ + .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ ++ AV_PIX_FMT_NV12, \ + AV_PIX_FMT_NONE}, \ + .hw_configs = v4l2_m2m_hw_configs, \ + .wrapper_name = "v4l2m2m", \ + +From 27ae887df07992385b1afc9b532f978066e83774 Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Mon, 24 Sep 2018 13:39:31 -0700 +Subject: [PATCH 08/11] libavcodec: v4l2m2m: implement hwcontext + +--- + libavcodec/v4l2_buffers.c | 22 ++++++++++++++++++++++ + libavcodec/v4l2_context.h | 2 ++ + libavcodec/v4l2_m2m.h | 2 ++ + libavcodec/v4l2_m2m_dec.c | 11 +++++++++++ + 4 files changed, 37 insertions(+) + +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 8162531973..9c5d471c9b 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -435,6 +435,7 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + + frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); + frame->format = AV_PIX_FMT_DRM_PRIME; ++ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); + } else { + /* 1. get references to the actual data */ + for (i = 0; i < avbuf->num_planes; i++) { +@@ -635,6 +636,27 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + avbuf->buf.type = ctx->type; + avbuf->buf.index = index; + ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ AVHWFramesContext *hwframes; ++ ++ av_buffer_unref(&ctx->frames_ref); ++ ++ ctx->frames_ref = av_hwframe_ctx_alloc(buf_to_m2mctx(avbuf)->device_ref); ++ if (!ctx->frames_ref) { ++ ret = AVERROR(ENOMEM); ++ return ret; ++ } ++ ++ hwframes = (AVHWFramesContext*)ctx->frames_ref->data; ++ hwframes->format = AV_PIX_FMT_DRM_PRIME; ++ hwframes->sw_format = ctx->av_pix_fmt; ++ hwframes->width = ctx->width; ++ hwframes->height = ctx->height; ++ ret = av_hwframe_ctx_init(ctx->frames_ref); ++ if (ret < 0) ++ return ret; ++ } ++ + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->buf.length = VIDEO_MAX_PLANES; + avbuf->buf.m.planes = avbuf->planes; +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 22a9532444..e804e94131 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -92,6 +92,8 @@ typedef struct V4L2Context { + */ + int done; + ++ AVBufferRef *frames_ref; ++ + } V4L2Context; + + /** +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 4ee0be653b..61cb919771 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -64,6 +64,8 @@ typedef struct V4L2m2mContext { + /* reference back to V4L2m2mPriv */ + void *priv; + ++ AVBufferRef *device_ref; ++ + /* generate DRM frames */ + int output_drm; + } V4L2m2mContext; +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 3b2449ae6c..c6b865fde8 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -35,6 +35,7 @@ + + #include "libavcodec/hwaccels.h" + #include "libavcodec/internal.h" ++#include "libavcodec/hwconfig.h" + + #include "v4l2_context.h" + #include "v4l2_m2m.h" 
+@@ -230,6 +231,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + break; + } + ++ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); ++ if (!s->device_ref) { ++ ret = AVERROR(ENOMEM); ++ return ret; ++ } ++ ++ ret = av_hwdevice_ctx_init(s->device_ref); ++ if (ret < 0) ++ return ret; ++ + s->avctx = avctx; + ret = ff_v4l2_m2m_codec_init(priv); + if (ret) { + +From aee464209ec6ad060d352dfb638344a1f4db3ce4 Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Mon, 4 May 2020 13:01:29 -0700 +Subject: [PATCH 09/11] libavcodec: v4l2m2m: allow lower minimum buffer values + +There is no reason to enforce a high minimum. In the context +of streaming only a few output buffers and capture buffers +are even needed for continuous playback. This also helps +alleviate memory pressure when decoding 4K media. +--- + libavcodec/v4l2_m2m.h | 2 +- + libavcodec/v4l2_m2m_dec.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 61cb919771..feeb162812 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -38,7 +38,7 @@ + + #define V4L_M2M_DEFAULT_OPTS \ + { "num_output_buffers", "Number of buffers in the output context",\ +- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS } ++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS } + + typedef struct V4L2m2mContext { + char devname[PATH_MAX]; +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index c6b865fde8..b9725be377 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -262,7 +262,7 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) + static const AVOption options[] = { + V4L_M2M_DEFAULT_OPTS, + { "num_capture_buffers", "Number of buffers in the capture context", +- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS }, ++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, + { NULL}, + }; + + +From ffc4419f456c00ab71cf93f792b0473c6de14e64 Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Wed, 6 May 2020 11:12:58 -0700 +Subject: [PATCH 10/11] libavcodec: v4l2m2m: add option to specify pixel format + used by the decoder + +--- + libavcodec/v4l2_context.c | 9 +++++++++ + libavcodec/v4l2_m2m.h | 2 ++ + libavcodec/v4l2_m2m_dec.c | 1 + + 3 files changed, 12 insertions(+) + +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 7a92df2c3e..fa2deae888 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -531,6 +531,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm + + static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + { ++ V4L2m2mContext* s = ctx_to_m2mctx(ctx); ++ V4L2m2mPriv *priv = s->avctx->priv_data; + enum AVPixelFormat pixfmt = ctx->av_pix_fmt; + struct v4l2_fmtdesc fdesc; + int ret; +@@ -549,6 +551,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + if (ret) + return AVERROR(EINVAL); + ++ if (priv->pix_fmt != AV_PIX_FMT_NONE) { ++ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { ++ fdesc.index++; ++ continue; ++ } ++ } ++ + pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); + ret = v4l2_try_raw_format(ctx, pixfmt); + if (ret){ +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index feeb162812..0e88bf9329 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -30,6 +30,7 @@ + 
#include + + #include "libavcodec/avcodec.h" ++#include "libavutil/pixfmt.h" + #include "v4l2_context.h" + + #define container_of(ptr, type, member) ({ \ +@@ -78,6 +79,7 @@ typedef struct V4L2m2mPriv { + + int num_output_buffers; + int num_capture_buffers; ++ enum AVPixelFormat pix_fmt; + } V4L2m2mPriv; + + /** +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index b9725be377..6109deee8a 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -263,6 +263,7 @@ static const AVOption options[] = { + V4L_M2M_DEFAULT_OPTS, + { "num_capture_buffers", "Number of buffers in the capture context", + OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS }, ++ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, + { NULL}, + }; + + +From 8595d06d4909bbec0aa14625fcfc869c6bcef696 Mon Sep 17 00:00:00 2001 +From: Lukas Rusak +Date: Mon, 24 Sep 2018 13:39:56 -0700 +Subject: [PATCH 11/11] libavcodec: v4l2m2m: implement flush + +--- + libavcodec/v4l2_m2m_dec.c | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 6109deee8a..820cdf241f 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -256,6 +256,41 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) + return ff_v4l2_m2m_codec_end(avctx->priv_data); + } + ++static void v4l2_decode_flush(AVCodecContext *avctx) ++{ ++ V4L2m2mPriv *priv = avctx->priv_data; ++ V4L2m2mContext* s = priv->context; ++ V4L2Context* output = &s->output; ++ V4L2Context* capture = &s->capture; ++ int ret, i; ++ ++ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); ++ ++ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON %s error: %d\n", output->name, ret); ++ ++ for (i = 0; i < output->num_buffers; i++) { ++ if (output->buffers[i].status == V4L2BUF_IN_DRIVER) ++ output->buffers[i].status = V4L2BUF_AVAILABLE; ++ } ++ ++ struct v4l2_decoder_cmd cmd = { ++ .cmd = V4L2_DEC_CMD_START, ++ .flags = 0, ++ }; ++ ++ ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); ++ ++ s->draining = 0; ++ output->done = 0; ++ capture->done = 0; ++} ++ + #define OFFSET(x) offsetof(V4L2m2mPriv, x) + #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM + +@@ -292,6 +327,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { + .init = v4l2_decode_init, \ + .receive_frame = v4l2_receive_frame, \ + .close = v4l2_decode_close, \ ++ .flush = v4l2_decode_flush, \ + .bsfs = bsf_name, \ + .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ + .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ From 807b70364aa6c205f0c4c1a37af5395a633543c0 Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 9 Jul 2020 13:24:25 +0200 Subject: [PATCH 05/10] ffmpeg: create v4l2-request patch Patch created using revisions f6ae50a..3e95632 from branch v4l2-request-hwaccel-4.3 of https://github.com/Kwiboo/FFmpeg Signed-off-by: Matthias Reichl --- .../ffmpeg-001-v4l2-request.patch | 5202 +++++++++++++++++ 1 file changed, 5202 insertions(+) create mode 100644 
packages/multimedia/ffmpeg/patches/v4l2-request/ffmpeg-001-v4l2-request.patch diff --git a/packages/multimedia/ffmpeg/patches/v4l2-request/ffmpeg-001-v4l2-request.patch b/packages/multimedia/ffmpeg/patches/v4l2-request/ffmpeg-001-v4l2-request.patch new file mode 100644 index 0000000000..7ef133994f --- /dev/null +++ b/packages/multimedia/ffmpeg/patches/v4l2-request/ffmpeg-001-v4l2-request.patch @@ -0,0 +1,5202 @@ +From df0e167bdb7db4cc2340ab831e6961a1108a753c Mon Sep 17 00:00:00 2001 +From: Jonas Karlman +Date: Mon, 3 Dec 2018 23:48:04 +0100 +Subject: [PATCH 01/18] avutil: add av_buffer_pool_flush() + +Signed-off-by: Jonas Karlman +--- + libavutil/buffer.c | 13 +++++++++++++ + libavutil/buffer.h | 5 +++++ + 2 files changed, 18 insertions(+) + +diff --git a/libavutil/buffer.c b/libavutil/buffer.c +index 38a554208a..b0fedabc3e 100644 +--- a/libavutil/buffer.c ++++ b/libavutil/buffer.c +@@ -273,6 +273,19 @@ static void buffer_pool_free(AVBufferPool *pool) + av_freep(&pool); + } + ++void av_buffer_pool_flush(AVBufferPool *pool) ++{ ++ ff_mutex_lock(&pool->mutex); ++ while (pool->pool) { ++ BufferPoolEntry *buf = pool->pool; ++ pool->pool = buf->next; ++ ++ buf->free(buf->opaque, buf->data); ++ av_freep(&buf); ++ } ++ ff_mutex_unlock(&pool->mutex); ++} ++ + void av_buffer_pool_uninit(AVBufferPool **ppool) + { + AVBufferPool *pool; +diff --git a/libavutil/buffer.h b/libavutil/buffer.h +index c0f3f6cc9a..998beec9ac 100644 +--- a/libavutil/buffer.h ++++ b/libavutil/buffer.h +@@ -267,6 +267,11 @@ AVBufferPool *av_buffer_pool_init2(int size, void *opaque, + AVBufferRef* (*alloc)(void *opaque, int size), + void (*pool_free)(void *opaque)); + ++/** ++ * Free all available buffers in a buffer pool. ++ */ ++ void av_buffer_pool_flush(AVBufferPool *pool); ++ + /** + * Mark the pool as being available for freeing. It will actually be freed only + * once all the allocated buffers associated with the pool are released. 
Thus it + +From cdb4dbed4b223b9a21287cfcc594af99e7aa3990 Mon Sep 17 00:00:00 2001 +From: Jonas Karlman +Date: Sat, 15 Dec 2018 22:32:16 +0100 +Subject: [PATCH 02/18] Add common V4L2 request API code + +Signed-off-by: Jonas Karlman +--- + configure | 12 + + libavcodec/Makefile | 1 + + libavcodec/hwconfig.h | 2 + + libavcodec/v4l2_request.c | 984 ++++++++++++++++++++++++++++++++++++++ + libavcodec/v4l2_request.h | 77 +++ + 5 files changed, 1076 insertions(+) + create mode 100644 libavcodec/v4l2_request.c + create mode 100644 libavcodec/v4l2_request.h + +diff --git a/configure b/configure +index 8569a60bf8..9f9909a236 100755 +--- a/configure ++++ b/configure +@@ -274,6 +274,7 @@ External library support: + --enable-libtls enable LibreSSL (via libtls), needed for https support + if openssl, gnutls or mbedtls is not used [no] + --enable-libtwolame enable MP2 encoding via libtwolame [no] ++ --enable-libudev enable libudev [no] + --enable-libv4l2 enable libv4l2/v4l-utils [no] + --enable-libvidstab enable video stabilization using vid.stab [no] + --enable-libvmaf enable vmaf filter via libvmaf [no] +@@ -342,6 +343,7 @@ External library support: + --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] + --enable-rkmpp enable Rockchip Media Process Platform code [no] + --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] ++ --enable-v4l2-request enable V4L2 request API code [no] + --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] + --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] + --disable-videotoolbox disable VideoToolbox code [autodetect] +@@ -1807,6 +1809,7 @@ EXTERNAL_LIBRARY_LIST=" + libtesseract + libtheora + libtwolame ++ libudev + libv4l2 + libvorbis + libvpx +@@ -1861,6 +1864,7 @@ HWACCEL_LIBRARY_LIST=" + mmal + omx + opencl ++ v4l2_request + vulkan + " + +@@ -2903,6 +2907,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" + dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" + ffnvcodec_deps_any="libdl LoadLibrary" + nvdec_deps="ffnvcodec" ++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" + vaapi_x11_deps="xlib" + videotoolbox_hwaccel_deps="videotoolbox pthreads" + videotoolbox_hwaccel_extralibs="-framework QuartzCore" +@@ -6376,6 +6381,7 @@ enabled libtls && require_pkg_config libtls libtls tls.h tls_configur + enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame && + { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || + die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } ++enabled libudev && require_pkg_config libudev libudev libudev.h udev_new + enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl + enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit + enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 1.3.9" libvmaf.h compute_vmaf +@@ -6475,6 +6481,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r + { enabled libdrm || + die "ERROR: rkmpp requires --enable-libdrm"; } + } ++enabled v4l2_request && { enabled libdrm || ++ die "ERROR: v4l2-request requires --enable-libdrm"; } && ++ { enabled libudev || ++ die "ERROR: v4l2-request requires --enable-libudev"; } + enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init + + +@@ -6556,6 +6566,8 @@ if enabled v4l2_m2m; then + check_cc vp9_v4l2_m2m 
linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" + fi + ++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns ++ + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 5a6ea59715..d742205168 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -153,6 +153,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o + OBJS-$(CONFIG_VP56DSP) += vp56dsp.o + OBJS-$(CONFIG_VP8DSP) += vp8dsp.o + OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o ++OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_request.o + OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o + OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o + +diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h +index f421dc909f..ee78d8ab8e 100644 +--- a/libavcodec/hwconfig.h ++++ b/libavcodec/hwconfig.h +@@ -80,6 +80,8 @@ typedef struct AVCodecHWConfigInternal { + HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel) + #define HWACCEL_XVMC(codec) \ + HW_CONFIG_HWACCEL(0, 0, 1, XVMC, NONE, ff_ ## codec ## _xvmc_hwaccel) ++#define HWACCEL_V4L2REQUEST(codec) \ ++ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) + + #define HW_CONFIG_ENCODER(device, frames, ad_hoc, format, device_type_) \ + &(const AVCodecHWConfigInternal) { \ +diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c +new file mode 100644 +index 0000000000..7d97468153 +--- /dev/null ++++ b/libavcodec/v4l2_request.c +@@ -0,0 +1,984 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "decode.h" ++#include "internal.h" ++#include "v4l2_request.h" ++ ++uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ return req ? 
v4l2_timeval_to_ns(&req->capture.buffer.timestamp) : 0; ++} ++ ++int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ memset(&req->drm, 0, sizeof(AVDRMFrameDescriptor)); ++ req->output.used = 0; ++ return 0; ++} ++ ++int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ if (req->output.used + size + (AV_INPUT_BUFFER_PADDING_SIZE * 4) <= req->output.size) { ++ memcpy(req->output.addr + req->output.used, data, size); ++ req->output.used += size; ++ } else { ++ av_log(avctx, AV_LOG_ERROR, "%s: output.used=%u output.size=%u size=%u\n", __func__, req->output.used, req->output.size, size); ++ } ++ return 0; ++} ++ ++static int v4l2_request_controls(V4L2RequestContext *ctx, int request_fd, unsigned long type, struct v4l2_ext_control *control, int count) ++{ ++ struct v4l2_ext_controls controls = { ++ .controls = control, ++ .count = count, ++ .request_fd = request_fd, ++ .which = (request_fd >= 0) ? V4L2_CTRL_WHICH_REQUEST_VAL : 0, ++ }; ++ ++ if (!control || !count) ++ return 0; ++ ++ return ioctl(ctx->video_fd, type, &controls); ++} ++ ++static int v4l2_request_set_controls(V4L2RequestContext *ctx, int request_fd, struct v4l2_ext_control *control, int count) ++{ ++ return v4l2_request_controls(ctx, request_fd, VIDIOC_S_EXT_CTRLS, control, count); ++} ++ ++int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ ret = v4l2_request_controls(ctx, -1, VIDIOC_S_EXT_CTRLS, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return ret; ++} ++ ++int ff_v4l2_request_get_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ ret = v4l2_request_controls(ctx, -1, VIDIOC_G_EXT_CTRLS, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return ret; ++} ++ ++int ff_v4l2_request_query_control(AVCodecContext *avctx, struct v4l2_query_ext_ctrl *control) ++{ ++ int ret; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERY_EXT_CTRL, control); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: query control failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id) ++{ ++ int ret; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_queryctrl control = { ++ .id = id, ++ }; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCTRL, &control); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: query control failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return control.default_value; ++} ++ ++static int v4l2_request_queue_buffer(V4L2RequestContext *ctx, int request_fd, V4L2RequestBuffer *buf, uint32_t flags) ++{ ++ struct v4l2_plane planes[1] = {}; ++ struct v4l2_buffer buffer = { ++ .type = buf->buffer.type, ++ .memory = buf->buffer.memory, 
++ .index = buf->index, ++ .timestamp.tv_usec = ctx->timestamp, ++ .bytesused = buf->used, ++ .request_fd = request_fd, ++ .flags = ((request_fd >= 0) ? V4L2_BUF_FLAG_REQUEST_FD : 0) | flags, ++ }; ++ ++ buf->buffer.timestamp = buffer.timestamp; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { ++ planes[0].bytesused = buf->used; ++ buffer.bytesused = 0; ++ buffer.length = 1; ++ buffer.m.planes = planes; ++ } ++ ++ return ioctl(ctx->video_fd, VIDIOC_QBUF, &buffer); ++} ++ ++static int v4l2_request_dequeue_buffer(V4L2RequestContext *ctx, V4L2RequestBuffer *buf) ++{ ++ int ret; ++ struct v4l2_plane planes[1] = {}; ++ struct v4l2_buffer buffer = { ++ .type = buf->buffer.type, ++ .memory = buf->buffer.memory, ++ .index = buf->index, ++ }; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { ++ buffer.length = 1; ++ buffer.m.planes = planes; ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_DQBUF, &buffer); ++ if (ret < 0) ++ return ret; ++ ++ buf->buffer.timestamp = buffer.timestamp; ++ return 0; ++} ++ ++const uint32_t v4l2_request_capture_pixelformats[] = { ++ V4L2_PIX_FMT_NV12, ++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED ++ V4L2_PIX_FMT_SUNXI_TILED_NV12, ++#endif ++}; ++ ++static int v4l2_request_set_drm_descriptor(V4L2RequestDescriptor *req, struct v4l2_format *format) ++{ ++ AVDRMFrameDescriptor *desc = &req->drm; ++ AVDRMLayerDescriptor *layer = &desc->layers[0]; ++ uint32_t pixelformat = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat; ++ ++ switch (pixelformat) { ++ case V4L2_PIX_FMT_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED ++ case V4L2_PIX_FMT_SUNXI_TILED_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; ++ break; ++#endif ++ default: ++ return -1; ++ } ++ ++ desc->nb_objects = 1; ++ desc->objects[0].fd = req->capture.fd; ++ desc->objects[0].size = req->capture.size; ++ ++ desc->nb_layers = 1; ++ layer->nb_planes = 2; ++ ++ layer->planes[0].object_index = 0; ++ layer->planes[0].offset = 0; ++ layer->planes[0].pitch = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.plane_fmt[0].bytesperline : format->fmt.pix.bytesperline; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = layer->planes[0].pitch * (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? 
format->fmt.pix_mp.height : format->fmt.pix.height); ++ layer->planes[1].pitch = layer->planes[0].pitch; ++ ++ return 0; ++} ++ ++static int v4l2_request_queue_decode(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ struct timeval tv = { 2, 0 }; ++ fd_set except_fds; ++ int ret; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p used=%u controls=%d index=%d fd=%d request_fd=%d first_slice=%d last_slice=%d\n", __func__, avctx, req->output.used, count, req->capture.index, req->capture.fd, req->request_fd, first_slice, last_slice); ++ ++ if (first_slice) ++ ctx->timestamp++; ++ ++ ret = v4l2_request_set_controls(ctx, req->request_fd, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed for request %d, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ memset(req->output.addr + req->output.used, 0, AV_INPUT_BUFFER_PADDING_SIZE * 4); ++ ++ ret = v4l2_request_queue_buffer(ctx, req->request_fd, &req->output, last_slice ? 0 : V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ if (first_slice) { ++ ret = v4l2_request_queue_buffer(ctx, -1, &req->capture, 0); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ } ++ ++ // NOTE: do we need to dequeue when request fails/timeout? ++ ++ // 4. 
queue request and wait ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_QUEUE, NULL); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ goto fail; ++ } ++ ++ FD_ZERO(&except_fds); ++ FD_SET(req->request_fd, &except_fds); ++ ++ ret = select(req->request_fd + 1, NULL, NULL, &except_fds, &tv); ++ if (ret == 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: request %d timeout\n", __func__, req->request_fd); ++ goto fail; ++ } else if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: select request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ goto fail; ++ } ++ ++ ret = v4l2_request_dequeue_buffer(ctx, &req->output); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ if (last_slice) { ++ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ } ++ ++ // TODO: check errors ++ // buffer.flags & V4L2_BUF_FLAG_ERROR ++ ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ if (last_slice) ++ return v4l2_request_set_drm_descriptor(req, &ctx->format); ++ ++ return 0; ++ ++fail: ++ ret = v4l2_request_dequeue_buffer(ctx, &req->output); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ ++ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ ++ return -1; ++} ++ ++int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ ++ // fall back to queue each slice as a full frame ++ if ((req->output.capabilities & V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) != V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) ++ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); ++ ++ return v4l2_request_queue_decode(avctx, frame, control, count, first_slice, last_slice); ++} ++ ++int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count) ++{ ++ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); ++} ++ ++static int v4l2_request_try_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = 0, ++ .type = type, ++ }; ++ ++ if (V4L2_TYPE_IS_OUTPUT(type)) { ++ struct v4l2_create_buffers buffers = { ++ .count = 0, ++ .memory = V4L2_MEMORY_MMAP, ++ .format.type = type, 
++ }; ++ ++ if (ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); ++ return -1; ++ } ++ ++ if ((buffers.capabilities & V4L2_BUF_CAP_SUPPORTS_REQUESTS) != V4L2_BUF_CAP_SUPPORTS_REQUESTS) { ++ av_log(avctx, AV_LOG_INFO, "%s: output buffer type do not support requests, capabilities %u\n", __func__, buffers.capabilities); ++ return -1; ++ } ++ } ++ ++ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { ++ if (fmtdesc.pixelformat == pixelformat) ++ return 0; ++ ++ fmtdesc.index++; ++ } ++ ++ av_log(avctx, AV_LOG_INFO, "%s: pixelformat %u not supported for type %u\n", __func__, pixelformat, type); ++ return -1; ++} ++ ++static int v4l2_request_set_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat, uint32_t buffersize) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_format format = { ++ .type = type, ++ }; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { ++ format.fmt.pix_mp.width = avctx->coded_width; ++ format.fmt.pix_mp.height = avctx->coded_height; ++ format.fmt.pix_mp.pixelformat = pixelformat; ++ format.fmt.pix_mp.plane_fmt[0].sizeimage = buffersize; ++ format.fmt.pix_mp.num_planes = 1; ++ } else { ++ format.fmt.pix.width = avctx->coded_width; ++ format.fmt.pix.height = avctx->coded_height; ++ format.fmt.pix.pixelformat = pixelformat; ++ format.fmt.pix.sizeimage = buffersize; ++ } ++ ++ return ioctl(ctx->video_fd, VIDIOC_S_FMT, &format); ++} ++ ++static int v4l2_request_select_capture_format(AVCodecContext *avctx) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ enum v4l2_buf_type type = ctx->format.type; ++ ++#if 0 ++ struct v4l2_format format = { ++ .type = type, ++ }; ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = 0, ++ .type = type, ++ }; ++ uint32_t pixelformat; ++ int i; ++ ++ if (ioctl(ctx->video_fd, VIDIOC_G_FMT, &format) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return -1; ++ } ++ ++ pixelformat = V4L2_TYPE_IS_MULTIPLANAR(type) ? 
format.fmt.pix_mp.pixelformat : format.fmt.pix.pixelformat; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ if (pixelformat == v4l2_request_capture_pixelformats[i]) ++ return v4l2_request_set_format(avctx, type, pixelformat, 0); ++ } ++ ++ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { ++ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ if (fmtdesc.pixelformat == v4l2_request_capture_pixelformats[i]) ++ return v4l2_request_set_format(avctx, type, fmtdesc.pixelformat, 0); ++ } ++ ++ fmtdesc.index++; ++ } ++#else ++ for (int i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ uint32_t pixelformat = v4l2_request_capture_pixelformats[i]; ++ if (!v4l2_request_try_format(avctx, type, pixelformat)) ++ return v4l2_request_set_format(avctx, type, pixelformat, 0); ++ } ++#endif ++ ++ return -1; ++} ++ ++static int v4l2_request_probe_video_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret = AVERROR(EINVAL); ++ struct v4l2_capability capability = {0}; ++ unsigned int capabilities = 0; ++ ++ const char *path = udev_device_get_devnode(device); ++ if (!path) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get video device devnode failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ctx->video_fd = open(path, O_RDWR | O_NONBLOCK, 0); ++ if (ctx->video_fd < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCAP, &capability); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get video capability failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if (capability.capabilities & V4L2_CAP_DEVICE_CAPS) ++ capabilities = capability.device_caps; ++ else ++ capabilities = capability.capabilities; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s capabilities=%u\n", __func__, avctx, ctx, path, capabilities); ++ ++ if ((capabilities & V4L2_CAP_STREAMING) != V4L2_CAP_STREAMING) { ++ av_log(avctx, AV_LOG_ERROR, "%s: missing required streaming capability\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) == V4L2_CAP_VIDEO_M2M_MPLANE) { ++ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ } else if ((capabilities & V4L2_CAP_VIDEO_M2M) == V4L2_CAP_VIDEO_M2M) { ++ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ } else { ++ av_log(avctx, AV_LOG_ERROR, "%s: missing required mem2mem capability\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_try_format(avctx, ctx->output_type, pixelformat); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_WARNING, "%s: try output format failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_set_controls(ctx, -1, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_set_format(avctx, ctx->output_type, pixelformat, buffersize); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set output format 
failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_select_capture_format(avctx); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_WARNING, "%s: select capture format failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ if (ctx->video_fd >= 0) { ++ close(ctx->video_fd); ++ ctx->video_fd = -1; ++ } ++ return ret; ++} ++ ++static int v4l2_request_init_context(AVCodecContext *avctx) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &ctx->format); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { ++ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, ctx->format.fmt.pix_mp.pixelformat, ctx->format.fmt.pix_mp.width, ctx->format.fmt.pix_mp.height, ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline, ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage, ctx->format.fmt.pix_mp.num_planes); ++ } else { ++ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, ctx->format.fmt.pix.pixelformat, ctx->format.fmt.pix.width, ctx->format.fmt.pix.height, ctx->format.fmt.pix.bytesperline, ctx->format.fmt.pix.sizeimage); ++ } ++ ++ ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM); ++ if (ret < 0) ++ goto fail; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->output_type); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: output stream on failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->format.type); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: capture stream on failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ ff_v4l2_request_uninit(avctx); ++ return ret; ++} ++ ++static int v4l2_request_probe_media_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ struct media_device_info device_info = {0}; ++ struct media_v2_topology topology = {0}; ++ struct media_v2_interface *interfaces = NULL; ++ struct udev *udev = udev_device_get_udev(device); ++ struct udev_device *video_device; ++ dev_t devnum; ++ ++ const char *path = udev_device_get_devnode(device); ++ if (!path) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media device devnode failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ctx->media_fd = open(path, O_RDWR, 0); ++ if (ctx->media_fd < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media device info failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s driver=%s\n", __func__, avctx, ctx, path, device_info.driver); ++ ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); ++ 
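/* with ptr_interfaces left unset, this first call only fills in the counts;
++     * a second G_TOPOLOGY call below fetches the actual interface array */
++    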
if (ret < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno);
++        ret = AVERROR(EINVAL);
++        goto fail;
++    }
++
++    if (topology.num_interfaces <= 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: media device has no interfaces\n", __func__);
++        ret = AVERROR(EINVAL);
++        goto fail;
++    }
++
++    interfaces = av_mallocz(topology.num_interfaces * sizeof(struct media_v2_interface));
++    if (!interfaces) {
++        av_log(avctx, AV_LOG_ERROR, "%s: allocating media interface struct failed\n", __func__);
++        ret = AVERROR(ENOMEM);
++        goto fail;
++    }
++
++    topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
++    ret = ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
++    if (ret < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno);
++        ret = AVERROR(EINVAL);
++        goto fail;
++    }
++
++    ret = AVERROR(EINVAL);
++    for (int i = 0; i < topology.num_interfaces; i++) {
++        if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
++            continue;
++
++        devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
++        video_device = udev_device_new_from_devnum(udev, 'c', devnum);
++        if (!video_device) {
++            av_log(avctx, AV_LOG_ERROR, "%s: creating udev device for video interface failed\n", __func__);
++            continue;
++        }
++
++        ret = v4l2_request_probe_video_device(video_device, avctx, pixelformat, buffersize, control, count);
++        udev_device_unref(video_device);
++
++        if (!ret)
++            break;
++    }
++
++    av_freep(&interfaces);
++    return ret;
++
++fail:
++    av_freep(&interfaces);
++    if (ctx->media_fd >= 0) {
++        close(ctx->media_fd);
++        ctx->media_fd = -1;
++    }
++    return ret;
++}
++
++int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count)
++{
++    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
++    int ret = AVERROR(EINVAL);
++    struct udev *udev;
++    struct udev_enumerate *enumerate;
++    struct udev_list_entry *devices;
++    struct udev_list_entry *entry;
++    struct udev_device *device;
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p hw_device_ctx=%p hw_frames_ctx=%p\n", __func__, avctx, avctx->hw_device_ctx, avctx->hw_frames_ctx);
++
++    ctx->media_fd = -1;
++    ctx->video_fd = -1;
++    ctx->timestamp = 0;
++
++    udev = udev_new();
++    if (!udev) {
++        av_log(avctx, AV_LOG_ERROR, "%s: allocating udev context failed\n", __func__);
++        ret = AVERROR(ENOMEM);
++        goto fail;
++    }
++
++    enumerate = udev_enumerate_new(udev);
++    if (!enumerate) {
++        av_log(avctx, AV_LOG_ERROR, "%s: allocating udev enumerator failed\n", __func__);
++        ret = AVERROR(ENOMEM);
++        goto fail;
++    }
++
++    udev_enumerate_add_match_subsystem(enumerate, "media");
++    udev_enumerate_scan_devices(enumerate);
++
++    devices = udev_enumerate_get_list_entry(enumerate);
++    udev_list_entry_foreach(entry, devices) {
++        const char *path = udev_list_entry_get_name(entry);
++        if (!path)
++            continue;
++
++        device = udev_device_new_from_syspath(udev, path);
++        if (!device)
++            continue;
++
++        ret = v4l2_request_probe_media_device(device, avctx, pixelformat, buffersize, control, count);
++        udev_device_unref(device);
++
++        if (!ret)
++            break;
++    }
++
++    udev_enumerate_unref(enumerate);
++
++    if (!ret)
++        ret = v4l2_request_init_context(avctx);
++
++fail:
++    udev_unref(udev);
++    return ret;
++}
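++
++/* Stop streaming on both queues, flush any frames still held by the
++ * hwframes pool, then close the video and media device nodes. */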
++int ff_v4l2_request_uninit(AVCodecContext *avctx)
++{
++    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
++    int ret;
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p\n", __func__, avctx, ctx);
++
++    if (ctx->video_fd >= 0) {
++        ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->output_type);
++        if (ret < 0)
++            av_log(avctx, AV_LOG_ERROR, "%s: output stream off failed, %s (%d)\n", __func__, strerror(errno), errno);
++
++        ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->format.type);
++        if (ret < 0)
++            av_log(avctx, AV_LOG_ERROR, "%s: capture stream off failed, %s (%d)\n", __func__, strerror(errno), errno);
++    }
++
++    if (avctx->hw_frames_ctx) {
++        AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
++        av_buffer_pool_flush(hwfc->pool);
++    }
++
++    if (ctx->video_fd >= 0)
++        close(ctx->video_fd);
++
++    if (ctx->media_fd >= 0)
++        close(ctx->media_fd);
++
++    return 0;
++}
++
++static int v4l2_request_buffer_alloc(AVCodecContext *avctx, V4L2RequestBuffer *buf, enum v4l2_buf_type type)
++{
++    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
++    int ret;
++    struct v4l2_plane planes[1] = {};
++    struct v4l2_create_buffers buffers = {
++        .count = 1,
++        .memory = V4L2_MEMORY_MMAP,
++        .format.type = type,
++    };
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p buf=%p type=%u\n", __func__, avctx, buf, type);
++
++    ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &buffers.format);
++    if (ret < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: get format failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno);
++        return ret;
++    }
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(buffers.format.type)) {
++        av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, buffers.format.fmt.pix_mp.pixelformat, buffers.format.fmt.pix_mp.width, buffers.format.fmt.pix_mp.height, buffers.format.fmt.pix_mp.plane_fmt[0].bytesperline, buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage, buffers.format.fmt.pix_mp.num_planes);
++    } else {
++        av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, buffers.format.fmt.pix.pixelformat, buffers.format.fmt.pix.width, buffers.format.fmt.pix.height, buffers.format.fmt.pix.bytesperline, buffers.format.fmt.pix.sizeimage);
++    }
++
++    ret = ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers);
++    if (ret < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno);
++        return ret;
++    }
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
++        buf->width = buffers.format.fmt.pix_mp.width;
++        buf->height = buffers.format.fmt.pix_mp.height;
++        buf->size = buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage;
++        buf->buffer.length = 1;
++        buf->buffer.m.planes = planes;
++    } else {
++        buf->width = buffers.format.fmt.pix.width;
++        buf->height = buffers.format.fmt.pix.height;
++        buf->size = buffers.format.fmt.pix.sizeimage;
++    }
++
++    buf->index = buffers.index;
++    buf->capabilities = buffers.capabilities;
++    buf->used = 0;
++
++    buf->buffer.type = type;
++    buf->buffer.memory = V4L2_MEMORY_MMAP;
++    buf->buffer.index = buf->index;
++
++    ret = ioctl(ctx->video_fd, VIDIOC_QUERYBUF, &buf->buffer);
++    if (ret < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: query buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno);
++        return ret;
++    }
++
++    if (V4L2_TYPE_IS_OUTPUT(type)) {
++        void *addr = mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->video_fd, V4L2_TYPE_IS_MULTIPLANAR(type) ?
buf->buffer.m.planes[0].m.mem_offset : buf->buffer.m.offset); ++ if (addr == MAP_FAILED) { ++ av_log(avctx, AV_LOG_ERROR, "%s: mmap failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return -1; ++ } ++ ++ buf->addr = (uint8_t*)addr; ++ } else { ++ struct v4l2_exportbuffer exportbuffer = { ++ .type = type, ++ .index = buf->index, ++ .flags = O_RDONLY, ++ }; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_EXPBUF, &exportbuffer); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: export buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno); ++ return ret; ++ } ++ ++ buf->fd = exportbuffer.fd; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); ++ return 0; ++} ++ ++static void v4l2_request_buffer_free(V4L2RequestBuffer *buf) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); ++ ++ if (buf->addr) ++ munmap(buf->addr, buf->size); ++ ++ if (buf->fd >= 0) ++ close(buf->fd); ++} ++ ++static void v4l2_request_frame_free(void *opaque, uint8_t *data) ++{ ++ AVCodecContext *avctx = opaque; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)data; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p request_fd=%d\n", __func__, avctx, data, req->request_fd); ++ ++ if (req->request_fd >= 0) ++ close(req->request_fd); ++ ++ v4l2_request_buffer_free(&req->capture); ++ v4l2_request_buffer_free(&req->output); ++ ++ av_free(data); ++} ++ ++static AVBufferRef *v4l2_request_frame_alloc(void *opaque, int size) ++{ ++ AVCodecContext *avctx = opaque; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req; ++ AVBufferRef *ref; ++ uint8_t *data; ++ int ret; ++ ++ data = av_mallocz(size); ++ if (!data) ++ return NULL; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); ++ ++ ref = av_buffer_create(data, size, v4l2_request_frame_free, avctx, 0); ++ if (!ref) { ++ av_freep(&data); ++ return NULL; ++ } ++ ++ req = (V4L2RequestDescriptor*)data; ++ req->request_fd = -1; ++ req->output.fd = -1; ++ req->capture.fd = -1; ++ ++ ret = v4l2_request_buffer_alloc(avctx, &req->output, ctx->output_type); ++ if (ret < 0) { ++ av_buffer_unref(&ref); ++ return NULL; ++ } ++ ++ ret = v4l2_request_buffer_alloc(avctx, &req->capture, ctx->format.type); ++ if (ret < 0) { ++ av_buffer_unref(&ref); ++ return NULL; ++ } ++ ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_REQUEST_ALLOC, &req->request_fd); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: request alloc failed, %s (%d)\n", __func__, strerror(errno), errno); ++ av_buffer_unref(&ref); ++ return NULL; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p request_fd=%d\n", __func__, avctx, size, data, req->request_fd); ++ return ref; ++} ++ ++static void v4l2_request_pool_free(void *opaque) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); ++} ++ ++static void v4l2_request_hwframe_ctx_free(AVHWFramesContext *hwfc) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); ++ ++ av_buffer_pool_flush(hwfc->pool); ++ av_buffer_pool_uninit(&hwfc->pool); ++} ++ ++int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ AVHWFramesContext *hwfc = 
(AVHWFramesContext*)hw_frames_ctx->data; ++ ++ hwfc->format = AV_PIX_FMT_DRM_PRIME; ++ hwfc->sw_format = AV_PIX_FMT_NV12; ++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { ++ hwfc->width = ctx->format.fmt.pix_mp.width; ++ hwfc->height = ctx->format.fmt.pix_mp.height; ++ } else { ++ hwfc->width = ctx->format.fmt.pix.width; ++ hwfc->height = ctx->format.fmt.pix.height; ++ } ++ ++ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2RequestDescriptor), avctx, v4l2_request_frame_alloc, v4l2_request_pool_free); ++ if (!hwfc->pool) ++ return AVERROR(ENOMEM); ++ ++ hwfc->free = v4l2_request_hwframe_ctx_free; ++ ++ hwfc->initial_pool_size = 1; ++ ++ switch (avctx->codec_id) { ++ case AV_CODEC_ID_VP9: ++ hwfc->initial_pool_size += 8; ++ break; ++ case AV_CODEC_ID_VP8: ++ hwfc->initial_pool_size += 3; ++ break; ++ default: ++ hwfc->initial_pool_size += 2; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); ++ ++ return 0; ++} +diff --git a/libavcodec/v4l2_request.h b/libavcodec/v4l2_request.h +new file mode 100644 +index 0000000000..58d2aa70af +--- /dev/null ++++ b/libavcodec/v4l2_request.h +@@ -0,0 +1,77 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_V4L2_REQUEST_H
++#define AVCODEC_V4L2_REQUEST_H
++
++#include <linux/videodev2.h>
++
++#include "libavutil/hwcontext_drm.h"
++
++typedef struct V4L2RequestContext {
++    int video_fd;
++    int media_fd;
++    enum v4l2_buf_type output_type;
++    struct v4l2_format format;
++    int timestamp;
++} V4L2RequestContext;
++
++typedef struct V4L2RequestBuffer {
++    int index;
++    int fd;
++    uint8_t *addr;
++    uint32_t width;
++    uint32_t height;
++    uint32_t size;
++    uint32_t used;
++    uint32_t capabilities;
++    struct v4l2_buffer buffer;
++} V4L2RequestBuffer;
++
++typedef struct V4L2RequestDescriptor {
++    AVDRMFrameDescriptor drm;
++    int request_fd;
++    V4L2RequestBuffer output;
++    V4L2RequestBuffer capture;
++} V4L2RequestDescriptor;
++
++uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame);
++
++int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame);
++
++int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size);
++
++int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count);
++
++int ff_v4l2_request_get_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count);
++
++int ff_v4l2_request_query_control(AVCodecContext *avctx, struct v4l2_query_ext_ctrl *control);
++
++int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id);
++
++int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice);
++
++int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count);
++
++int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count);
++
++int ff_v4l2_request_uninit(AVCodecContext *avctx);
++
++int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++
++#endif /* AVCODEC_V4L2_REQUEST_H */
+
+From ac7d0ac7775219d769deca3fb34c92b25411c947 Mon Sep 17 00:00:00 2001
+From: Jonas Karlman
+Date: Sat, 15 Dec 2018 22:32:16 +0100
+Subject: [PATCH 03/18] Add V4L2 request API mpeg2 hwaccel
+
+Signed-off-by: Jonas Karlman
+---
+ configure                       |   3 +
+ libavcodec/Makefile             |   1 +
+ libavcodec/hwaccels.h           |   1 +
+ libavcodec/mpeg12dec.c          |   6 ++
+ libavcodec/v4l2_request_mpeg2.c | 154 ++++++++++++++++++++++++++++++++
+ 5 files changed, 165 insertions(+)
+ create mode 100644 libavcodec/v4l2_request_mpeg2.c
+
+diff --git a/configure b/configure
+index 9f9909a236..6b157d6d3e 100755
+--- a/configure
++++ b/configure
+@@ -2967,6 +2967,8 @@ mpeg2_dxva2_hwaccel_deps="dxva2"
+ mpeg2_dxva2_hwaccel_select="mpeg2video_decoder"
+ mpeg2_nvdec_hwaccel_deps="nvdec"
+ mpeg2_nvdec_hwaccel_select="mpeg2video_decoder"
++mpeg2_v4l2request_hwaccel_deps="v4l2_request mpeg2_v4l2_request"
++mpeg2_v4l2request_hwaccel_select="mpeg2video_decoder"
+ mpeg2_vaapi_hwaccel_deps="vaapi"
+ mpeg2_vaapi_hwaccel_select="mpeg2video_decoder"
+ mpeg2_vdpau_hwaccel_deps="vdpau"
+@@ -6567,6 +6569,7 @@ if enabled v4l2_m2m; then
+ fi
+
+ check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
++check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;"
+
+ check_headers sys/videoio.h
+ test_code cc sys/videoio.h
"struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index d742205168..8963bd3e91 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -922,6 +922,7 @@ OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL) += dxva2_mpeg2.o + OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o + OBJS-$(CONFIG_MPEG2_NVDEC_HWACCEL) += nvdec_mpeg12.o + OBJS-$(CONFIG_MPEG2_QSV_HWACCEL) += qsvdec_other.o ++OBJS-$(CONFIG_MPEG2_V4L2REQUEST_HWACCEL) += v4l2_request_mpeg2.o + OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o + OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o + OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index 6109c89bd6..172a546bb2 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -47,6 +47,7 @@ extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel; + extern const AVHWAccel ff_mpeg2_d3d11va2_hwaccel; + extern const AVHWAccel ff_mpeg2_nvdec_hwaccel; + extern const AVHWAccel ff_mpeg2_dxva2_hwaccel; ++extern const AVHWAccel ff_mpeg2_v4l2request_hwaccel; + extern const AVHWAccel ff_mpeg2_vaapi_hwaccel; + extern const AVHWAccel ff_mpeg2_vdpau_hwaccel; + extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel; +diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c +index 99e56532a5..15aaf97a34 100644 +--- a/libavcodec/mpeg12dec.c ++++ b/libavcodec/mpeg12dec.c +@@ -1154,6 +1154,9 @@ static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = { + #endif + #if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL + AV_PIX_FMT_VIDEOTOOLBOX, ++#endif ++#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL ++ AV_PIX_FMT_DRM_PRIME, + #endif + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE +@@ -2952,6 +2955,9 @@ AVCodec ff_mpeg2video_decoder = { + #endif + #if CONFIG_MPEG2_XVMC_HWACCEL + HWACCEL_XVMC(mpeg2), ++#endif ++#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(mpeg2), + #endif + NULL + }, +diff --git a/libavcodec/v4l2_request_mpeg2.c b/libavcodec/v4l2_request_mpeg2.c +new file mode 100644 +index 0000000000..88d86cc4c2 +--- /dev/null ++++ b/libavcodec/v4l2_request_mpeg2.c +@@ -0,0 +1,154 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hwconfig.h" ++#include "mpegvideo.h" ++#include "v4l2_request.h" ++ ++typedef struct V4L2RequestControlsMPEG2 { ++ struct v4l2_ctrl_mpeg2_slice_params slice_params; ++ struct v4l2_ctrl_mpeg2_quantization quantization; ++} V4L2RequestControlsMPEG2; ++ ++static int v4l2_request_mpeg2_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const MpegEncContext *s = avctx->priv_data; ++ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; ++ ++ controls->slice_params = (struct v4l2_ctrl_mpeg2_slice_params) { ++ .bit_size = 0, ++ .data_bit_offset = 0, ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ ++ .quantiser_scale_code = s->qscale >> 1, ++ ++ .sequence = { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */ ++ .horizontal_size = s->width, ++ .vertical_size = s->height, ++ .vbv_buffer_size = req->output.size, ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ ++ .profile_and_level_indication = 0, ++ .progressive_sequence = s->progressive_sequence, ++ .chroma_format = s->chroma_format, ++ }, ++ ++ .picture = { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ ++ .picture_coding_type = s->pict_type, ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ ++ .f_code[0][0] = s->mpeg_f_code[0][0], ++ .f_code[0][1] = s->mpeg_f_code[0][1], ++ .f_code[1][0] = s->mpeg_f_code[1][0], ++ .f_code[1][1] = s->mpeg_f_code[1][1], ++ .intra_dc_precision = s->intra_dc_precision, ++ .picture_structure = s->picture_structure, ++ .top_field_first = s->top_field_first, ++ .frame_pred_frame_dct = s->frame_pred_frame_dct, ++ .concealment_motion_vectors = s->concealment_motion_vectors, ++ .q_scale_type = s->q_scale_type, ++ .intra_vlc_format = s->intra_vlc_format, ++ .alternate_scan = s->alternate_scan, ++ .repeat_first_field = s->repeat_first_field, ++ .progressive_frame = s->progressive_frame, ++ }, ++ }; ++ ++ switch (s->pict_type) { ++ case AV_PICTURE_TYPE_B: ++ controls->slice_params.backward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->next_picture.f); ++ // fall-through ++ case AV_PICTURE_TYPE_P: ++ controls->slice_params.forward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->last_picture.f); ++ } ++ ++ controls->quantization = (struct v4l2_ctrl_mpeg2_quantization) { ++ /* ISO/IEC 13818-2, ITU-T Rec. 
H.262: Quant matrix extension */ ++ .load_intra_quantiser_matrix = 1, ++ .load_non_intra_quantiser_matrix = 1, ++ .load_chroma_intra_quantiser_matrix = 1, ++ .load_chroma_non_intra_quantiser_matrix = 1, ++ }; ++ ++ for (int i = 0; i < 64; i++) { ++ int n = s->idsp.idct_permutation[ff_zigzag_direct[i]]; ++ controls->quantization.intra_quantiser_matrix[i] = s->intra_matrix[n]; ++ controls->quantization.non_intra_quantiser_matrix[i] = s->inter_matrix[n]; ++ controls->quantization.chroma_intra_quantiser_matrix[i] = s->chroma_intra_matrix[n]; ++ controls->quantization.chroma_non_intra_quantiser_matrix[i] = s->chroma_inter_matrix[n]; ++ } ++ ++ return ff_v4l2_request_reset_frame(avctx, s->current_picture_ptr->f); ++} ++ ++static int v4l2_request_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const MpegEncContext *s = avctx->priv_data; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, s->current_picture_ptr->f, buffer, size); ++} ++ ++static int v4l2_request_mpeg2_end_frame(AVCodecContext *avctx) ++{ ++ const MpegEncContext *s = avctx->priv_data; ++ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION, ++ .ptr = &controls->quantization, ++ .size = sizeof(controls->quantization), ++ }, ++ }; ++ ++ controls->slice_params.bit_size = req->output.used * 8; ++ ++ return ff_v4l2_request_decode_frame(avctx, s->current_picture_ptr->f, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_mpeg2_init(AVCodecContext *avctx) ++{ ++ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_MPEG2_SLICE, 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_mpeg2_v4l2request_hwaccel = { ++ .name = "mpeg2_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_MPEG2VIDEO, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_mpeg2_start_frame, ++ .decode_slice = v4l2_request_mpeg2_decode_slice, ++ .end_frame = v4l2_request_mpeg2_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsMPEG2), ++ .init = v4l2_request_mpeg2_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; + +From b22e02f5933366677620bac2583e2edec5c3007c Mon Sep 17 00:00:00 2001 +From: Jernej Skrabec +Date: Sat, 15 Dec 2018 22:32:16 +0100 +Subject: [PATCH 04/18] Add V4L2 request API h264 hwaccel + +Signed-off-by: Jernej Skrabec +Signed-off-by: Jonas Karlman +--- + configure | 3 + + libavcodec/Makefile | 1 + + libavcodec/h264_slice.c | 4 + + libavcodec/h264dec.c | 3 + + libavcodec/hwaccels.h | 1 + + libavcodec/v4l2_request_h264.c | 460 +++++++++++++++++++++++++++++++++ + 6 files changed, 472 insertions(+) + create mode 100644 libavcodec/v4l2_request_h264.c + +diff --git a/configure b/configure +index 6b157d6d3e..1a7720ebe3 100755 +--- a/configure ++++ b/configure +@@ -2925,6 +2925,8 @@ h264_dxva2_hwaccel_deps="dxva2" + h264_dxva2_hwaccel_select="h264_decoder" + h264_nvdec_hwaccel_deps="nvdec" + h264_nvdec_hwaccel_select="h264_decoder" ++h264_v4l2request_hwaccel_deps="v4l2_request h264_v4l2_request" ++h264_v4l2request_hwaccel_select="h264_decoder" + h264_vaapi_hwaccel_deps="vaapi" + 
h264_vaapi_hwaccel_select="h264_decoder" + h264_vdpau_hwaccel_deps="vdpau" +@@ -6569,6 +6571,7 @@ if enabled v4l2_m2m; then + fi + + check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns ++check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" + check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" + + check_headers sys/videoio.h +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 8963bd3e91..9a10a292e3 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -903,6 +903,7 @@ OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o + OBJS-$(CONFIG_H264_DXVA2_HWACCEL) += dxva2_h264.o + OBJS-$(CONFIG_H264_NVDEC_HWACCEL) += nvdec_h264.o + OBJS-$(CONFIG_H264_QSV_HWACCEL) += qsvdec_h2645.o ++OBJS-$(CONFIG_H264_V4L2REQUEST_HWACCEL) += v4l2_request_h264.o + OBJS-$(CONFIG_H264_VAAPI_HWACCEL) += vaapi_h264.o + OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o + OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o +diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c +index db8363e4cc..3ae11ac8a7 100644 +--- a/libavcodec/h264_slice.c ++++ b/libavcodec/h264_slice.c +@@ -759,6 +759,7 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) + #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \ + (CONFIG_H264_D3D11VA_HWACCEL * 2) + \ + CONFIG_H264_NVDEC_HWACCEL + \ ++ CONFIG_H264_V4L2REQUEST_HWACCEL + \ + CONFIG_H264_VAAPI_HWACCEL + \ + CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \ + CONFIG_H264_VDPAU_HWACCEL) +@@ -843,6 +844,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) + #endif + #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; ++#endif ++#if CONFIG_H264_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + if (h->avctx->codec->pix_fmts) + choices = h->avctx->codec->pix_fmts; +diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c +index 5eedeb3c27..a504c89565 100644 +--- a/libavcodec/h264dec.c ++++ b/libavcodec/h264dec.c +@@ -1102,6 +1102,9 @@ AVCodec ff_h264_decoder = { + #endif + #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL + HWACCEL_VIDEOTOOLBOX(h264), ++#endif ++#if CONFIG_H264_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(h264), + #endif + NULL + }, +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index 172a546bb2..44e00e79b5 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -27,6 +27,7 @@ extern const AVHWAccel ff_h264_d3d11va_hwaccel; + extern const AVHWAccel ff_h264_d3d11va2_hwaccel; + extern const AVHWAccel ff_h264_dxva2_hwaccel; + extern const AVHWAccel ff_h264_nvdec_hwaccel; ++extern const AVHWAccel ff_h264_v4l2request_hwaccel; + extern const AVHWAccel ff_h264_vaapi_hwaccel; + extern const AVHWAccel ff_h264_vdpau_hwaccel; + extern const AVHWAccel ff_h264_videotoolbox_hwaccel; +diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c +new file mode 100644 +index 0000000000..94b9aca8ad +--- /dev/null ++++ b/libavcodec/v4l2_request_h264.c +@@ -0,0 +1,460 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "h264dec.h" ++#include "hwconfig.h" ++#include "v4l2_request.h" ++ ++typedef struct V4L2RequestControlsH264 { ++ struct v4l2_ctrl_h264_sps sps; ++ struct v4l2_ctrl_h264_pps pps; ++ struct v4l2_ctrl_h264_scaling_matrix scaling_matrix; ++ struct v4l2_ctrl_h264_decode_params decode_params; ++ struct v4l2_ctrl_h264_slice_params slice_params[MAX_SLICES]; ++ int first_slice; ++} V4L2RequestControlsH264; ++ ++typedef struct V4L2RequestContextH264 { ++ V4L2RequestContext base; ++ int decode_mode; ++ int start_code; ++ int max_slices; ++} V4L2RequestContextH264; ++ ++static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; ++ ++static void fill_weight_factors(struct v4l2_h264_weight_factors *factors, int list, const H264SliceContext *sl) ++{ ++ for (int i = 0; i < sl->ref_count[list]; i++) { ++ if (sl->pwt.luma_weight_flag[list]) { ++ factors->luma_weight[i] = sl->pwt.luma_weight[i][list][0]; ++ factors->luma_offset[i] = sl->pwt.luma_weight[i][list][1]; ++ } else { ++ factors->luma_weight[i] = 1 << sl->pwt.luma_log2_weight_denom; ++ factors->luma_offset[i] = 0; ++ } ++ for (int j = 0; j < 2; j++) { ++ if (sl->pwt.chroma_weight_flag[list]) { ++ factors->chroma_weight[i][j] = sl->pwt.chroma_weight[i][list][j][0]; ++ factors->chroma_offset[i][j] = sl->pwt.chroma_weight[i][list][j][1]; ++ } else { ++ factors->chroma_weight[i][j] = 1 << sl->pwt.chroma_log2_weight_denom; ++ factors->chroma_offset[i][j] = 0; ++ } ++ } ++ } ++} ++ ++static void fill_dpb_entry(struct v4l2_h264_dpb_entry *entry, const H264Picture *pic) ++{ ++ entry->reference_ts = ff_v4l2_request_get_capture_timestamp(pic->f); ++ entry->frame_num = pic->frame_num; ++ entry->pic_num = pic->pic_id; ++ entry->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID; ++ if (pic->reference) ++ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_ACTIVE; ++ if (pic->long_ref) ++ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM; ++ if (pic->field_poc[0] != INT_MAX) ++ entry->top_field_order_cnt = pic->field_poc[0]; ++ if (pic->field_poc[1] != INT_MAX) ++ entry->bottom_field_order_cnt = pic->field_poc[1]; ++} ++ ++static void fill_dpb(struct v4l2_ctrl_h264_decode_params *decode, const H264Context *h) ++{ ++ int entries = 0; ++ ++ for (int i = 0; i < h->short_ref_count; i++) { ++ const H264Picture *pic = h->short_ref[i]; ++ if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX)) ++ fill_dpb_entry(&decode->dpb[entries++], pic); ++ } ++ ++ if (!h->long_ref_count) ++ return; ++ ++ for (int i = 0; i < FF_ARRAY_ELEMS(h->long_ref); i++) { ++ const H264Picture *pic = h->long_ref[i]; ++ if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX)) ++ fill_dpb_entry(&decode->dpb[entries++], pic); ++ } ++} ++ ++static uint8_t get_dpb_index(struct v4l2_ctrl_h264_decode_params *decode, const H264Ref *ref) ++{ ++ uint64_t timestamp; ++ ++ if (!ref->parent) ++ return 0; ++ ++ timestamp = ff_v4l2_request_get_capture_timestamp(ref->parent->f); ++ ++ for (uint8_t i = 0; i < FF_ARRAY_ELEMS(decode->dpb); i++) { ++ struct v4l2_h264_dpb_entry *entry = &decode->dpb[i]; ++ if ((entry->flags & V4L2_H264_DPB_ENTRY_FLAG_VALID) && ++ entry->reference_ts == timestamp) ++ return i; ++ } ++ ++ return 0; ++} ++ ++static void fill_sps(struct v4l2_ctrl_h264_sps *ctrl, 
const H264Context *h) ++{ ++ const SPS *sps = h->ps.sps; ++ ++ *ctrl = (struct v4l2_ctrl_h264_sps) { ++ .profile_idc = sps->profile_idc, ++ .constraint_set_flags = sps->constraint_set_flags, ++ .level_idc = sps->level_idc, ++ .seq_parameter_set_id = sps->sps_id, ++ .chroma_format_idc = sps->chroma_format_idc, ++ .bit_depth_luma_minus8 = sps->bit_depth_luma - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth_chroma - 8, ++ .log2_max_frame_num_minus4 = sps->log2_max_frame_num - 4, ++ .pic_order_cnt_type = sps->poc_type, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .max_num_ref_frames = sps->ref_frame_count, ++ .num_ref_frames_in_pic_order_cnt_cycle = sps->poc_cycle_length, ++ //.offset_for_ref_frame[255] - not required? not set by libva-v4l2-request - copy sps->offset_for_ref_frame ++ .offset_for_non_ref_pic = sps->offset_for_non_ref_pic, ++ .offset_for_top_to_bottom_field = sps->offset_for_top_to_bottom_field, ++ .pic_width_in_mbs_minus1 = h->mb_width - 1, ++ .pic_height_in_map_units_minus1 = sps->frame_mbs_only_flag ? h->mb_height - 1 : h->mb_height / 2 - 1, ++ }; ++ ++ if (sps->residual_color_transform_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ if (sps->transform_bypass) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS; ++ if (sps->delta_pic_order_always_zero_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO; ++ if (sps->gaps_in_frame_num_allowed_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED; ++ if (sps->frame_mbs_only_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY; ++ if (sps->mb_aff) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD; ++ if (sps->direct_8x8_inference_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE; ++} ++ ++static void fill_pps(struct v4l2_ctrl_h264_pps *ctrl, const H264Context *h) ++{ ++ const SPS *sps = h->ps.sps; ++ const PPS *pps = h->ps.pps; ++ const H264SliceContext *sl = &h->slice_ctx[0]; ++ int qp_bd_offset = 6 * (sps->bit_depth_luma - 8); ++ ++ *ctrl = (struct v4l2_ctrl_h264_pps) { ++ .pic_parameter_set_id = sl->pps_id, ++ .seq_parameter_set_id = pps->sps_id, ++ .num_slice_groups_minus1 = pps->slice_group_count - 1, ++ .num_ref_idx_l0_default_active_minus1 = pps->ref_count[0] - 1, ++ .num_ref_idx_l1_default_active_minus1 = pps->ref_count[1] - 1, ++ .weighted_bipred_idc = pps->weighted_bipred_idc, ++ .pic_init_qp_minus26 = pps->init_qp - 26 - qp_bd_offset, ++ .pic_init_qs_minus26 = pps->init_qs - 26 - qp_bd_offset, ++ .chroma_qp_index_offset = pps->chroma_qp_index_offset[0], ++ .second_chroma_qp_index_offset = pps->chroma_qp_index_offset[1], ++ }; ++ ++ if (pps->cabac) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE; ++ if (pps->pic_order_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT; ++ if (pps->weighted_pred) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_WEIGHTED_PRED; ++ if (pps->deblocking_filter_parameters_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT; ++ if (pps->constrained_intra_pred) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ if (pps->redundant_pic_cnt_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT; ++ if (pps->transform_8x8_mode) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE; ++} ++ ++static int v4l2_request_h264_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const H264Context *h = avctx->priv_data; ++ const PPS 
*pps = h->ps.pps; ++ const SPS *sps = h->ps.sps; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ ++ fill_sps(&controls->sps, h); ++ fill_pps(&controls->pps, h); ++ ++ memcpy(controls->scaling_matrix.scaling_list_4x4, pps->scaling_matrix4, sizeof(controls->scaling_matrix.scaling_list_4x4)); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[0], pps->scaling_matrix8[0], sizeof(controls->scaling_matrix.scaling_list_8x8[0])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[1], pps->scaling_matrix8[3], sizeof(controls->scaling_matrix.scaling_list_8x8[1])); ++ ++ if (sps->chroma_format_idc == 3) { ++ memcpy(controls->scaling_matrix.scaling_list_8x8[2], pps->scaling_matrix8[1], sizeof(controls->scaling_matrix.scaling_list_8x8[2])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[3], pps->scaling_matrix8[4], sizeof(controls->scaling_matrix.scaling_list_8x8[3])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[4], pps->scaling_matrix8[2], sizeof(controls->scaling_matrix.scaling_list_8x8[4])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[5], pps->scaling_matrix8[5], sizeof(controls->scaling_matrix.scaling_list_8x8[5])); ++ } ++ ++ controls->decode_params = (struct v4l2_ctrl_h264_decode_params) { ++ .num_slices = 0, ++ .nal_ref_idc = h->nal_ref_idc, ++ .top_field_order_cnt = h->cur_pic_ptr->field_poc[0] != INT_MAX ? h->cur_pic_ptr->field_poc[0] : 0, ++ .bottom_field_order_cnt = h->cur_pic_ptr->field_poc[1] != INT_MAX ? h->cur_pic_ptr->field_poc[1] : 0, ++ }; ++ ++ if (h->picture_idr) ++ controls->decode_params.flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC; ++ ++ fill_dpb(&controls->decode_params, h); ++ ++ controls->first_slice = !FIELD_PICTURE(h) || h->first_field; ++ ++ return ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); ++} ++ ++static int v4l2_request_h264_queue_decode(AVCodecContext *avctx, int last_slice) ++{ ++ const H264Context *h = avctx->priv_data; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SPS, ++ .ptr = &controls->sps, ++ .size = sizeof(controls->sps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_PPS, ++ .ptr = &controls->pps, ++ .size = sizeof(controls->pps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params[0]) * FFMAX(FFMIN(controls->decode_params.num_slices, MAX_SLICES), ctx->max_slices), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS, ++ .ptr = &controls->decode_params, ++ .size = sizeof(controls->decode_params), ++ }, ++ }; ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED) ++ return ff_v4l2_request_decode_slice(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); ++ ++ return ff_v4l2_request_decode_frame(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const H264Context *h = avctx->priv_data; ++ const PPS *pps = h->ps.pps; ++ const H264SliceContext *sl = &h->slice_ctx[0]; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ V4L2RequestContextH264 *ctx = 
avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->cur_pic_ptr->f->data[0]; ++ int i, ret, count, slice = FFMIN(controls->decode_params.num_slices, MAX_SLICES - 1); ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && slice) { ++ ret = v4l2_request_h264_queue_decode(avctx, 0); ++ if (ret) ++ return ret; ++ ++ ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); ++ slice = controls->decode_params.num_slices = 0; ++ controls->first_slice = 0; ++ } ++ ++ controls->slice_params[slice] = (struct v4l2_ctrl_h264_slice_params) { ++ /* Size in bytes, including header */ ++ .size = 0, ++ .start_byte_offset = req->output.used, ++ /* Offset in bits to slice_data() from the beginning of this slice. */ ++ .header_bit_size = get_bits_count(&sl->gb), ++ ++ .first_mb_in_slice = sl->first_mb_addr, ++ .slice_type = ff_h264_get_slice_type(sl), ++ .pic_parameter_set_id = sl->pps_id, ++ .colour_plane_id = 0, /* what is this? */ ++ .frame_num = h->poc.frame_num, ++ .idr_pic_id = 0, /* what is this? */ ++ .pic_order_cnt_lsb = sl->poc_lsb, ++ .delta_pic_order_cnt_bottom = sl->delta_poc_bottom, ++ .delta_pic_order_cnt0 = sl->delta_poc[0], ++ .delta_pic_order_cnt1 = sl->delta_poc[1], ++ .redundant_pic_cnt = sl->redundant_pic_count, ++ ++ /* Size in bits of dec_ref_pic_marking() syntax element. */ ++ .dec_ref_pic_marking_bit_size = 0, ++ /* Size in bits of pic order count syntax. */ ++ .pic_order_cnt_bit_size = 0, ++ ++ .cabac_init_idc = sl->cabac_init_idc, ++ .slice_qp_delta = sl->qscale - pps->init_qp, ++ .slice_qs_delta = 0, /* XXX not implemented by FFmpeg */ ++ .disable_deblocking_filter_idc = sl->deblocking_filter < 2 ? !sl->deblocking_filter : sl->deblocking_filter, ++ .slice_alpha_c0_offset_div2 = sl->slice_alpha_c0_offset / 2, ++ .slice_beta_offset_div2 = sl->slice_beta_offset / 2, ++ .slice_group_change_cycle = 0, /* what is this? */ ++ ++ .num_ref_idx_l0_active_minus1 = sl->list_count > 0 ? sl->ref_count[0] - 1 : 0, ++ .num_ref_idx_l1_active_minus1 = sl->list_count > 1 ? sl->ref_count[1] - 1 : 0, ++ }; ++ ++ if (FIELD_PICTURE(h)) ++ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_FIELD_PIC; ++ if (h->picture_structure == PICT_BOTTOM_FIELD) ++ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_BOTTOM_FIELD; ++ if (sl->slice_type == AV_PICTURE_TYPE_B && sl->direct_spatial_mv_pred) ++ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED; ++ ++ controls->slice_params[slice].pred_weight_table.chroma_log2_weight_denom = sl->pwt.chroma_log2_weight_denom; ++ controls->slice_params[slice].pred_weight_table.luma_log2_weight_denom = sl->pwt.luma_log2_weight_denom; ++ ++ count = sl->list_count > 0 ? sl->ref_count[0] : 0; ++ for (i = 0; i < count; i++) ++ controls->slice_params[slice].ref_pic_list0[i] = get_dpb_index(&controls->decode_params, &sl->ref_list[0][i]); ++ if (count) ++ fill_weight_factors(&controls->slice_params[slice].pred_weight_table.weight_factors[0], 0, sl); ++ ++ count = sl->list_count > 1 ? 
sl->ref_count[1] : 0; ++ for (i = 0; i < count; i++) ++ controls->slice_params[slice].ref_pic_list1[i] = get_dpb_index(&controls->decode_params, &sl->ref_list[1][i]); ++ if (count) ++ fill_weight_factors(&controls->slice_params[slice].pred_weight_table.weight_factors[1], 1, sl); ++ ++ if (ctx->start_code == V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) { ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, nalu_slice_start_code, 3); ++ if (ret) ++ return ret; ++ } ++ ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, buffer, size); ++ if (ret) ++ return ret; ++ ++ controls->slice_params[slice].size = req->output.used - controls->slice_params[slice].start_byte_offset; ++ controls->decode_params.num_slices++; ++ return 0; ++} ++ ++static int v4l2_request_h264_end_frame(AVCodecContext *avctx) ++{ ++ const H264Context *h = avctx->priv_data; ++ return v4l2_request_h264_queue_decode(avctx, !FIELD_PICTURE(h) || !h->first_field); ++} ++ ++static int v4l2_request_h264_set_controls(AVCodecContext *avctx) ++{ ++ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE, }, ++ { .id = V4L2_CID_MPEG_VIDEO_H264_START_CODE, }, ++ }; ++ struct v4l2_query_ext_ctrl slice_params = { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS, ++ }; ++ ++ ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE); ++ if (ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && ++ ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_START_CODE); ++ if (ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_NONE && ++ ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); ++ return AVERROR(EINVAL); ++ } ++ ++ ret = ff_v4l2_request_query_control(avctx, &slice_params); ++ if (ret) ++ return ret; ++ ++ ctx->max_slices = slice_params.elems; ++ if (ctx->max_slices > MAX_SLICES) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); ++ return AVERROR(EINVAL); ++ } ++ ++ control[0].value = ctx->decode_mode; ++ control[1].value = ctx->start_code; ++ ++ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_h264_init(AVCodecContext *avctx) ++{ ++ const H264Context *h = avctx->priv_data; ++ struct v4l2_ctrl_h264_sps sps; ++ struct v4l2_ctrl_h264_pps pps; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SPS, ++ .ptr = &sps, ++ .size = sizeof(sps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_PPS, ++ .ptr = &pps, ++ .size = sizeof(pps), ++ }, ++ }; ++ ++ fill_sps(&sps, h); ++ fill_pps(&pps, h); ++ ++ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_H264_SLICE, 4 * 1024 * 1024, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ return v4l2_request_h264_set_controls(avctx); ++} ++ ++const AVHWAccel ff_h264_v4l2request_hwaccel = { ++ .name = "h264_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_H264, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_h264_start_frame, ++ .decode_slice = 
v4l2_request_h264_decode_slice, ++ .end_frame = v4l2_request_h264_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsH264), ++ .init = v4l2_request_h264_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContextH264), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; + +From c804445e166d743ce41831556c968ed9e3a414f5 Mon Sep 17 00:00:00 2001 +From: Jernej Skrabec +Date: Sat, 15 Dec 2018 22:32:16 +0100 +Subject: [PATCH 05/18] Add V4L2 request API hevc hwaccel + +Signed-off-by: Jernej Skrabec +Signed-off-by: Jonas Karlman +--- + configure | 3 + + libavcodec/Makefile | 1 + + libavcodec/hevcdec.c | 10 + + libavcodec/hwaccels.h | 1 + + libavcodec/v4l2_request_hevc.c | 533 +++++++++++++++++++++++++++++++++ + 5 files changed, 548 insertions(+) + create mode 100644 libavcodec/v4l2_request_hevc.c + +diff --git a/configure b/configure +index 1a7720ebe3..58abd99335 100755 +--- a/configure ++++ b/configure +@@ -2941,6 +2941,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" + hevc_dxva2_hwaccel_select="hevc_decoder" + hevc_nvdec_hwaccel_deps="nvdec" + hevc_nvdec_hwaccel_select="hevc_decoder" ++hevc_v4l2request_hwaccel_deps="v4l2_request hevc_v4l2_request" ++hevc_v4l2request_hwaccel_select="hevc_decoder" + hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" + hevc_vaapi_hwaccel_select="hevc_decoder" + hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" +@@ -6572,6 +6574,7 @@ fi + + check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns + check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" ++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" + check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" + + check_headers sys/videoio.h +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 9a10a292e3..5d0e1d7dae 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -911,6 +911,7 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o + OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec_h2645.o ++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o + OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o + OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index 0772608a30..d01b7b34bc 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -372,6 +372,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ + CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ + CONFIG_HEVC_NVDEC_HWACCEL + \ ++ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ + CONFIG_HEVC_VAAPI_HWACCEL + \ + CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ + CONFIG_HEVC_VDPAU_HWACCEL) +@@ -398,6 +399,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV420P10: +@@ -416,6 +420,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_NVDEC_HWACCEL + *fmt++ = AV_PIX_FMT_CUDA; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case 
AV_PIX_FMT_YUV444P: +@@ -3588,6 +3595,9 @@ AVCodec ff_hevc_decoder = { + #endif + #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + HWACCEL_VIDEOTOOLBOX(hevc), ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(hevc), + #endif + NULL + }, +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index 44e00e79b5..e2f90a5fdd 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -35,6 +35,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; + extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; + extern const AVHWAccel ff_hevc_dxva2_hwaccel; + extern const AVHWAccel ff_hevc_nvdec_hwaccel; ++extern const AVHWAccel ff_hevc_v4l2request_hwaccel; + extern const AVHWAccel ff_hevc_vaapi_hwaccel; + extern const AVHWAccel ff_hevc_vdpau_hwaccel; + extern const AVHWAccel ff_hevc_videotoolbox_hwaccel; +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +new file mode 100644 +index 0000000000..f724909546 +--- /dev/null ++++ b/libavcodec/v4l2_request_hevc.c +@@ -0,0 +1,533 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hevcdec.h" ++#include "hwconfig.h" ++#include "v4l2_request.h" ++ ++#define MAX_SLICES 16 ++ ++typedef struct V4L2RequestControlsHEVC { ++ struct v4l2_ctrl_hevc_sps sps; ++ struct v4l2_ctrl_hevc_pps pps; ++ struct v4l2_ctrl_hevc_slice_params slice_params[MAX_SLICES]; ++ int first_slice; ++ int num_slices; //TODO: this should be in control ++} V4L2RequestControlsHEVC; ++ ++typedef struct V4L2RequestContextHEVC { ++ V4L2RequestContext base; ++ int decode_mode; ++ int start_code; ++ int max_slices; ++} V4L2RequestContextHEVC; ++ ++static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; ++ ++static void v4l2_request_hevc_fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) ++{ ++ int32_t luma_weight_denom, chroma_weight_denom; ++ const SliceHeader *sh = &h->sh; ++ ++ if (sh->slice_type == HEVC_SLICE_I || ++ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || ++ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) ++ return; ++ ++ table->luma_log2_weight_denom = sh->luma_log2_weight_denom; ++ ++ if (h->ps.sps->chroma_format_idc) ++ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; ++ ++ luma_weight_denom = (1 << sh->luma_log2_weight_denom); ++ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); ++ ++ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { ++ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; ++ table->luma_offset_l0[i] = sh->luma_offset_l0[i]; ++ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; ++ 
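/* offsets are carried over as-is; only the weights are stored as deltas
++         * from the implicit (1 << log2_denom) default */
++        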
table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; ++ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; ++ } ++ ++ if (sh->slice_type != HEVC_SLICE_B) ++ return; ++ ++ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { ++ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; ++ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; ++ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; ++ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; ++ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; ++ } ++} ++ ++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) ++{ ++ const HEVCFrame *frame; ++ int i; ++ ++ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { ++ frame = h->rps[ST_CURR_BEF].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; ++ } ++ ++ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { ++ frame = h->rps[ST_CURR_AFT].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; ++ } ++ ++ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { ++ frame = h->rps[LT_CURR].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; ++ } ++ ++ return 0; ++} ++ ++static uint8_t get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, ++ struct v4l2_ctrl_hevc_slice_params *slice_params) ++{ ++ uint64_t timestamp; ++ ++ if (!frame) ++ return 0; ++ ++ timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); ++ ++ for (uint8_t i = 0; i < slice_params->num_active_dpb_entries; i++) { ++ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[i]; ++ if (entry->timestamp == timestamp) ++ return i; ++ } ++ ++ return 0; ++} ++ ++static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h, ++ struct v4l2_ctrl_hevc_slice_params *slice_params) ++{ ++ const HEVCFrame *pic = h->ref; ++ const SliceHeader *sh = &h->sh; ++ int i, entries = 0; ++ RefPicList *rpl; ++ ++ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { ++ .bit_size = 0, ++ .data_bit_offset = get_bits_count(&h->HEVClc->gb), ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ .nal_unit_type = h->nal_unit_type, ++ .nuh_temporal_id_plus1 = h->temporal_id + 1, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_type = sh->slice_type, ++ .colour_plane_id = sh->colour_plane_id, ++ .slice_pic_order_cnt = pic->poc, ++ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, ++ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, ++ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, ++ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand, ++ .slice_qp_delta = sh->slice_qp_delta, ++ .slice_cb_qp_offset = sh->slice_cb_qp_offset, ++ .slice_cr_qp_offset = sh->slice_cr_qp_offset, ++ .slice_act_y_qp_offset = 0, ++ .slice_act_cb_qp_offset = 0, ++ .slice_act_cr_qp_offset = 0, ++ .slice_beta_offset_div2 = sh->beta_offset / 2, ++ .slice_tc_offset_div2 = sh->tc_offset / 2, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ .pic_struct = h->sei.picture_timing.picture_struct, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, ++ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, ++ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, ++ }; ++ ++ if (sh->slice_sample_adaptive_offset_flag[0]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; ++ ++ if (sh->slice_sample_adaptive_offset_flag[1]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; ++ ++ if (sh->slice_temporal_mvp_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; ++ ++ if (sh->mvd_l1_zero_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; ++ ++ if (sh->cabac_init_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; ++ ++ if (sh->collocated_list == L0) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; ++ ++ if (sh->disable_deblocking_filter_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; ++ ++ if (sh->slice_loop_filter_across_slices_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { ++ const HEVCFrame *frame = &h->DPB[i]; ++ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { ++ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[entries++]; ++ ++ entry->timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); ++ entry->rps = find_frame_rps_type(h, entry->timestamp); ++ entry->field_pic = frame->frame->interlaced_frame; ++ ++ /* TODO: Interleaved: Get the POC for each field. */ ++ entry->pic_order_cnt[0] = frame->poc; ++ entry->pic_order_cnt[1] = frame->poc; ++ } ++ } ++ ++ slice_params->num_active_dpb_entries = entries; ++ ++ if (sh->slice_type != HEVC_SLICE_I) { ++ rpl = &h->ref->refPicList[0]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ rpl = &h->ref->refPicList[1]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); ++ } ++ ++ v4l2_request_hevc_fill_pred_table(h, &slice_params->pred_weight_table); ++} ++ ++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCContext *h) ++{ ++ const HEVCSPS *sps = h->ps.sps; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ ++ *ctrl = (struct v4l2_ctrl_hevc_sps) { ++ .chroma_format_idc = sps->chroma_format_idc, ++ .pic_width_in_luma_samples = sps->width, ++ .pic_height_in_luma_samples = sps->height, ++ .bit_depth_luma_minus8 = sps->bit_depth - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth - 8, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, ++ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, ++ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, ++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, ++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, ++ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, ++ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, ++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, ++ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, ++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, ++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, ++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, ++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, ++ .num_short_term_ref_pic_sets = sps->nb_st_rps, ++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, ++ }; ++ ++ if (sps->separate_colour_plane_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ ++ if (sps->scaling_list_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; ++ ++ if (sps->amp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; ++ ++ if (sps->sao_enabled) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; ++ ++ if (sps->pcm_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; ++ ++ if (sps->pcm.loop_filter_disable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; ++ ++ if (sps->long_term_ref_pics_present_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; ++ ++ if (sps->sps_temporal_mvp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; ++ ++ if (sps->sps_strong_intra_smoothing_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; ++} ++ ++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ const HEVCSPS *sps = h->ps.sps; ++ const HEVCPPS *pps = h->ps.pps; ++ const ScalingList *sl = pps->scaling_list_data_present_flag ? ++ &pps->scaling_list : ++ sps->scaling_list_enable_flag ? ++ &sps->scaling_list : NULL; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ ++ fill_sps(&controls->sps, h); ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ ++ controls->pps = (struct v4l2_ctrl_hevc_pps) { ++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, ++ .init_qp_minus26 = pps->pic_init_qp_minus26, ++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, ++ .pps_cb_qp_offset = pps->cb_qp_offset, ++ .pps_cr_qp_offset = pps->cr_qp_offset, ++ .pps_beta_offset_div2 = pps->beta_offset / 2, ++ .pps_tc_offset_div2 = pps->tc_offset / 2, ++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, ++ }; ++ ++ if (pps->dependent_slice_segments_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT; ++ ++ if (pps->output_flag_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; ++ ++ if (pps->sign_data_hiding_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; ++ ++ if (pps->cabac_init_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; ++ ++ if (pps->constrained_intra_pred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ ++ if (pps->transform_skip_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; ++ ++ if (pps->cu_qp_delta_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; ++ ++ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; ++ ++ if (pps->weighted_pred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; ++ ++ if (pps->weighted_bipred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; ++ ++ if (pps->transquant_bypass_enable_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; ++ ++ if (pps->tiles_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; ++ ++ if (pps->loop_filter_across_tiles_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; ++ ++ if (pps->seq_loop_filter_across_slices_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (pps->deblocking_filter_override_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; ++ ++ if (pps->disable_dbf) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; ++ ++ if (pps->lists_modification_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; ++ ++ if (pps->slice_header_extension_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; ++ ++ if (pps->tiles_enabled_flag) { ++ controls->pps.num_tile_columns_minus1 = pps->num_tile_columns - 1; ++ controls->pps.num_tile_rows_minus1 = pps->num_tile_rows - 1; ++ ++ for (int i = 0; i < pps->num_tile_columns; i++) ++ controls->pps.column_width_minus1[i] = pps->column_width[i] - 1; ++ ++ for (int i = 0; i < pps->num_tile_rows; i++) ++ controls->pps.row_height_minus1[i] = pps->row_height[i] - 1; ++ } ++ ++ controls->first_slice = 1; ++ controls->num_slices = 0; ++ ++ return ff_v4l2_request_reset_frame(avctx, h->ref->frame); ++} ++ ++static int v4l2_request_hevc_queue_decode(AVCodecContext *avctx, int last_slice) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ V4L2RequestContextHEVC *ctx = 
avctx->internal->hwaccel_priv_data; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .ptr = &controls->sps, ++ .size = sizeof(controls->sps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, ++ .ptr = &controls->pps, ++ .size = sizeof(controls->pps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params[0]) * FFMAX(FFMIN(controls->num_slices, MAX_SLICES), ctx->max_slices), ++ }, ++ }; ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED) ++ return ff_v4l2_request_decode_slice(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); ++ ++ return ff_v4l2_request_decode_frame(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->ref->frame->data[0]; ++ int ret, slice = FFMIN(controls->num_slices, MAX_SLICES - 1); ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && slice) { ++ ret = v4l2_request_hevc_queue_decode(avctx, 0); ++ if (ret) ++ return ret; ++ ++ ff_v4l2_request_reset_frame(avctx, h->ref->frame); ++ slice = controls->num_slices = 0; ++ controls->first_slice = 0; ++ } ++ ++ v4l2_request_hevc_fill_slice_params(h, &controls->slice_params[slice]); ++ ++ if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, nalu_slice_start_code, 3); ++ if (ret) ++ return ret; ++ } ++ ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, buffer, size); ++ if (ret) ++ return ret; ++ ++ controls->slice_params[slice].bit_size = req->output.used * 8; //FIXME ++ controls->num_slices++; ++ return 0; ++} ++ ++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) ++{ ++ return v4l2_request_hevc_queue_decode(avctx, 1); ++} ++ ++static int v4l2_request_hevc_set_controls(AVCodecContext *avctx) ++{ ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, ++ }; ++ struct v4l2_query_ext_ctrl slice_params = { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, ++ }; ++ ++ ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE); ++ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && ++ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_START_CODE); ++ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && ++ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); ++ return AVERROR(EINVAL); ++ } ++ ++ ret = ff_v4l2_request_query_control(avctx, &slice_params); ++ if (ret) ++ return ret; ++ ++ ctx->max_slices = slice_params.elems; ++ if (ctx->max_slices > MAX_SLICES) { ++ 
av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); ++ return AVERROR(EINVAL); ++ } ++ ++ control[0].value = ctx->decode_mode; ++ control[1].value = ctx->start_code; ++ ++ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_hevc_init(AVCodecContext *avctx) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ struct v4l2_ctrl_hevc_sps sps; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .ptr = &sps, ++ .size = sizeof(sps), ++ }, ++ }; ++ ++ fill_sps(&sps, h); ++ ++ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_HEVC_SLICE, 4 * 1024 * 1024, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ return v4l2_request_hevc_set_controls(avctx); ++} ++ ++const AVHWAccel ff_hevc_v4l2request_hwaccel = { ++ .name = "hevc_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_hevc_start_frame, ++ .decode_slice = v4l2_request_hevc_decode_slice, ++ .end_frame = v4l2_request_hevc_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsHEVC), ++ .init = v4l2_request_hevc_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContextHEVC), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; + +From 11f2c3d30ce6085ef1510f51481852bab1380ecd Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Wed, 22 May 2019 14:46:58 +0200 +Subject: [PATCH 06/18] Add V4L2 request API vp8 hwaccel + +Need to fix the STREAMOFF/STREAMON issue in a proper way. + +Signed-off-by: Boris Brezillon +Signed-off-by: Ezequiel Garcia +--- + configure | 3 + + libavcodec/Makefile | 1 + + libavcodec/hwaccels.h | 1 + + libavcodec/v4l2_request_vp8.c | 180 ++++++++++++++++++++++++++++++++++ + libavcodec/vp8.c | 8 +- + 5 files changed, 192 insertions(+), 1 deletion(-) + create mode 100644 libavcodec/v4l2_request_vp8.c + +diff --git a/configure b/configure +index 58abd99335..cbb91c2bca 100755 +--- a/configure ++++ b/configure +@@ -3003,6 +3003,8 @@ vc1_vdpau_hwaccel_deps="vdpau" + vc1_vdpau_hwaccel_select="vc1_decoder" + vp8_nvdec_hwaccel_deps="nvdec" + vp8_nvdec_hwaccel_select="vp8_decoder" ++vp8_v4l2request_hwaccel_deps="v4l2_request vp8_v4l2_request" ++vp8_v4l2request_hwaccel_select="vp8_decoder" + vp8_vaapi_hwaccel_deps="vaapi" + vp8_vaapi_hwaccel_select="vp8_decoder" + vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9" +@@ -6576,6 +6578,7 @@ check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns + check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" + check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" + check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" ++check_cc vp8_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP8_FRAME;" + + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 5d0e1d7dae..d6af854daa 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -940,6 +940,7 @@ OBJS-$(CONFIG_VC1_QSV_HWACCEL) += qsvdec_other.o + OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o + OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o + OBJS-$(CONFIG_VP8_NVDEC_HWACCEL) += nvdec_vp8.o ++OBJS-$(CONFIG_VP8_V4L2REQUEST_HWACCEL) += 
v4l2_request_vp8.o + OBJS-$(CONFIG_VP8_VAAPI_HWACCEL) += vaapi_vp8.o + OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL) += dxva2_vp9.o + OBJS-$(CONFIG_VP9_DXVA2_HWACCEL) += dxva2_vp9.o +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index e2f90a5fdd..bd75e94f4c 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -65,6 +65,7 @@ extern const AVHWAccel ff_vc1_nvdec_hwaccel; + extern const AVHWAccel ff_vc1_vaapi_hwaccel; + extern const AVHWAccel ff_vc1_vdpau_hwaccel; + extern const AVHWAccel ff_vp8_nvdec_hwaccel; ++extern const AVHWAccel ff_vp8_v4l2request_hwaccel; + extern const AVHWAccel ff_vp8_vaapi_hwaccel; + extern const AVHWAccel ff_vp9_d3d11va_hwaccel; + extern const AVHWAccel ff_vp9_d3d11va2_hwaccel; +diff --git a/libavcodec/v4l2_request_vp8.c b/libavcodec/v4l2_request_vp8.c +new file mode 100644 +index 0000000000..7e75ee398a +--- /dev/null ++++ b/libavcodec/v4l2_request_vp8.c +@@ -0,0 +1,180 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hwconfig.h" ++#include "v4l2_request.h" ++#include "vp8.h" ++ ++typedef struct V4L2RequestControlsVP8 { ++ struct v4l2_ctrl_vp8_frame_header ctrl; ++} V4L2RequestControlsVP8; ++ ++static int v4l2_request_vp8_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ ++ memset(&controls->ctrl, 0, sizeof(controls->ctrl)); ++ return ff_v4l2_request_reset_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f); ++} ++ ++static int v4l2_request_vp8_end_frame(AVCodecContext *avctx) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER, ++ .ptr = &controls->ctrl, ++ .size = sizeof(controls->ctrl), ++ }, ++ }; ++ ++ return ff_v4l2_request_decode_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, ++ control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_vp8_decode_slice(AVCodecContext *avctx, ++ const uint8_t *buffer, ++ uint32_t size) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ struct v4l2_ctrl_vp8_frame_header *hdr = &controls->ctrl; ++ const uint8_t *data = buffer + 3 + 7 * s->keyframe; ++ unsigned int i, j, k; ++ ++ hdr->version = s->profile & 0x3; ++ hdr->width = avctx->width; ++ hdr->height = avctx->height; ++ /* FIXME: set ->xx_scale */ ++ hdr->prob_skip_false = s->prob->mbskip; ++ hdr->prob_intra = s->prob->intra; ++ hdr->prob_gf = s->prob->golden; ++ hdr->prob_last = s->prob->last; ++ hdr->first_part_size = 
s->header_partition_size; ++ hdr->first_part_header_bits = (8 * (s->coder_state_at_header_end.input - data) - ++ s->coder_state_at_header_end.bit_count - 8); ++ hdr->num_dct_parts = s->num_coeff_partitions; ++ for (i = 0; i < 8; i++) ++ hdr->dct_part_sizes[i] = s->coeff_partition_size[i]; ++ ++ hdr->coder_state.range = s->coder_state_at_header_end.range; ++ hdr->coder_state.value = s->coder_state_at_header_end.value; ++ hdr->coder_state.bit_count = s->coder_state_at_header_end.bit_count; ++ if (s->framep[VP56_FRAME_PREVIOUS]) ++ hdr->last_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_PREVIOUS]->tf.f); ++ if (s->framep[VP56_FRAME_GOLDEN]) ++ hdr->golden_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN]->tf.f); ++ if (s->framep[VP56_FRAME_GOLDEN2]) ++ hdr->alt_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN2]->tf.f); ++ hdr->flags |= s->invisible ? 0 : V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME; ++ hdr->flags |= s->mbskip_enabled ? V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF : 0; ++ hdr->flags |= (s->profile & 0x4) ? V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL : 0; ++ hdr->flags |= s->keyframe ? V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME : 0; ++ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN : 0; ++ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN2] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT : 0; ++ hdr->segment_header.flags |= s->segmentation.enabled ? V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED : 0; ++ hdr->segment_header.flags |= s->segmentation.update_map ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP : 0; ++ hdr->segment_header.flags |= s->segmentation.update_feature_data ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA : 0; ++ hdr->segment_header.flags |= s->segmentation.absolute_vals ? 0 : V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE; ++ for (i = 0; i < 4; i++) { ++ hdr->segment_header.quant_update[i] = s->segmentation.base_quant[i]; ++ hdr->segment_header.lf_update[i] = s->segmentation.filter_level[i]; ++ } ++ ++ for (i = 0; i < 3; i++) ++ hdr->segment_header.segment_probs[i] = s->prob->segmentid[i]; ++ ++ hdr->lf_header.level = s->filter.level; ++ hdr->lf_header.sharpness_level = s->filter.sharpness; ++ hdr->lf_header.flags |= s->lf_delta.enabled ? V4L2_VP8_LF_HEADER_ADJ_ENABLE : 0; ++ hdr->lf_header.flags |= s->lf_delta.update ? V4L2_VP8_LF_HEADER_DELTA_UPDATE : 0; ++ hdr->lf_header.flags |= s->filter.simple ? 
V4L2_VP8_LF_FILTER_TYPE_SIMPLE : 0; ++ for (i = 0; i < 4; i++) { ++ hdr->lf_header.ref_frm_delta[i] = s->lf_delta.ref[i]; ++ hdr->lf_header.mb_mode_delta[i] = s->lf_delta.mode[i + MODE_I4x4]; ++ } ++ ++ // Probabilities ++ if (s->keyframe) { ++ static const uint8_t keyframe_y_mode_probs[4] = { ++ 145, 156, 163, 128 ++ }; ++ static const uint8_t keyframe_uv_mode_probs[3] = { ++ 142, 114, 183 ++ }; ++ ++ memcpy(hdr->entropy_header.y_mode_probs, keyframe_y_mode_probs, 4); ++ memcpy(hdr->entropy_header.uv_mode_probs, keyframe_uv_mode_probs, 3); ++ } else { ++ for (i = 0; i < 4; i++) ++ hdr->entropy_header.y_mode_probs[i] = s->prob->pred16x16[i]; ++ for (i = 0; i < 3; i++) ++ hdr->entropy_header.uv_mode_probs[i] = s->prob->pred8x8c[i]; ++ } ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < 19; j++) ++ hdr->entropy_header.mv_probs[i][j] = s->prob->mvc[i][j]; ++ ++ for (i = 0; i < 4; i++) { ++ for (j = 0; j < 8; j++) { ++ static const int coeff_bands_inverse[8] = { ++ 0, 1, 2, 3, 5, 6, 4, 15 ++ }; ++ int coeff_pos = coeff_bands_inverse[j]; ++ ++ for (k = 0; k < 3; k++) { ++ memcpy(hdr->entropy_header.coeff_probs[i][j][k], ++ s->prob->token[i][coeff_pos][k], 11); ++ } ++ } ++ } ++ ++ hdr->quant_header.y_ac_qi = s->quant.yac_qi; ++ hdr->quant_header.y_dc_delta = s->quant.ydc_delta; ++ hdr->quant_header.y2_dc_delta = s->quant.y2dc_delta; ++ hdr->quant_header.y2_ac_delta = s->quant.y2ac_delta; ++ hdr->quant_header.uv_dc_delta = s->quant.uvdc_delta; ++ hdr->quant_header.uv_ac_delta = s->quant.uvac_delta; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, buffer, size); ++} ++ ++static int v4l2_request_vp8_init(AVCodecContext *avctx) ++{ ++ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_VP8_FRAME, 2 * 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_vp8_v4l2request_hwaccel = { ++ .name = "vp8_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_VP8, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_vp8_start_frame, ++ .decode_slice = v4l2_request_vp8_decode_slice, ++ .end_frame = v4l2_request_vp8_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsVP8), ++ .init = v4l2_request_vp8_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c +index bab4223aca..0e1edb46fb 100644 +--- a/libavcodec/vp8.c ++++ b/libavcodec/vp8.c +@@ -175,6 +175,9 @@ static enum AVPixelFormat get_pixel_format(VP8Context *s) + #endif + #if CONFIG_VP8_NVDEC_HWACCEL + AV_PIX_FMT_CUDA, ++#endif ++#if CONFIG_VP8_V4L2REQUEST_HWACCEL ++ AV_PIX_FMT_DRM_PRIME, + #endif + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE, +@@ -198,7 +201,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7) + return ret; + } + +- if (!s->actually_webp && !is_vp7) { ++ if (!s->actually_webp && !is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) { + s->pix_fmt = get_pixel_format(s); + if (s->pix_fmt < 0) + return AVERROR(EINVAL); +@@ -2968,6 +2971,9 @@ AVCodec ff_vp8_decoder = { + #endif + #if CONFIG_VP8_NVDEC_HWACCEL + HWACCEL_NVDEC(vp8), ++#endif ++#if CONFIG_VP8_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(vp8), + #endif + NULL + }, + +From d1cbb6de7dd7462fb696160612ae45623c61265c Mon Sep 17 00:00:00 2001 +From: Ezequiel Garcia +Date: Wed, 20 Feb 2019 11:18:00 -0300 +Subject: [PATCH 07/18] avcodec/h264: parse idr_pic_id + +Signed-off-by: Ezequiel Garcia +--- + libavcodec/h264_slice.c | 2 +- + 
libavcodec/h264dec.h | 2 ++ + libavcodec/v4l2_request_h264.c | 2 +- + 3 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c +index 3ae11ac8a7..96e8edd102 100644 +--- a/libavcodec/h264_slice.c ++++ b/libavcodec/h264_slice.c +@@ -1822,7 +1822,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + } + + if (nal->type == H264_NAL_IDR_SLICE) +- get_ue_golomb_long(&sl->gb); /* idr_pic_id */ ++ sl->idr_pic_id = get_ue_golomb_long(&sl->gb); + + if (sps->poc_type == 0) { + sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb); +diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h +index a419615124..aebc5ed2f6 100644 +--- a/libavcodec/h264dec.h ++++ b/libavcodec/h264dec.h +@@ -190,6 +190,8 @@ typedef struct H264SliceContext { + int slice_type_nos; ///< S free slice type (SI/SP are remapped to I/P) + int slice_type_fixed; + ++ int idr_pic_id; ++ + int qscale; + int chroma_qp[2]; // QPc + int qp_thresh; ///< QP threshold to skip loopfilter +diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c +index 94b9aca8ad..9382e573b4 100644 +--- a/libavcodec/v4l2_request_h264.c ++++ b/libavcodec/v4l2_request_h264.c +@@ -303,7 +303,7 @@ static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t * + .pic_parameter_set_id = sl->pps_id, + .colour_plane_id = 0, /* what is this? */ + .frame_num = h->poc.frame_num, +- .idr_pic_id = 0, /* what is this? */ ++ .idr_pic_id = sl->idr_pic_id, + .pic_order_cnt_lsb = sl->poc_lsb, + .delta_pic_order_cnt_bottom = sl->delta_poc_bottom, + .delta_pic_order_cnt0 = sl->delta_poc[0], + +From 88da95ec126bdf5ffbc4399c7e453a2248e79119 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Wed, 22 May 2019 14:44:22 +0200 +Subject: [PATCH 08/18] avcodec/h264: parse ref_pic_marking_size_in_bits and + pic_order_cnt_bit_size + +Signed-off-by: Boris Brezillon +--- + libavcodec/h264_slice.c | 6 +++++- + libavcodec/h264dec.h | 2 ++ + libavcodec/v4l2_request_h264.c | 4 ++-- + 3 files changed, 9 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c +index 96e8edd102..c3896cfd90 100644 +--- a/libavcodec/h264_slice.c ++++ b/libavcodec/h264_slice.c +@@ -1740,7 +1740,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + unsigned int slice_type, tmp, i; + int field_pic_flag, bottom_field_flag; + int first_slice = sl == h->slice_ctx && !h->current_slice; +- int picture_structure; ++ int picture_structure, pos; + + if (first_slice) + av_assert0(!h->setup_finished); +@@ -1824,6 +1824,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + if (nal->type == H264_NAL_IDR_SLICE) + sl->idr_pic_id = get_ue_golomb_long(&sl->gb); + ++ pos = sl->gb.index; + if (sps->poc_type == 0) { + sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb); + +@@ -1837,6 +1838,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + if (pps->pic_order_present == 1 && picture_structure == PICT_FRAME) + sl->delta_poc[1] = get_se_golomb(&sl->gb); + } ++ sl->pic_order_cnt_bit_size = sl->gb.index - pos; + + sl->redundant_pic_count = 0; + if (pps->redundant_pic_cnt_present) +@@ -1876,9 +1878,11 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + + sl->explicit_ref_marking = 0; + if (nal->ref_idc) { ++ int bit_pos = sl->gb.index; + ret = ff_h264_decode_ref_pic_marking(sl, &sl->gb, nal, h->avctx); + if (ret < 0 && (h->avctx->err_recognition 
& AV_EF_EXPLODE)) + return AVERROR_INVALIDDATA; ++ sl->ref_pic_marking_size_in_bits = sl->gb.index - bit_pos; + } + + if (sl->slice_type_nos != AV_PICTURE_TYPE_I && pps->cabac) { +diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h +index aebc5ed2f6..b3dcd6e7da 100644 +--- a/libavcodec/h264dec.h ++++ b/libavcodec/h264dec.h +@@ -330,11 +330,13 @@ typedef struct H264SliceContext { + MMCO mmco[MAX_MMCO_COUNT]; + int nb_mmco; + int explicit_ref_marking; ++ int ref_pic_marking_size_in_bits; + + int frame_num; + int poc_lsb; + int delta_poc_bottom; + int delta_poc[2]; ++ int pic_order_cnt_bit_size; + int curr_pic_num; + int max_pic_num; + } H264SliceContext; +diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c +index 9382e573b4..bdaeb67d26 100644 +--- a/libavcodec/v4l2_request_h264.c ++++ b/libavcodec/v4l2_request_h264.c +@@ -311,9 +311,9 @@ static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t * + .redundant_pic_cnt = sl->redundant_pic_count, + + /* Size in bits of dec_ref_pic_marking() syntax element. */ +- .dec_ref_pic_marking_bit_size = 0, ++ .dec_ref_pic_marking_bit_size = sl->ref_pic_marking_size_in_bits, + /* Size in bits of pic order count syntax. */ +- .pic_order_cnt_bit_size = 0, ++ .pic_order_cnt_bit_size = sl->pic_order_cnt_bit_size, + + .cabac_init_idc = sl->cabac_init_idc, + .slice_qp_delta = sl->qscale - pps->init_qp, + +From afd9b1ffb7a2805423d888c49ea13ee5ffc95994 Mon Sep 17 00:00:00 2001 +From: Jernej Skrabec +Date: Thu, 14 Feb 2019 23:20:05 +0100 +Subject: [PATCH 09/18] Add and use private linux headers for V4L2 request API + ctrls + +Signed-off-by: Jernej Skrabec +--- + configure | 6 +- + libavcodec/h264-ctrls.h | 210 +++++++++++++++++++++++++++++++ + libavcodec/hevc-ctrls.h | 212 ++++++++++++++++++++++++++++++++ + libavcodec/mpeg2-ctrls.h | 82 ++++++++++++ + libavcodec/v4l2_request_h264.c | 1 + + libavcodec/v4l2_request_hevc.c | 1 + + libavcodec/v4l2_request_mpeg2.c | 1 + + libavcodec/v4l2_request_vp8.c | 1 + + libavcodec/vp8-ctrls.h | 112 +++++++++++++++++ + 9 files changed, 623 insertions(+), 3 deletions(-) + create mode 100644 libavcodec/h264-ctrls.h + create mode 100644 libavcodec/hevc-ctrls.h + create mode 100644 libavcodec/mpeg2-ctrls.h + create mode 100644 libavcodec/vp8-ctrls.h + +diff --git a/configure b/configure +index cbb91c2bca..623012757c 100755 +--- a/configure ++++ b/configure +@@ -2925,7 +2925,7 @@ h264_dxva2_hwaccel_deps="dxva2" + h264_dxva2_hwaccel_select="h264_decoder" + h264_nvdec_hwaccel_deps="nvdec" + h264_nvdec_hwaccel_select="h264_decoder" +-h264_v4l2request_hwaccel_deps="v4l2_request h264_v4l2_request" ++h264_v4l2request_hwaccel_deps="v4l2_request" + h264_v4l2request_hwaccel_select="h264_decoder" + h264_vaapi_hwaccel_deps="vaapi" + h264_vaapi_hwaccel_select="h264_decoder" +@@ -2941,7 +2941,7 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" + hevc_dxva2_hwaccel_select="hevc_decoder" + hevc_nvdec_hwaccel_deps="nvdec" + hevc_nvdec_hwaccel_select="hevc_decoder" +-hevc_v4l2request_hwaccel_deps="v4l2_request hevc_v4l2_request" ++hevc_v4l2request_hwaccel_deps="v4l2_request" + hevc_v4l2request_hwaccel_select="hevc_decoder" + hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" + hevc_vaapi_hwaccel_select="hevc_decoder" +@@ -3003,7 +3003,7 @@ vc1_vdpau_hwaccel_deps="vdpau" + vc1_vdpau_hwaccel_select="vc1_decoder" + vp8_nvdec_hwaccel_deps="nvdec" + vp8_nvdec_hwaccel_select="vp8_decoder" +-vp8_v4l2request_hwaccel_deps="v4l2_request vp8_v4l2_request" 
++vp8_v4l2request_hwaccel_deps="v4l2_request" + vp8_v4l2request_hwaccel_select="vp8_decoder" + vp8_vaapi_hwaccel_deps="vaapi" + vp8_vaapi_hwaccel_select="vp8_decoder" +diff --git a/libavcodec/h264-ctrls.h b/libavcodec/h264-ctrls.h +new file mode 100644 +index 0000000000..e877bf1d53 +--- /dev/null ++++ b/libavcodec/h264-ctrls.h +@@ -0,0 +1,210 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the H.264 state controls for use with stateless H.264 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _H264_CTRLS_H_ ++#define _H264_CTRLS_H_ ++ ++#include <linux/types.h> ++ ++/* Our pixel format isn't stable at the moment */ ++#define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */ ++ ++/* ++ * This is put insanely high to avoid conflicting with controls that ++ * would be added during the phase where those controls are not ++ * stable. It should be fixed eventually. ++ */ ++#define V4L2_CID_MPEG_VIDEO_H264_SPS (V4L2_CID_MPEG_BASE+1000) ++#define V4L2_CID_MPEG_VIDEO_H264_PPS (V4L2_CID_MPEG_BASE+1001) ++#define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX (V4L2_CID_MPEG_BASE+1002) ++#define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS (V4L2_CID_MPEG_BASE+1003) ++#define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS (V4L2_CID_MPEG_BASE+1004) ++#define V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE (V4L2_CID_MPEG_BASE+1005) ++#define V4L2_CID_MPEG_VIDEO_H264_START_CODE (V4L2_CID_MPEG_BASE+1006) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_H264_SPS 0x0110 ++#define V4L2_CTRL_TYPE_H264_PPS 0x0111 ++#define V4L2_CTRL_TYPE_H264_SCALING_MATRIX 0x0112 ++#define V4L2_CTRL_TYPE_H264_SLICE_PARAMS 0x0113 ++#define V4L2_CTRL_TYPE_H264_DECODE_PARAMS 0x0114 ++ ++enum v4l2_mpeg_video_h264_decode_mode { ++ V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_h264_start_code { ++ V4L2_MPEG_VIDEO_H264_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG 0x01 ++#define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG 0x02 ++#define V4L2_H264_SPS_CONSTRAINT_SET2_FLAG 0x04 ++#define V4L2_H264_SPS_CONSTRAINT_SET3_FLAG 0x08 ++#define V4L2_H264_SPS_CONSTRAINT_SET4_FLAG 0x10 ++#define V4L2_H264_SPS_CONSTRAINT_SET5_FLAG 0x20 ++ ++#define V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE 0x01 ++#define V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS 0x02 ++#define V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO 0x04 ++#define V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED 0x08 ++#define V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY 0x10 ++#define V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD 0x20 ++#define V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE 0x40 ++ ++struct v4l2_ctrl_h264_sps { ++ __u8 profile_idc; ++ __u8 constraint_set_flags; ++ __u8 level_idc; ++ __u8 seq_parameter_set_id; ++ __u8 chroma_format_idc; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_frame_num_minus4; ++ __u8 pic_order_cnt_type; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 max_num_ref_frames; ++ __u8 num_ref_frames_in_pic_order_cnt_cycle; ++ __s32 offset_for_ref_frame[255]; ++ __s32 offset_for_non_ref_pic; ++ __s32 offset_for_top_to_bottom_field; ++ __u16 pic_width_in_mbs_minus1; ++ __u16 pic_height_in_map_units_minus1; ++ __u32 flags; ++}; ++ ++#define V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE 0x0001 ++#define 
V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT 0x0002 ++#define V4L2_H264_PPS_FLAG_WEIGHTED_PRED 0x0004 ++#define V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT 0x0008 ++#define V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED 0x0010 ++#define V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT 0x0020 ++#define V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE 0x0040 ++#define V4L2_H264_PPS_FLAG_PIC_SCALING_MATRIX_PRESENT 0x0080 ++ ++struct v4l2_ctrl_h264_pps { ++ __u8 pic_parameter_set_id; ++ __u8 seq_parameter_set_id; ++ __u8 num_slice_groups_minus1; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __u8 weighted_bipred_idc; ++ __s8 pic_init_qp_minus26; ++ __s8 pic_init_qs_minus26; ++ __s8 chroma_qp_index_offset; ++ __s8 second_chroma_qp_index_offset; ++ __u16 flags; ++}; ++ ++struct v4l2_ctrl_h264_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++}; ++ ++struct v4l2_h264_weight_factors { ++ __s16 luma_weight[32]; ++ __s16 luma_offset[32]; ++ __s16 chroma_weight[32][2]; ++ __s16 chroma_offset[32][2]; ++}; ++ ++struct v4l2_h264_pred_weight_table { ++ __u16 luma_log2_weight_denom; ++ __u16 chroma_log2_weight_denom; ++ struct v4l2_h264_weight_factors weight_factors[2]; ++}; ++ ++#define V4L2_H264_SLICE_TYPE_P 0 ++#define V4L2_H264_SLICE_TYPE_B 1 ++#define V4L2_H264_SLICE_TYPE_I 2 ++#define V4L2_H264_SLICE_TYPE_SP 3 ++#define V4L2_H264_SLICE_TYPE_SI 4 ++ ++#define V4L2_H264_SLICE_FLAG_FIELD_PIC 0x01 ++#define V4L2_H264_SLICE_FLAG_BOTTOM_FIELD 0x02 ++#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x04 ++#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x08 ++ ++struct v4l2_ctrl_h264_slice_params { ++ /* Size in bytes, including header */ ++ __u32 size; ++ ++ /* Offset in bytes to the start of slice in the OUTPUT buffer. */ ++ __u32 start_byte_offset; ++ ++ /* Offset in bits to slice_data() from the beginning of this slice. */ ++ __u32 header_bit_size; ++ ++ __u16 first_mb_in_slice; ++ __u8 slice_type; ++ __u8 pic_parameter_set_id; ++ __u8 colour_plane_id; ++ __u8 redundant_pic_cnt; ++ __u16 frame_num; ++ __u16 idr_pic_id; ++ __u16 pic_order_cnt_lsb; ++ __s32 delta_pic_order_cnt_bottom; ++ __s32 delta_pic_order_cnt0; ++ __s32 delta_pic_order_cnt1; ++ ++ struct v4l2_h264_pred_weight_table pred_weight_table; ++ /* Size in bits of dec_ref_pic_marking() syntax element. */ ++ __u32 dec_ref_pic_marking_bit_size; ++ /* Size in bits of pic order count syntax. */ ++ __u32 pic_order_cnt_bit_size; ++ ++ __u8 cabac_init_idc; ++ __s8 slice_qp_delta; ++ __s8 slice_qs_delta; ++ __u8 disable_deblocking_filter_idc; ++ __s8 slice_alpha_c0_offset_div2; ++ __s8 slice_beta_offset_div2; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u32 slice_group_change_cycle; ++ ++ /* ++ * Entries on each list are indices into ++ * v4l2_ctrl_h264_decode_params.dpb[]. 
++ */ ++ __u8 ref_pic_list0[32]; ++ __u8 ref_pic_list1[32]; ++ ++ __u32 flags; ++}; ++ ++#define V4L2_H264_DPB_ENTRY_FLAG_VALID 0x01 ++#define V4L2_H264_DPB_ENTRY_FLAG_ACTIVE 0x02 ++#define V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM 0x04 ++ ++struct v4l2_h264_dpb_entry { ++ __u64 reference_ts; ++ __u16 frame_num; ++ __u16 pic_num; ++ /* Note that field is indicated by v4l2_buffer.field */ ++ __s32 top_field_order_cnt; ++ __s32 bottom_field_order_cnt; ++ __u32 flags; /* V4L2_H264_DPB_ENTRY_FLAG_* */ ++}; ++ ++#define V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC 0x01 ++ ++struct v4l2_ctrl_h264_decode_params { ++ struct v4l2_h264_dpb_entry dpb[16]; ++ __u16 num_slices; ++ __u16 nal_ref_idc; ++ __s32 top_field_order_cnt; ++ __s32 bottom_field_order_cnt; ++ __u32 flags; /* V4L2_H264_DECODE_PARAM_FLAG_* */ ++}; ++ ++#endif +diff --git a/libavcodec/hevc-ctrls.h b/libavcodec/hevc-ctrls.h +new file mode 100644 +index 0000000000..1009cf0891 +--- /dev/null ++++ b/libavcodec/hevc-ctrls.h +@@ -0,0 +1,212 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include <linux/types.h> ++ ++/* The pixel format isn't stable at the moment and will likely be renamed. */ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. */ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ ++ __u8 padding; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 ++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 rps; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 num_active_dpb_entries; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 num_rps_poc_st_curr_before; ++ __u8 num_rps_poc_st_curr_after; ++ __u8 num_rps_poc_lt_curr; ++ ++ __u8 padding; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++#endif +diff --git a/libavcodec/mpeg2-ctrls.h b/libavcodec/mpeg2-ctrls.h +new file mode 100644 +index 0000000000..6601455b3d +--- /dev/null ++++ b/libavcodec/mpeg2-ctrls.h +@@ -0,0 +1,82 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the MPEG2 state controls for use with stateless MPEG-2 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _MPEG2_CTRLS_H_ ++#define _MPEG2_CTRLS_H_ ++ ++#define V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (V4L2_CID_MPEG_BASE+250) ++#define V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION (V4L2_CID_MPEG_BASE+251) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS 0x0103 ++#define V4L2_CTRL_TYPE_MPEG2_QUANTIZATION 0x0104 ++ ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_I 1 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_P 2 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_B 3 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_D 4 ++ ++struct v4l2_mpeg2_sequence { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */ ++ __u16 horizontal_size; ++ __u16 vertical_size; ++ __u32 vbv_buffer_size; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ ++ __u16 profile_and_level_indication; ++ __u8 progressive_sequence; ++ __u8 chroma_format; ++}; ++ ++struct v4l2_mpeg2_picture { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ ++ __u8 picture_coding_type; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ ++ __u8 f_code[2][2]; ++ __u8 intra_dc_precision; ++ __u8 picture_structure; ++ __u8 top_field_first; ++ __u8 frame_pred_frame_dct; ++ __u8 concealment_motion_vectors; ++ __u8 q_scale_type; ++ __u8 intra_vlc_format; ++ __u8 alternate_scan; ++ __u8 repeat_first_field; ++ __u16 progressive_frame; ++}; ++ ++struct v4l2_ctrl_mpeg2_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ __u64 backward_ref_ts; ++ __u64 forward_ref_ts; ++ ++ struct v4l2_mpeg2_sequence sequence; ++ struct v4l2_mpeg2_picture picture; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ ++ __u32 quantiser_scale_code; ++}; ++ ++struct v4l2_ctrl_mpeg2_quantization { ++ /* ISO/IEC 13818-2, ITU-T Rec. 
H.262: Quant matrix extension */ ++ __u8 load_intra_quantiser_matrix; ++ __u8 load_non_intra_quantiser_matrix; ++ __u8 load_chroma_intra_quantiser_matrix; ++ __u8 load_chroma_non_intra_quantiser_matrix; ++ ++ __u8 intra_quantiser_matrix[64]; ++ __u8 non_intra_quantiser_matrix[64]; ++ __u8 chroma_intra_quantiser_matrix[64]; ++ __u8 chroma_non_intra_quantiser_matrix[64]; ++}; ++ ++#endif +diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c +index bdaeb67d26..0254716e52 100644 +--- a/libavcodec/v4l2_request_h264.c ++++ b/libavcodec/v4l2_request_h264.c +@@ -19,6 +19,7 @@ + #include "h264dec.h" + #include "hwconfig.h" + #include "v4l2_request.h" ++#include "h264-ctrls.h" + + typedef struct V4L2RequestControlsH264 { + struct v4l2_ctrl_h264_sps sps; +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index f724909546..c16f8a868e 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -19,6 +19,7 @@ + #include "hevcdec.h" + #include "hwconfig.h" + #include "v4l2_request.h" ++#include "hevc-ctrls.h" + + #define MAX_SLICES 16 + +diff --git a/libavcodec/v4l2_request_mpeg2.c b/libavcodec/v4l2_request_mpeg2.c +index 88d86cc4c2..bc251a6fd2 100644 +--- a/libavcodec/v4l2_request_mpeg2.c ++++ b/libavcodec/v4l2_request_mpeg2.c +@@ -19,6 +19,7 @@ + #include "hwconfig.h" + #include "mpegvideo.h" + #include "v4l2_request.h" ++#include "mpeg2-ctrls.h" + + typedef struct V4L2RequestControlsMPEG2 { + struct v4l2_ctrl_mpeg2_slice_params slice_params; +diff --git a/libavcodec/v4l2_request_vp8.c b/libavcodec/v4l2_request_vp8.c +index 7e75ee398a..ea2c55fa2f 100644 +--- a/libavcodec/v4l2_request_vp8.c ++++ b/libavcodec/v4l2_request_vp8.c +@@ -19,6 +19,7 @@ + #include "hwconfig.h" + #include "v4l2_request.h" + #include "vp8.h" ++#include "vp8-ctrls.h" + + typedef struct V4L2RequestControlsVP8 { + struct v4l2_ctrl_vp8_frame_header ctrl; +diff --git a/libavcodec/vp8-ctrls.h b/libavcodec/vp8-ctrls.h +new file mode 100644 +index 0000000000..53cba826e4 +--- /dev/null ++++ b/libavcodec/vp8-ctrls.h +@@ -0,0 +1,112 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the VP8 state controls for use with stateless VP8 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. 
++ */ ++ ++#ifndef _VP8_CTRLS_H_ ++#define _VP8_CTRLS_H_ ++ ++#include <linux/types.h> ++ ++#define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F') ++ ++#define V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER (V4L2_CID_MPEG_BASE + 2000) ++#define V4L2_CTRL_TYPE_VP8_FRAME_HEADER 0x301 ++ ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED 0x01 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP 0x02 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA 0x04 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE 0x08 ++ ++struct v4l2_vp8_segment_header { ++ __s8 quant_update[4]; ++ __s8 lf_update[4]; ++ __u8 segment_probs[3]; ++ __u8 padding; ++ __u32 flags; ++}; ++ ++#define V4L2_VP8_LF_HEADER_ADJ_ENABLE 0x01 ++#define V4L2_VP8_LF_HEADER_DELTA_UPDATE 0x02 ++#define V4L2_VP8_LF_FILTER_TYPE_SIMPLE 0x04 ++struct v4l2_vp8_loopfilter_header { ++ __s8 ref_frm_delta[4]; ++ __s8 mb_mode_delta[4]; ++ __u8 sharpness_level; ++ __u8 level; ++ __u16 padding; ++ __u32 flags; ++}; ++ ++struct v4l2_vp8_quantization_header { ++ __u8 y_ac_qi; ++ __s8 y_dc_delta; ++ __s8 y2_dc_delta; ++ __s8 y2_ac_delta; ++ __s8 uv_dc_delta; ++ __s8 uv_ac_delta; ++ __u16 padding; ++}; ++ ++struct v4l2_vp8_entropy_header { ++ __u8 coeff_probs[4][8][3][11]; ++ __u8 y_mode_probs[4]; ++ __u8 uv_mode_probs[3]; ++ __u8 mv_probs[2][19]; ++ __u8 padding[3]; ++}; ++ ++struct v4l2_vp8_entropy_coder_state { ++ __u8 range; ++ __u8 value; ++ __u8 bit_count; ++ __u8 padding; ++}; ++ ++#define V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME 0x01 ++#define V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL 0x02 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME 0x04 ++#define V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF 0x08 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN 0x10 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT 0x20 ++ ++#define VP8_FRAME_IS_KEY_FRAME(hdr) \ ++ (!!((hdr)->flags & V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME)) ++ ++struct v4l2_ctrl_vp8_frame_header { ++ struct v4l2_vp8_segment_header segment_header; ++ struct v4l2_vp8_loopfilter_header lf_header; ++ struct v4l2_vp8_quantization_header quant_header; ++ struct v4l2_vp8_entropy_header entropy_header; ++ struct v4l2_vp8_entropy_coder_state coder_state; ++ ++ __u16 width; ++ __u16 height; ++ ++ __u8 horizontal_scale; ++ __u8 vertical_scale; ++ ++ __u8 version; ++ __u8 prob_skip_false; ++ __u8 prob_intra; ++ __u8 prob_last; ++ __u8 prob_gf; ++ __u8 num_dct_parts; ++ ++ __u32 first_part_size; ++ __u32 first_part_header_bits; ++ __u32 dct_part_sizes[8]; ++ ++ __u64 last_frame_ts; ++ __u64 golden_frame_ts; ++ __u64 alt_frame_ts; ++ ++ __u64 flags; ++}; ++ ++#endif + +From 4b5474250e1adb4931afb6418403def0d914aaea Mon Sep 17 00:00:00 2001 +From: Jonas Karlman +Date: Sat, 2 May 2020 11:00:26 +0000 +Subject: [PATCH 10/18] Update to v5.7 private linux headers + +--- + libavcodec/h264-ctrls.h | 2 ++ + libavcodec/v4l2_request_h264.c | 8 +++++++- + 2 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/h264-ctrls.h b/libavcodec/h264-ctrls.h +index e877bf1d53..1c6ff7d63b 100644 +--- a/libavcodec/h264-ctrls.h ++++ b/libavcodec/h264-ctrls.h +@@ -185,6 +185,8 @@ struct v4l2_ctrl_h264_slice_params { + #define V4L2_H264_DPB_ENTRY_FLAG_VALID 0x01 + #define V4L2_H264_DPB_ENTRY_FLAG_ACTIVE 0x02 + #define V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM 0x04 ++#define V4L2_H264_DPB_ENTRY_FLAG_FIELD 0x08 ++#define V4L2_H264_DPB_ENTRY_FLAG_BOTTOM_FIELD 0x10 + + struct v4l2_h264_dpb_entry { + __u64 reference_ts; +diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c +index 0254716e52..d28ed07da3 100644 +--- 
a/libavcodec/v4l2_request_h264.c ++++ b/libavcodec/v4l2_request_h264.c +@@ -67,8 +67,14 @@ static void fill_dpb_entry(struct v4l2_h264_dpb_entry *entry, const H264Picture + entry->frame_num = pic->frame_num; + entry->pic_num = pic->pic_id; + entry->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID; +- if (pic->reference) ++ if (pic->reference) { + entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_ACTIVE; ++ if (pic->reference != PICT_FRAME) { ++ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_FIELD; ++ if (pic->reference == PICT_BOTTOM_FIELD) ++ entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_BOTTOM_FIELD; ++ } ++ } + if (pic->long_ref) + entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM; + if (pic->field_poc[0] != INT_MAX) + +From 5044b279b44f20c58cb92600d751984de6c8111c Mon Sep 17 00:00:00 2001 +From: Jonas Karlman +Date: Sat, 2 May 2020 22:03:42 +0000 +Subject: [PATCH 11/18] Update to v5.8 private linux headers + +--- + libavcodec/h264-ctrls.h | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/h264-ctrls.h b/libavcodec/h264-ctrls.h +index 1c6ff7d63b..080fd1293c 100644 +--- a/libavcodec/h264-ctrls.h ++++ b/libavcodec/h264-ctrls.h +@@ -13,6 +13,12 @@ + + #include <linux/types.h> + ++/* ++ * Maximum DPB size, as specified by section 'A.3.1 Level limits ++ * common to the Baseline, Main, and Extended profiles'. ++ */ ++#define V4L2_H264_NUM_DPB_ENTRIES 16 ++ + /* Our pixel format isn't stable at the moment */ + #define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */ + +@@ -201,7 +207,7 @@ struct v4l2_h264_dpb_entry { + #define V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC 0x01 + + struct v4l2_ctrl_h264_decode_params { +- struct v4l2_h264_dpb_entry dpb[16]; ++ struct v4l2_h264_dpb_entry dpb[V4L2_H264_NUM_DPB_ENTRIES]; + __u16 num_slices; + __u16 nal_ref_idc; + __s32 top_field_order_cnt; + +From 3503a60af592944589b2beae7ce004884dfe04e2 Mon Sep 17 00:00:00 2001 +From: Jonas Karlman +Date: Mon, 29 Apr 2019 22:08:59 +0000 +Subject: [PATCH 12/18] HACK: hwcontext_drm: do not require drm device + +Signed-off-by: Jonas Karlman +--- + libavutil/hwcontext_drm.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c +index 32cbde82eb..aa4794c5e6 100644 +--- a/libavutil/hwcontext_drm.c ++++ b/libavutil/hwcontext_drm.c +@@ -43,6 +43,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, + AVDRMDeviceContext *hwctx = hwdev->hwctx; + drmVersionPtr version; + ++ if (device == NULL) { ++ hwctx->fd = -1; ++ return 0; ++ } ++ + hwctx->fd = open(device, O_RDWR); + if (hwctx->fd < 0) + return AVERROR(errno); + +From 9e5907d59c23f5ccd4c48cfe37775411ce308107 Mon Sep 17 00:00:00 2001 +From: Jernej Skrabec +Date: Sat, 15 Dec 2018 22:32:16 +0100 +Subject: [PATCH 13/18] WIP: h264 field reference + +Signed-off-by: Jernej Skrabec +Signed-off-by: Jonas Karlman +--- + libavcodec/v4l2_request_h264.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c +index d28ed07da3..5b0f21a60d 100644 +--- a/libavcodec/v4l2_request_h264.c ++++ b/libavcodec/v4l2_request_h264.c +@@ -116,7 +116,8 @@ static uint8_t get_dpb_index(struct v4l2_ctrl_h264_decode_params *decode, const + struct v4l2_h264_dpb_entry *entry = &decode->dpb[i]; + if ((entry->flags & V4L2_H264_DPB_ENTRY_FLAG_VALID) && + entry->reference_ts == timestamp) +- return i; ++ // TODO: signal reference type, possibly using top 2 bits ++ return i | ((ref->reference & 3) << 6); + } + + return 0; + 
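+
+Note on the get_dpb_index() change above: the low six bits of the returned
+value still index v4l2_ctrl_h264_decode_params.dpb[], while the top two bits
+now carry the FFmpeg PICT_* reference type (1 = top field, 2 = bottom field,
+3 = frame). A minimal sketch of the unpacking side, assuming a consumer
+honours the same 6+2 bit split; the helper names below are illustrative
+only, not taken from any kernel driver:
+
+    #include <stdint.h>
+
+    #define DPB_IDX_MASK  0x3f  /* low 6 bits: index into dpb[] */
+    #define DPB_REF_SHIFT 6     /* top 2 bits: PICT_* reference type */
+
+    /* which DPB slot the packed value points at */
+    static inline uint8_t dpb_entry_index(uint8_t packed)
+    {
+        return packed & DPB_IDX_MASK;
+    }
+
+    /* reference type: 1 = top field, 2 = bottom field, 3 = frame */
+    static inline uint8_t dpb_reference_type(uint8_t packed)
+    {
+        return packed >> DPB_REF_SHIFT;
+    }
+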
+From 9cf9b825c33690656331a9693b3132d1d82b75a8 Mon Sep 17 00:00:00 2001 +From: Jernej Skrabec +Date: Sat, 15 Dec 2018 22:32:16 +0100 +Subject: [PATCH 14/18] WIP: hevc scaling matrix + +Signed-off-by: Jernej Skrabec +--- + libavcodec/hevc-ctrls.h | 11 +++++++++++ + libavcodec/v4l2_request_hevc.c | 22 ++++++++++++++++++++++ + 2 files changed, 33 insertions(+) + +diff --git a/libavcodec/hevc-ctrls.h b/libavcodec/hevc-ctrls.h +index 1009cf0891..1592e52c36 100644 +--- a/libavcodec/hevc-ctrls.h ++++ b/libavcodec/hevc-ctrls.h +@@ -19,6 +19,7 @@ + #define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) + #define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) + #define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) + #define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) + #define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) + +@@ -26,6 +27,7 @@ + #define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 + #define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 + #define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 + + enum v4l2_mpeg_video_hevc_decode_mode { + V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, +@@ -209,4 +211,13 @@ struct v4l2_ctrl_hevc_slice_params { + __u64 flags; + }; + ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ + #endif +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index c16f8a868e..f400bf4f3c 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -26,6 +26,7 @@ + typedef struct V4L2RequestControlsHEVC { + struct v4l2_ctrl_hevc_sps sps; + struct v4l2_ctrl_hevc_pps pps; ++ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; + struct v4l2_ctrl_hevc_slice_params slice_params[MAX_SLICES]; + int first_slice; + int num_slices; //TODO: this should be in control +@@ -295,6 +296,22 @@ static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, + + fill_sps(&controls->sps, h); + ++ if (sl) { ++ for (int i = 0; i < 6; i++) { ++ for (int j = 0; j < 16; j++) ++ controls->scaling_matrix.scaling_list_4x4[i][j] = sl->sl[0][i][j]; ++ for (int j = 0; j < 64; j++) { ++ controls->scaling_matrix.scaling_list_8x8[i][j] = sl->sl[1][i][j]; ++ controls->scaling_matrix.scaling_list_16x16[i][j] = sl->sl[2][i][j]; ++ if (i < 2) ++ controls->scaling_matrix.scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; ++ } ++ controls->scaling_matrix.scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; ++ if (i < 2) ++ controls->scaling_matrix.scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; ++ } ++ } ++ + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ + controls->pps = (struct v4l2_ctrl_hevc_pps) { + .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, +@@ -398,6 +415,11 @@ static int v4l2_request_hevc_queue_decode(AVCodecContext *avctx, int last_slice) + .ptr = &controls->pps, + .size = sizeof(controls->pps), + }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }, + { + .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, + .ptr = &controls->slice_params, + +From 572a336f459070be3ac0ecc15a11057c6be6cb61 Mon Sep 17 00:00:00 2001 +From: Jernej Skrabec +Date: Sat, 15 Dec 2018 22:32:16 +0100 +Subject: [PATCH 15/18] WIP: hevc segment address + +Signed-off-by: Jernej Skrabec +--- + libavcodec/hevc-ctrls.h | 5 ++++- + libavcodec/v4l2_request_hevc.c | 3 +++ + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/hevc-ctrls.h b/libavcodec/hevc-ctrls.h +index 1592e52c36..3e2e320983 100644 +--- a/libavcodec/hevc-ctrls.h ++++ b/libavcodec/hevc-ctrls.h +@@ -167,6 +167,9 @@ struct v4l2_ctrl_hevc_slice_params { + __u32 bit_size; + __u32 data_bit_offset; + ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; + __u8 nuh_temporal_id_plus1; +@@ -200,7 +203,7 @@ struct v4l2_ctrl_hevc_slice_params { + __u8 num_rps_poc_st_curr_after; + __u8 num_rps_poc_lt_curr; + +- __u8 padding; ++ __u8 padding[5]; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index f400bf4f3c..98222fc74c 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -138,6 +138,9 @@ static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h, + .bit_size = 0, + .data_bit_offset = get_bits_count(&h->HEVClc->gb), + ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_segment_addr = sh->slice_segment_addr, ++ + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + .nal_unit_type = h->nal_unit_type, + .nuh_temporal_id_plus1 = h->temporal_id + 1, + +From 8777f29fef8e41f305ecf54dbc9908fca2e5d98a Mon Sep 17 00:00:00 2001 +From: Jernej Skrabec +Date: Sat, 15 Dec 2018 22:32:16 +0100 +Subject: [PATCH 16/18] WIP: hevc entry point offsets + +Signed-off-by: Jernej Skrabec +--- + libavcodec/hevc-ctrls.h | 5 ++++- + libavcodec/v4l2_request_hevc.c | 9 +++++++++ + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/hevc-ctrls.h b/libavcodec/hevc-ctrls.h +index 3e2e320983..d1b094c8aa 100644 +--- a/libavcodec/hevc-ctrls.h ++++ b/libavcodec/hevc-ctrls.h +@@ -169,6 +169,7 @@ struct v4l2_ctrl_hevc_slice_params { + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ + __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; + + /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ + __u8 nal_unit_type; +@@ -203,7 +204,9 @@ struct v4l2_ctrl_hevc_slice_params { + __u8 num_rps_poc_st_curr_after; + __u8 num_rps_poc_lt_curr; + +- __u8 padding[5]; ++ __u8 padding; ++ ++ __u32 entry_point_offset_minus1[256]; + + /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ + struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +index 98222fc74c..7e77c83e4e 100644 +--- a/libavcodec/v4l2_request_hevc.c ++++ b/libavcodec/v4l2_request_hevc.c +@@ -225,6 +225,15 @@ static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h, + } + + v4l2_request_hevc_fill_pred_table(h, &slice_params->pred_weight_table); ++ ++ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++ if (slice_params->num_entry_point_offsets > 256) { ++ slice_params->num_entry_point_offsets = 256; ++ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); ++ } ++ ++ for (i = 0; i < slice_params->num_entry_point_offsets; i++) ++ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; + } + + static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCContext *h) + +From 931d210b66a033eed21dab4be65ae93f2198d9b5 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 12 Dec 2019 16:13:55 +0100 +Subject: [PATCH 17/18] WIP: Add V4L2 request API vp9 hwaccel + +Signed-off-by: Boris Brezillon +--- + configure | 3 + + libavcodec/Makefile | 1 + + libavcodec/hwaccels.h | 1 + + libavcodec/v4l2_request_vp9.c | 352 ++++++++++++++++++++++++++++++++++ + libavcodec/vp9.c | 13 +- + libavcodec/vp9shared.h | 1 + + 6 files changed, 370 insertions(+), 1 deletion(-) + create mode 100644 libavcodec/v4l2_request_vp9.c + +diff --git a/configure b/configure +index 623012757c..2b723df55a 100755 +--- a/configure ++++ b/configure +@@ -3015,6 +3015,8 @@ vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9" + vp9_dxva2_hwaccel_select="vp9_decoder" + vp9_nvdec_hwaccel_deps="nvdec" + vp9_nvdec_hwaccel_select="vp9_decoder" ++vp9_v4l2request_hwaccel_deps="v4l2_request vp9_v4l2_request" ++vp9_v4l2request_hwaccel_select="vp9_decoder" + vp9_vaapi_hwaccel_deps="vaapi VADecPictureParameterBufferVP9_bit_depth" + vp9_vaapi_hwaccel_select="vp9_decoder" + vp9_vdpau_hwaccel_deps="vdpau VdpPictureInfoVP9" +@@ -6579,6 +6581,7 @@ check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" + check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" + check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" + check_cc vp8_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP8_FRAME;" ++check_cc vp9_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP9_FRAME;" + + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index d6af854daa..2f0e0a0976 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -945,6 +945,7 @@ OBJS-$(CONFIG_VP8_VAAPI_HWACCEL) += vaapi_vp8.o + OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL) += dxva2_vp9.o + OBJS-$(CONFIG_VP9_DXVA2_HWACCEL) += dxva2_vp9.o + OBJS-$(CONFIG_VP9_NVDEC_HWACCEL) += nvdec_vp9.o ++OBJS-$(CONFIG_VP9_V4L2REQUEST_HWACCEL) += v4l2_request_vp9.o + OBJS-$(CONFIG_VP9_VAAPI_HWACCEL) += vaapi_vp9.o + OBJS-$(CONFIG_VP9_VDPAU_HWACCEL) += vdpau_vp9.o + OBJS-$(CONFIG_VP8_QSV_HWACCEL) += qsvdec_other.o +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index bd75e94f4c..03a1aefe09 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -71,6 +71,7 @@ extern const AVHWAccel 
ff_vp9_d3d11va_hwaccel; + extern const AVHWAccel ff_vp9_d3d11va2_hwaccel; + extern const AVHWAccel ff_vp9_dxva2_hwaccel; + extern const AVHWAccel ff_vp9_nvdec_hwaccel; ++extern const AVHWAccel ff_vp9_v4l2request_hwaccel; + extern const AVHWAccel ff_vp9_vaapi_hwaccel; + extern const AVHWAccel ff_vp9_vdpau_hwaccel; + extern const AVHWAccel ff_wmv3_d3d11va_hwaccel; +diff --git a/libavcodec/v4l2_request_vp9.c b/libavcodec/v4l2_request_vp9.c +new file mode 100644 +index 0000000000..4074c7fe4b +--- /dev/null ++++ b/libavcodec/v4l2_request_vp9.c +@@ -0,0 +1,352 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hwconfig.h" ++#include "v4l2_request.h" ++#include "vp9dec.h" ++ ++typedef struct V4L2RequestControlsVP9 { ++ struct v4l2_ctrl_vp9_frame_decode_params decode_params; ++} V4L2RequestControlsVP9; ++ ++static const uint8_t ff_to_v4l2_intramode[] = { ++ [VERT_PRED] = V4L2_VP9_INTRA_PRED_MODE_V, ++ [HOR_PRED] = V4L2_VP9_INTRA_PRED_MODE_H, ++ [DC_PRED] = V4L2_VP9_INTRA_PRED_MODE_DC, ++ [DIAG_DOWN_LEFT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D45, ++ [DIAG_DOWN_RIGHT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D135, ++ [VERT_RIGHT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D117, ++ [HOR_DOWN_PRED] = V4L2_VP9_INTRA_PRED_MODE_D153, ++ [VERT_LEFT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D63, ++ [HOR_UP_PRED] = V4L2_VP9_INTRA_PRED_MODE_D207, ++ [TM_VP8_PRED] = V4L2_VP9_INTRA_PRED_MODE_TM, ++}; ++ ++static int v4l2_request_vp9_set_frame_ctx(AVCodecContext *avctx, unsigned int id) ++{ ++ VP9Context *s = avctx->priv_data; ++ struct v4l2_ctrl_vp9_frame_ctx fctx = {}; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(id), ++ .ptr = &fctx, ++ .size = sizeof(fctx), ++ }, ++ }; ++ ++ memcpy(fctx.probs.tx8, s->prob_ctx[id].p.tx8p, sizeof(s->prob_ctx[id].p.tx8p)); ++ memcpy(fctx.probs.tx16, s->prob_ctx[id].p.tx16p, sizeof(s->prob_ctx[id].p.tx16p)); ++ memcpy(fctx.probs.tx32, s->prob_ctx[id].p.tx32p, sizeof(s->prob_ctx[id].p.tx32p)); ++ memcpy(fctx.probs.coef, s->prob_ctx[id].coef, sizeof(s->prob_ctx[id].coef)); ++ memcpy(fctx.probs.skip, s->prob_ctx[id].p.skip, sizeof(s->prob_ctx[id].p.skip)); ++ memcpy(fctx.probs.inter_mode, s->prob_ctx[id].p.mv_mode, sizeof(s->prob_ctx[id].p.mv_mode)); ++ memcpy(fctx.probs.interp_filter, s->prob_ctx[id].p.filter, sizeof(s->prob_ctx[id].p.filter)); ++ memcpy(fctx.probs.is_inter, s->prob_ctx[id].p.intra, sizeof(s->prob_ctx[id].p.intra)); ++ memcpy(fctx.probs.comp_mode, s->prob_ctx[id].p.comp, sizeof(s->prob_ctx[id].p.comp)); ++ memcpy(fctx.probs.single_ref, s->prob_ctx[id].p.single_ref, sizeof(s->prob_ctx[id].p.single_ref)); ++ memcpy(fctx.probs.comp_ref, s->prob_ctx[id].p.comp_ref, sizeof(s->prob_ctx[id].p.comp_ref)); ++ memcpy(fctx.probs.y_mode, s->prob_ctx[id].p.y_mode, sizeof(s->prob_ctx[id].p.y_mode)); ++ for 
(unsigned i = 0; i < 10; i++) ++ memcpy(fctx.probs.uv_mode[ff_to_v4l2_intramode[i]], s->prob_ctx[id].p.uv_mode[i], sizeof(s->prob_ctx[id].p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(fctx.probs.partition[i * 4], s->prob_ctx[id].p.partition[3 - i], sizeof(s->prob_ctx[id].p.partition[0])); ++ memcpy(fctx.probs.mv.joint, s->prob_ctx[id].p.mv_joint, sizeof(s->prob_ctx[id].p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ fctx.probs.mv.sign[i] = s->prob_ctx[id].p.mv_comp[i].sign; ++ memcpy(fctx.probs.mv.class[i], s->prob_ctx[id].p.mv_comp[i].classes, sizeof(s->prob_ctx[id].p.mv_comp[0].classes)); ++ fctx.probs.mv.class0_bit[i] = s->prob_ctx[id].p.mv_comp[i].class0; ++ memcpy(fctx.probs.mv.bits[i], s->prob_ctx[id].p.mv_comp[i].bits, sizeof(s->prob_ctx[id].p.mv_comp[0].bits)); ++ memcpy(fctx.probs.mv.class0_fr[i], s->prob_ctx[id].p.mv_comp[i].class0_fp, sizeof(s->prob_ctx[id].p.mv_comp[0].class0_fp)); ++ memcpy(fctx.probs.mv.fr[i], s->prob_ctx[id].p.mv_comp[i].fp, sizeof(s->prob_ctx[id].p.mv_comp[0].fp)); ++ fctx.probs.mv.class0_hp[i] = s->prob_ctx[id].p.mv_comp[i].class0_hp; ++ fctx.probs.mv.hp[i] = s->prob_ctx[id].p.mv_comp[i].hp; ++ } ++ ++ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_vp9_get_frame_ctx(AVCodecContext *avctx, unsigned int id) ++{ ++ VP9Context *s = avctx->priv_data; ++ struct v4l2_ctrl_vp9_frame_ctx fctx = {}; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(id), ++ .ptr = &fctx, ++ .size = sizeof(fctx), ++ }, ++ }; ++ ++ int ret = ff_v4l2_request_get_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ memcpy(s->prob_ctx[id].p.tx8p, fctx.probs.tx8, sizeof(s->prob_ctx[id].p.tx8p)); ++ memcpy(s->prob_ctx[id].p.tx16p, fctx.probs.tx16, sizeof(s->prob_ctx[id].p.tx16p)); ++ memcpy(s->prob_ctx[id].p.tx32p, fctx.probs.tx32, sizeof(s->prob_ctx[id].p.tx32p)); ++ memcpy(s->prob_ctx[id].coef, fctx.probs.coef, sizeof(s->prob_ctx[id].coef)); ++ memcpy(s->prob_ctx[id].p.skip, fctx.probs.skip, sizeof(s->prob_ctx[id].p.skip)); ++ memcpy(s->prob_ctx[id].p.mv_mode, fctx.probs.inter_mode, sizeof(s->prob_ctx[id].p.mv_mode)); ++ memcpy(s->prob_ctx[id].p.filter, fctx.probs.interp_filter, sizeof(s->prob_ctx[id].p.filter)); ++ memcpy(s->prob_ctx[id].p.intra, fctx.probs.is_inter, sizeof(s->prob_ctx[id].p.intra)); ++ memcpy(s->prob_ctx[id].p.comp, fctx.probs.comp_mode, sizeof(s->prob_ctx[id].p.comp)); ++ memcpy(s->prob_ctx[id].p.single_ref, fctx.probs.single_ref, sizeof(s->prob_ctx[id].p.single_ref)); ++ memcpy(s->prob_ctx[id].p.comp_ref, fctx.probs.comp_ref, sizeof(s->prob_ctx[id].p.comp_ref)); ++ memcpy(s->prob_ctx[id].p.y_mode, fctx.probs.y_mode, sizeof(s->prob_ctx[id].p.y_mode)); ++ for (unsigned i = 0; i < 10; i++) ++ memcpy(s->prob_ctx[id].p.uv_mode[i], fctx.probs.uv_mode[ff_to_v4l2_intramode[i]], sizeof(s->prob_ctx[id].p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(s->prob_ctx[id].p.partition[3 - i], fctx.probs.partition[i * 4], sizeof(s->prob_ctx[id].p.partition[0])); ++ memcpy(s->prob_ctx[id].p.mv_joint, fctx.probs.mv.joint, sizeof(s->prob_ctx[id].p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ s->prob_ctx[id].p.mv_comp[i].sign = fctx.probs.mv.sign[i]; ++ memcpy(s->prob_ctx[id].p.mv_comp[i].classes, fctx.probs.mv.class[i], sizeof(s->prob_ctx[id].p.mv_comp[0].classes)); ++ s->prob_ctx[id].p.mv_comp[i].class0 = fctx.probs.mv.class0_bit[i]; ++ memcpy(s->prob_ctx[id].p.mv_comp[i].bits, fctx.probs.mv.bits[i], 
sizeof(s->prob_ctx[id].p.mv_comp[0].bits)); ++ memcpy(s->prob_ctx[id].p.mv_comp[i].class0_fp, fctx.probs.mv.class0_fr[i], sizeof(s->prob_ctx[id].p.mv_comp[0].class0_fp)); ++ memcpy(s->prob_ctx[id].p.mv_comp[i].fp, fctx.probs.mv.fr[i], sizeof(s->prob_ctx[id].p.mv_comp[0].fp)); ++ s->prob_ctx[id].p.mv_comp[i].class0_hp = fctx.probs.mv.class0_hp[i]; ++ s->prob_ctx[id].p.mv_comp[i].hp = fctx.probs.mv.hp[i]; ++ } ++ ++ return 0; ++} ++ ++static int v4l2_request_vp9_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ V4L2RequestControlsVP9 *controls = f->hwaccel_picture_private; ++ struct v4l2_ctrl_vp9_frame_decode_params *dec_params = &controls->decode_params; ++ int ret; ++ ++ if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) { ++ for (unsigned i = 0; i < 4; i++) { ++ ret = v4l2_request_vp9_set_frame_ctx(avctx, i); ++ if (ret) ++ return ret; ++ } ++ } else if (s->s.h.intraonly && s->s.h.resetctx == 2) { ++ ret = v4l2_request_vp9_set_frame_ctx(avctx, s->s.h.framectxid); ++ if (ret) ++ return ret; ++ } ++ ++ if (s->s.h.keyframe) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_KEY_FRAME; ++ if (!s->s.h.invisible) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_SHOW_FRAME; ++ if (s->s.h.errorres) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT; ++ if (s->s.h.intraonly) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_INTRA_ONLY; ++ if (!s->s.h.keyframe && s->s.h.highprecisionmvs) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV; ++ if (s->s.h.refreshctx) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX; ++ if (s->s.h.parallelmode) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE; ++ if (s->ss_h) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING; ++ if (s->ss_v) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING; ++ if (avctx->color_range == AVCOL_RANGE_JPEG) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING; ++ ++ dec_params->compressed_header_size = s->s.h.compressed_header_size; ++ dec_params->uncompressed_header_size = s->s.h.uncompressed_header_size; ++ dec_params->profile = s->s.h.profile; ++ dec_params->reset_frame_context = s->s.h.resetctx > 0 ? 
s->s.h.resetctx - 1 : 0; ++ dec_params->frame_context_idx = s->s.h.framectxid; ++ dec_params->bit_depth = s->s.h.bpp; ++ ++ dec_params->interpolation_filter = s->s.h.filtermode ^ (s->s.h.filtermode <= 1); ++ dec_params->tile_cols_log2 = s->s.h.tiling.log2_tile_cols; ++ dec_params->tile_rows_log2 = s->s.h.tiling.log2_tile_rows; ++ dec_params->tx_mode = s->s.h.txfmmode; ++ dec_params->reference_mode = s->s.h.comppredmode; ++ dec_params->frame_width_minus_1 = s->w - 1; ++ dec_params->frame_height_minus_1 = s->h - 1; ++ //dec_params->render_width_minus_1 = avctx->width - 1; ++ //dec_params->render_height_minus_1 = avctx->height - 1; ++ ++ for (unsigned i = 0; i < 3; i++) { ++ const ThreadFrame *ref = &s->s.refs[s->s.h.refidx[i]]; ++ if (ref->f && ref->f->buf[0]) ++ dec_params->refs[i] = ff_v4l2_request_get_capture_timestamp(ref->f); ++ } ++ ++ if (s->s.h.lf_delta.enabled) ++ dec_params->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED; ++ if (s->s.h.lf_delta.updated) ++ dec_params->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE; ++ ++ dec_params->lf.level = s->s.h.filter.level; ++ dec_params->lf.sharpness = s->s.h.filter.sharpness; ++ for (unsigned i = 0; i < 4; i++) ++ dec_params->lf.ref_deltas[i] = s->s.h.lf_delta.ref[i]; ++ for (unsigned i = 0; i < 2; i++) ++ dec_params->lf.mode_deltas[i] = s->s.h.lf_delta.mode[i]; ++ for (unsigned i = 0; i < 8; i++) { ++ for (unsigned j = 0; j < 4; j++) ++ memcpy(dec_params->lf.level_lookup[i][j], s->s.h.segmentation.feat[i].lflvl[j], sizeof(dec_params->lf.level_lookup[0][0])); ++ } ++ ++ dec_params->quant.base_q_idx = s->s.h.yac_qi; ++ dec_params->quant.delta_q_y_dc = s->s.h.ydc_qdelta; ++ dec_params->quant.delta_q_uv_dc = s->s.h.uvdc_qdelta; ++ dec_params->quant.delta_q_uv_ac = s->s.h.uvac_qdelta; ++ ++ if (s->s.h.segmentation.enabled) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_ENABLED; ++ if (s->s.h.segmentation.update_map) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP; ++ if (s->s.h.segmentation.temporal) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE; ++ if (s->s.h.segmentation.update_data) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA; ++ if (s->s.h.segmentation.absolute_vals) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE; ++ ++ for (unsigned i = 0; i < 7; i++) ++ dec_params->seg.tree_probs[i] = s->s.h.segmentation.prob[i]; ++ ++ if (s->s.h.segmentation.temporal) { ++ for (unsigned i = 0; i < 3; i++) ++ dec_params->seg.pred_probs[i] = s->s.h.segmentation.pred_prob[i]; ++ } else { ++ memset(dec_params->seg.pred_probs, 255, sizeof(dec_params->seg.pred_probs)); ++ } ++ ++ for (unsigned i = 0; i < 8; i++) { ++ if (s->s.h.segmentation.feat[i].q_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_QP_DELTA; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_QP_DELTA] = s->s.h.segmentation.feat[i].q_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].lf_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_LF; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_LF] = s->s.h.segmentation.feat[i].lf_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].ref_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_REF_FRAME; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_REF_FRAME] = s->s.h.segmentation.feat[i].ref_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].skip_enabled) ++ dec_params->seg.feature_enabled[i] |= 1 << 
V4L2_VP9_SEGMENT_FEATURE_SKIP; ++ } ++ ++ memcpy(dec_params->probs.tx8, s->prob.p.tx8p, sizeof(s->prob.p.tx8p)); ++ memcpy(dec_params->probs.tx16, s->prob.p.tx16p, sizeof(s->prob.p.tx16p)); ++ memcpy(dec_params->probs.tx32, s->prob.p.tx32p, sizeof(s->prob.p.tx32p)); ++ for (unsigned i = 0; i < 4; i++) { ++ for (unsigned j = 0; j < 2; j++) { ++ for (unsigned k = 0; k < 2; k++) { ++ for (unsigned l = 0; l < 6; l++) { ++ for (unsigned m = 0; m < 6; m++) { ++ memcpy(dec_params->probs.coef[i][j][k][l][m], s->prob.coef[i][j][k][l][m], sizeof(dec_params->probs.coef[0][0][0][0][0])); ++ } ++ } ++ } ++ } ++ } ++ memcpy(dec_params->probs.skip, s->prob.p.skip, sizeof(s->prob.p.skip)); ++ memcpy(dec_params->probs.inter_mode, s->prob.p.mv_mode, sizeof(s->prob.p.mv_mode)); ++ memcpy(dec_params->probs.interp_filter, s->prob.p.filter, sizeof(s->prob.p.filter)); ++ memcpy(dec_params->probs.is_inter, s->prob.p.intra, sizeof(s->prob.p.intra)); ++ memcpy(dec_params->probs.comp_mode, s->prob.p.comp, sizeof(s->prob.p.comp)); ++ memcpy(dec_params->probs.single_ref, s->prob.p.single_ref, sizeof(s->prob.p.single_ref)); ++ memcpy(dec_params->probs.comp_ref, s->prob.p.comp_ref, sizeof(s->prob.p.comp_ref)); ++ memcpy(dec_params->probs.y_mode, s->prob.p.y_mode, sizeof(s->prob.p.y_mode)); ++ for (unsigned i = 0; i < 10; i++) ++ memcpy(dec_params->probs.uv_mode[ff_to_v4l2_intramode[i]], s->prob.p.uv_mode[i], sizeof(s->prob.p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(dec_params->probs.partition[i * 4], s->prob.p.partition[3 - i], sizeof(s->prob.p.partition[0])); ++ memcpy(dec_params->probs.mv.joint, s->prob.p.mv_joint, sizeof(s->prob.p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ dec_params->probs.mv.sign[i] = s->prob.p.mv_comp[i].sign; ++ memcpy(dec_params->probs.mv.class[i], s->prob.p.mv_comp[i].classes, sizeof(s->prob.p.mv_comp[0].classes)); ++ dec_params->probs.mv.class0_bit[i] = s->prob.p.mv_comp[i].class0; ++ memcpy(dec_params->probs.mv.bits[i], s->prob.p.mv_comp[i].bits, sizeof(s->prob.p.mv_comp[0].bits)); ++ memcpy(dec_params->probs.mv.class0_fr[i], s->prob.p.mv_comp[i].class0_fp, sizeof(s->prob.p.mv_comp[0].class0_fp)); ++ memcpy(dec_params->probs.mv.fr[i], s->prob.p.mv_comp[i].fp, sizeof(s->prob.p.mv_comp[0].fp)); ++ dec_params->probs.mv.class0_hp[i] = s->prob.p.mv_comp[i].class0_hp; ++ dec_params->probs.mv.hp[i] = s->prob.p.mv_comp[i].hp; ++ } ++ ++ return ff_v4l2_request_reset_frame(avctx, f->tf.f); ++} ++ ++static int v4l2_request_vp9_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, f->tf.f, buffer, size); ++} ++ ++static int v4l2_request_vp9_end_frame(AVCodecContext *avctx) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ V4L2RequestControlsVP9 *controls = f->hwaccel_picture_private; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_DECODE_PARAMS, ++ .ptr = &controls->decode_params, ++ .size = sizeof(controls->decode_params), ++ }, ++ }; ++ ++ ret = ff_v4l2_request_decode_frame(avctx, f->tf.f, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ if (!s->s.h.refreshctx) ++ return 0; ++ ++ return v4l2_request_vp9_get_frame_ctx(avctx, s->s.h.framectxid); ++} ++ ++static int v4l2_request_vp9_init(AVCodecContext *avctx) ++{ ++ // TODO: check V4L2_CID_MPEG_VIDEO_VP9_PROFILE ++ return 
ff_v4l2_request_init(avctx, V4L2_PIX_FMT_VP9_FRAME, 3 * 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_vp9_v4l2request_hwaccel = { ++ .name = "vp9_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_VP9, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_vp9_start_frame, ++ .decode_slice = v4l2_request_vp9_decode_slice, ++ .end_frame = v4l2_request_vp9_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsVP9), ++ .init = v4l2_request_vp9_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c +index fd0bab14a2..434f905c62 100644 +--- a/libavcodec/vp9.c ++++ b/libavcodec/vp9.c +@@ -191,6 +191,7 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + \ + CONFIG_VP9_D3D11VA_HWACCEL * 2 + \ + CONFIG_VP9_NVDEC_HWACCEL + \ ++ CONFIG_VP9_V4L2REQUEST_HWACCEL + \ + CONFIG_VP9_VAAPI_HWACCEL + \ + CONFIG_VP9_VDPAU_HWACCEL) + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts; +@@ -223,6 +224,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #endif + #if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ *fmtp++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV420P12: +@@ -231,6 +235,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #endif + #if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ *fmtp++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + } +@@ -700,7 +707,8 @@ static int decode_frame_header(AVCodecContext *avctx, + get_bits(&s->gb, 8) : 255; + } + +- if (get_bits1(&s->gb)) { ++ s->s.h.segmentation.update_data = get_bits1(&s->gb); ++ if (s->s.h.segmentation.update_data) { + s->s.h.segmentation.absolute_vals = get_bits1(&s->gb); + for (i = 0; i < 8; i++) { + if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb))) +@@ -1909,6 +1917,9 @@ AVCodec ff_vp9_decoder = { + #endif + #if CONFIG_VP9_VDPAU_HWACCEL + HWACCEL_VDPAU(vp9), ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(vp9), + #endif + NULL + }, +diff --git a/libavcodec/vp9shared.h b/libavcodec/vp9shared.h +index 54726df742..fee3568736 100644 +--- a/libavcodec/vp9shared.h ++++ b/libavcodec/vp9shared.h +@@ -131,6 +131,7 @@ typedef struct VP9BitstreamHeader { + uint8_t temporal; + uint8_t absolute_vals; + uint8_t update_map; ++ uint8_t update_data; + uint8_t prob[7]; + uint8_t pred_prob[3]; + struct { + +From 3e956323f01b221d7a38ad0a3293d337cd106f3f Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 12 Dec 2019 16:13:55 +0100 +Subject: [PATCH 18/18] WIP: Add and use vp9 private linux header + +Signed-off-by: Boris Brezillon +--- + configure | 2 +- + libavcodec/v4l2_request_vp9.c | 1 + + libavcodec/vp9-ctrls.h | 485 ++++++++++++++++++++++++++++++++++ + 3 files changed, 487 insertions(+), 1 deletion(-) + create mode 100644 libavcodec/vp9-ctrls.h + +diff --git a/configure b/configure +index 2b723df55a..87c6836af2 100755 +--- a/configure ++++ b/configure +@@ -3015,7 +3015,7 @@ vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9" + vp9_dxva2_hwaccel_select="vp9_decoder" + vp9_nvdec_hwaccel_deps="nvdec" + vp9_nvdec_hwaccel_select="vp9_decoder" +-vp9_v4l2request_hwaccel_deps="v4l2_request vp9_v4l2_request" ++vp9_v4l2request_hwaccel_deps="v4l2_request" + 
vp9_v4l2request_hwaccel_select="vp9_decoder"
+ vp9_vaapi_hwaccel_deps="vaapi VADecPictureParameterBufferVP9_bit_depth"
+ vp9_vaapi_hwaccel_select="vp9_decoder"
+diff --git a/libavcodec/v4l2_request_vp9.c b/libavcodec/v4l2_request_vp9.c
+index 4074c7fe4b..2e10b7ad1a 100644
+--- a/libavcodec/v4l2_request_vp9.c
++++ b/libavcodec/v4l2_request_vp9.c
+@@ -19,6 +19,7 @@
+ #include "hwconfig.h"
+ #include "v4l2_request.h"
+ #include "vp9dec.h"
++#include "vp9-ctrls.h"
+
+ typedef struct V4L2RequestControlsVP9 {
+ struct v4l2_ctrl_vp9_frame_decode_params decode_params;
+diff --git a/libavcodec/vp9-ctrls.h b/libavcodec/vp9-ctrls.h
+new file mode 100644
+index 0000000000..0cdea8a18b
+--- /dev/null
++++ b/libavcodec/vp9-ctrls.h
+@@ -0,0 +1,485 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the VP9 state controls for use with stateless VP9
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _VP9_CTRLS_H_
++#define _VP9_CTRLS_H_
++
++#include <linux/types.h>
++
++#define V4L2_PIX_FMT_VP9_FRAME v4l2_fourcc('V', 'P', '9', 'F')
++
++#define V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(i) (V4L2_CID_MPEG_BASE + 4000 + (i))
++#define V4L2_CID_MPEG_VIDEO_VP9_FRAME_DECODE_PARAMS (V4L2_CID_MPEG_BASE + 4004)
++#define V4L2_CTRL_TYPE_VP9_FRAME_CONTEXT 0x400
++#define V4L2_CTRL_TYPE_VP9_FRAME_DECODE_PARAMS 0x404
++
++/**
++ * enum v4l2_vp9_loop_filter_flags - VP9 loop filter flags
++ *
++ * @V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED: the filter level depends on
++ * the mode and reference frame used
++ * to predict a block
++ * @V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE: the bitstream contains additional
++ * syntax elements that specify which
++ * mode and reference frame deltas
++ * are to be updated
++ *
++ * Those are the flags you should pass to &v4l2_vp9_loop_filter.flags. See
++ * section '7.2.8 Loop filter semantics' of the VP9 specification for more
++ * details.
++ */
++enum v4l2_vp9_loop_filter_flags {
++ V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED = 1 << 0,
++ V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE = 1 << 1,
++};
++
++/**
++ * struct v4l2_vp9_loop_filter - VP9 loop filter parameters
++ *
++ * @flags: combination of V4L2_VP9_LOOP_FILTER_FLAG_* flags
++ * @level: indicates the loop filter strength
++ * @sharpness: indicates the sharpness level
++ * @ref_deltas: contains the adjustment needed for the filter level based on
++ * the chosen reference frame
++ * @mode_deltas: contains the adjustment needed for the filter level based on
++ * the chosen mode
++ * @level_lookup: level lookup table
++ *
++ * This structure contains all loop filter related parameters. See sections
++ * '7.2.8 Loop filter semantics' and '8.8.1 Loop filter frame init process'
++ * of the VP9 specification for more details.
++ */
++struct v4l2_vp9_loop_filter {
++ __u8 flags;
++ __u8 level;
++ __u8 sharpness;
++ __s8 ref_deltas[4];
++ __s8 mode_deltas[2];
++ __u8 level_lookup[8][4][2];
++};
++
++/**
++ * struct v4l2_vp9_quantization - VP9 quantization parameters
++ *
++ * @base_q_idx: indicates the base frame qindex
++ * @delta_q_y_dc: indicates the Y DC quantizer relative to base_q_idx
++ * @delta_q_uv_dc: indicates the UV DC quantizer relative to base_q_idx
++ * @delta_q_uv_ac: indicates the UV AC quantizer relative to base_q_idx
++ * @padding: padding bytes to align things on 64 bits. Must be set to 0
++ *
++ * Encodes the quantization parameters.
See section '7.2.9 Quantization params
++ * syntax' of the VP9 specification for more details.
++ */
++struct v4l2_vp9_quantization {
++ __u8 base_q_idx;
++ __s8 delta_q_y_dc;
++ __s8 delta_q_uv_dc;
++ __s8 delta_q_uv_ac;
++ __u8 padding[4];
++};
++
++/**
++ * enum v4l2_vp9_segmentation_flags - VP9 segmentation flags
++ *
++ * @V4L2_VP9_SEGMENTATION_FLAG_ENABLED: indicates that this frame makes use of
++ * the segmentation tool
++ * @V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP: indicates that the segmentation map
++ * should be updated during the
++ * decoding of this frame
++ * @V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE: indicates that the updates to
++ * the segmentation map are coded
++ * relative to the existing
++ * segmentation map
++ * @V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA: indicates that new parameters are
++ * about to be specified for each
++ * segment
++ * @V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE: indicates that the
++ * segmentation parameters
++ * represent the actual values
++ * to be used
++ *
++ * Those are the flags you should pass to &v4l2_vp9_segmentation.flags. See
++ * section '7.2.10 Segmentation params syntax' of the VP9 specification for
++ * more details.
++ */
++enum v4l2_vp9_segmentation_flags {
++ V4L2_VP9_SEGMENTATION_FLAG_ENABLED = 1 << 0,
++ V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP = 1 << 1,
++ V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE = 1 << 2,
++ V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA = 1 << 3,
++ V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE = 1 << 4,
++};
++
++#define V4L2_VP9_SEGMENT_FEATURE_ENABLED(id) (1 << (id))
++#define V4L2_VP9_SEGMENT_FEATURE_ENABLED_MASK 0xf
++
++/**
++ * enum v4l2_vp9_segment_feature - VP9 segment feature IDs
++ *
++ * @V4L2_VP9_SEGMENT_FEATURE_QP_DELTA: QP delta segment feature
++ * @V4L2_VP9_SEGMENT_FEATURE_LF: loop filter segment feature
++ * @V4L2_VP9_SEGMENT_FEATURE_REF_FRAME: reference frame segment feature
++ * @V4L2_VP9_SEGMENT_FEATURE_SKIP: skip segment feature
++ * @V4L2_VP9_SEGMENT_FEATURE_CNT: number of segment features
++ *
++ * Segment feature IDs. See section '7.2.10 Segmentation params syntax' of the
++ * VP9 specification for more details.
++ */
++enum v4l2_vp9_segment_feature {
++ V4L2_VP9_SEGMENT_FEATURE_QP_DELTA,
++ V4L2_VP9_SEGMENT_FEATURE_LF,
++ V4L2_VP9_SEGMENT_FEATURE_REF_FRAME,
++ V4L2_VP9_SEGMENT_FEATURE_SKIP,
++ V4L2_VP9_SEGMENT_FEATURE_CNT,
++};
++
++/**
++ * struct v4l2_vp9_segmentation - VP9 segmentation parameters
++ *
++ * @flags: combination of V4L2_VP9_SEGMENTATION_FLAG_* flags
++ * @tree_probs: specifies the probability values to be used when
++ * decoding a Segment-ID. See '5.15. Segmentation map'
++ * section of the VP9 specification for more details.
++ * @pred_probs: specifies the probability values to be used when decoding a
++ * Predicted-Segment-ID. See '6.4.14. Get segment id syntax'
++ * section of :ref:`vp9` for more details.
++ * @padding: padding used to make things aligned on 64 bits. Shall be zero
++ * filled
++ * @feature_enabled: bitmask defining which features are enabled in each
++ * segment
++ * @feature_data: data attached to each feature. Data entry is only valid if
++ * the feature is enabled
++ *
++ * Encodes the segmentation parameters. See section '7.2.10 Segmentation
++ * params syntax' of the VP9 specification for more details.
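++ *
++ * For illustration (mirroring how v4l2_request_vp9.c fills this struct
++ * later in this series), enabling a QP delta of -30 on segment 2, with
++ * both values chosen arbitrarily, would read:
++ *
++ *   seg.feature_enabled[2] |= V4L2_VP9_SEGMENT_FEATURE_ENABLED(V4L2_VP9_SEGMENT_FEATURE_QP_DELTA);
++ *   seg.feature_data[2][V4L2_VP9_SEGMENT_FEATURE_QP_DELTA] = -30;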
++ */
++struct v4l2_vp9_segmentation {
++ __u8 flags;
++ __u8 tree_probs[7];
++ __u8 pred_probs[3];
++ __u8 padding[5];
++ __u8 feature_enabled[8];
++ __s16 feature_data[8][4];
++};
++
++/**
++ * enum v4l2_vp9_intra_prediction_mode - VP9 Intra prediction modes
++ *
++ * @V4L2_VP9_INTRA_PRED_MODE_DC: DC intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_V: vertical intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_H: horizontal intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_D45: D45 intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_D135: D135 intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_D117: D117 intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_D153: D153 intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_D207: D207 intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_D63: D63 intra prediction
++ * @V4L2_VP9_INTRA_PRED_MODE_TM: True Motion intra prediction
++ *
++ * See section '7.4.5 Intra frame mode info semantics' for more details.
++ */
++enum v4l2_vp9_intra_prediction_mode {
++ V4L2_VP9_INTRA_PRED_MODE_DC,
++ V4L2_VP9_INTRA_PRED_MODE_V,
++ V4L2_VP9_INTRA_PRED_MODE_H,
++ V4L2_VP9_INTRA_PRED_MODE_D45,
++ V4L2_VP9_INTRA_PRED_MODE_D135,
++ V4L2_VP9_INTRA_PRED_MODE_D117,
++ V4L2_VP9_INTRA_PRED_MODE_D153,
++ V4L2_VP9_INTRA_PRED_MODE_D207,
++ V4L2_VP9_INTRA_PRED_MODE_D63,
++ V4L2_VP9_INTRA_PRED_MODE_TM,
++};
++
++/**
++ * struct v4l2_vp9_mv_probabilities - VP9 Motion vector probabilities
++ * @joint: motion vector joint probabilities
++ * @sign: motion vector sign probabilities
++ * @class: motion vector class probabilities
++ * @class0_bit: motion vector class0 bit probabilities
++ * @bits: motion vector bits probabilities
++ * @class0_fr: motion vector class0 fractional bit probabilities
++ * @fr: motion vector fractional bit probabilities
++ * @class0_hp: motion vector class0 high precision fractional bit probabilities
++ * @hp: motion vector high precision fractional bit probabilities
++ */
++struct v4l2_vp9_mv_probabilities {
++ __u8 joint[3];
++ __u8 sign[2];
++ __u8 class[2][10];
++ __u8 class0_bit[2];
++ __u8 bits[2][10];
++ __u8 class0_fr[2][2][3];
++ __u8 fr[2][3];
++ __u8 class0_hp[2];
++ __u8 hp[2];
++};
++
++/**
++ * struct v4l2_vp9_probabilities - VP9 Probabilities
++ *
++ * @tx8: TX 8x8 probabilities
++ * @tx16: TX 16x16 probabilities
++ * @tx32: TX 32x32 probabilities
++ * @coef: coefficient probabilities
++ * @skip: skip probabilities
++ * @inter_mode: inter mode probabilities
++ * @interp_filter: interpolation filter probabilities
++ * @is_inter: is inter-block probabilities
++ * @comp_mode: compound prediction mode probabilities
++ * @single_ref: single ref probabilities
++ * @comp_ref: compound ref probabilities
++ * @y_mode: Y prediction mode probabilities
++ * @uv_mode: UV prediction mode probabilities
++ * @partition: partition probabilities
++ * @mv: motion vector probabilities
++ *
++ * Structure containing most VP9 probabilities. See the VP9 specification
++ * for more details.
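++ *
++ * Note that these layouts are not always one-to-one with FFmpeg's internal
++ * tables: v4l2_request_vp9.c in this series remaps @uv_mode through its
++ * ff_to_v4l2_intramode[] table and stores FFmpeg's partition context
++ * 3 - i in rows i * 4 .. i * 4 + 3 of @partition when copying
++ * probabilities in either direction.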
++ */
++struct v4l2_vp9_probabilities {
++ __u8 tx8[2][1];
++ __u8 tx16[2][2];
++ __u8 tx32[2][3];
++ __u8 coef[4][2][2][6][6][3];
++ __u8 skip[3];
++ __u8 inter_mode[7][3];
++ __u8 interp_filter[4][2];
++ __u8 is_inter[4];
++ __u8 comp_mode[5];
++ __u8 single_ref[5][2];
++ __u8 comp_ref[5];
++ __u8 y_mode[4][9];
++ __u8 uv_mode[10][9];
++ __u8 partition[16][3];
++
++ struct v4l2_vp9_mv_probabilities mv;
++};
++
++/**
++ * enum v4l2_vp9_reset_frame_context - Valid values for
++ * &v4l2_ctrl_vp9_frame_decode_params->reset_frame_context
++ *
++ * @V4L2_VP9_RESET_FRAME_CTX_NONE: don't reset any frame context
++ * @V4L2_VP9_RESET_FRAME_CTX_SPEC: reset the frame context pointed by
++ * &v4l2_ctrl_vp9_frame_decode_params.frame_context_idx
++ * @V4L2_VP9_RESET_FRAME_CTX_ALL: reset all frame contexts
++ *
++ * See section '7.2 Uncompressed header semantics' of the VP9 specification
++ * for more details.
++ */
++enum v4l2_vp9_reset_frame_context {
++ V4L2_VP9_RESET_FRAME_CTX_NONE,
++ V4L2_VP9_RESET_FRAME_CTX_SPEC,
++ V4L2_VP9_RESET_FRAME_CTX_ALL,
++};
++
++/**
++ * enum v4l2_vp9_interpolation_filter - VP9 interpolation filter types
++ *
++ * @V4L2_VP9_INTERP_FILTER_8TAP: eight-tap filter
++ * @V4L2_VP9_INTERP_FILTER_8TAP_SMOOTH: eight-tap smooth filter
++ * @V4L2_VP9_INTERP_FILTER_8TAP_SHARP: eight-tap sharp filter
++ * @V4L2_VP9_INTERP_FILTER_BILINEAR: bilinear filter
++ * @V4L2_VP9_INTERP_FILTER_SWITCHABLE: filter selection is signaled at the
++ * block level
++ *
++ * See section '7.2.7 Interpolation filter semantics' of the VP9 specification
++ * for more details.
++ */
++enum v4l2_vp9_interpolation_filter {
++ V4L2_VP9_INTERP_FILTER_8TAP,
++ V4L2_VP9_INTERP_FILTER_8TAP_SMOOTH,
++ V4L2_VP9_INTERP_FILTER_8TAP_SHARP,
++ V4L2_VP9_INTERP_FILTER_BILINEAR,
++ V4L2_VP9_INTERP_FILTER_SWITCHABLE,
++};
++
++/**
++ * enum v4l2_vp9_reference_mode - VP9 reference modes
++ *
++ * @V4L2_VP9_REF_MODE_SINGLE: indicates that all the inter blocks use only a
++ * single reference frame to generate motion
++ * compensated prediction
++ * @V4L2_VP9_REF_MODE_COMPOUND: requires all the inter blocks to use compound
++ * mode. Single reference frame prediction is not
++ * allowed
++ * @V4L2_VP9_REF_MODE_SELECT: allows each individual inter block to select
++ * between single and compound prediction modes
++ *
++ * See section '7.3.6 Frame reference mode semantics' of the VP9 specification
++ * for more details.
++ */
++enum v4l2_vp9_reference_mode {
++ V4L2_VP9_REF_MODE_SINGLE,
++ V4L2_VP9_REF_MODE_COMPOUND,
++ V4L2_VP9_REF_MODE_SELECT,
++};
++
++/**
++ * enum v4l2_vp9_tx_mode - VP9 TX modes
++ *
++ * @V4L2_VP9_TX_MODE_ONLY_4X4: transform size is 4x4
++ * @V4L2_VP9_TX_MODE_ALLOW_8X8: transform size can be up to 8x8
++ * @V4L2_VP9_TX_MODE_ALLOW_16X16: transform size can be up to 16x16
++ * @V4L2_VP9_TX_MODE_ALLOW_32X32: transform size can be up to 32x32
++ * @V4L2_VP9_TX_MODE_SELECT: bitstream contains transform size for each block
++ *
++ * See section '7.3.1 Tx mode semantics' of the VP9 specification for more
++ * details.
++ */
++enum v4l2_vp9_tx_mode {
++ V4L2_VP9_TX_MODE_ONLY_4X4,
++ V4L2_VP9_TX_MODE_ALLOW_8X8,
++ V4L2_VP9_TX_MODE_ALLOW_16X16,
++ V4L2_VP9_TX_MODE_ALLOW_32X32,
++ V4L2_VP9_TX_MODE_SELECT,
++};
++
++/**
++ * enum v4l2_vp9_ref_id - VP9 Reference frame IDs
++ *
++ * @V4L2_REF_ID_LAST: last reference frame
++ * @V4L2_REF_ID_GOLDEN: golden reference frame
++ * @V4L2_REF_ID_ALTREF: alternative reference frame
++ * @V4L2_REF_ID_CNT: number of reference frames
++ *
++ * See section '7.4.12 Ref frames semantics' of the VP9 specification for more
++ * details.
++ */
++enum v4l2_vp9_ref_id {
++ V4L2_REF_ID_LAST,
++ V4L2_REF_ID_GOLDEN,
++ V4L2_REF_ID_ALTREF,
++ V4L2_REF_ID_CNT,
++};
++
++/**
++ * enum v4l2_vp9_frame_flags - VP9 frame flags
++ * @V4L2_VP9_FRAME_FLAG_KEY_FRAME: the frame is a key frame
++ * @V4L2_VP9_FRAME_FLAG_SHOW_FRAME: the frame should be displayed
++ * @V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT: the decoding should be error resilient
++ * @V4L2_VP9_FRAME_FLAG_INTRA_ONLY: the frame does not reference other frames
++ * @V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV: the frame might use high precision
++ * motion vectors
++ * @V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX: frame context should be updated
++ * after decoding
++ * @V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE: parallel decoding is used
++ * @V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING: horizontal subsampling is enabled
++ * @V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING: vertical subsampling is enabled
++ * @V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING: full UV range is used
++ *
++ * Check the VP9 specification for more details.
++ */
++enum v4l2_vp9_frame_flags {
++ V4L2_VP9_FRAME_FLAG_KEY_FRAME = 1 << 0,
++ V4L2_VP9_FRAME_FLAG_SHOW_FRAME = 1 << 1,
++ V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT = 1 << 2,
++ V4L2_VP9_FRAME_FLAG_INTRA_ONLY = 1 << 3,
++ V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV = 1 << 4,
++ V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX = 1 << 5,
++ V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE = 1 << 6,
++ V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING = 1 << 7,
++ V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING = 1 << 8,
++ V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING = 1 << 9,
++};
++
++#define V4L2_VP9_PROFILE_MAX 3
++
++/**
++ * struct v4l2_ctrl_vp9_frame_decode_params - VP9 frame decoding control
++ *
++ * @flags: combination of V4L2_VP9_FRAME_FLAG_* flags
++ * @compressed_header_size: compressed header size in bytes
++ * @uncompressed_header_size: uncompressed header size in bytes
++ * @profile: VP9 profile. Can be 0, 1, 2 or 3
++ * @reset_frame_context: specifies whether the frame context should be reset
++ * to default values. See &v4l2_vp9_reset_frame_context
++ * for more details
++ * @frame_context_idx: frame context that should be used/updated
++ * @bit_depth: bits per components. Can be 8, 10 or 12. Note that not all
++ * profiles support 10 and/or 12 bits depths
++ * @interpolation_filter: specifies the filter selection used for performing
++ * inter prediction. See &v4l2_vp9_interpolation_filter
++ * for more details
++ * @tile_cols_log2: specifies the base 2 logarithm of the width of each tile
++ * (where the width is measured in units of 8x8 blocks).
++ * Shall be less than or equal to 6
++ * @tile_rows_log2: specifies the base 2 logarithm of the height of each tile
++ * (where the height is measured in units of 8x8 blocks)
++ * @tx_mode: specifies the TX mode. See &v4l2_vp9_tx_mode for more details
++ * @reference_mode: specifies the type of inter prediction to be used. See
++ * &v4l2_vp9_reference_mode for more details
++ * @padding: needed to make this struct 64 bit aligned.
Shall be filled with
++ * zeros
++ * @frame_width_minus_1: add 1 to it and you'll get the frame width expressed
++ * in pixels
++ * @frame_height_minus_1: add 1 to it and you'll get the frame height expressed
++ * in pixels
++ * @render_width_minus_1: add 1 to it and you'll get the expected render width
++ * expressed in pixels. This is not used during the
++ * decoding process but might be used by HW scalers to
++ * prepare a frame that's ready for scanout
++ * @render_height_minus_1: add 1 to it and you'll get the expected render height
++ * expressed in pixels. This is not used during the
++ * decoding process but might be used by HW scalers to
++ * prepare a frame that's ready for scanout
++ * @refs: array of reference frames. See &v4l2_vp9_ref_id for more details
++ * @lf: loop filter parameters. See &v4l2_vp9_loop_filter for more details
++ * @quant: quantization parameters. See &v4l2_vp9_quantization for more details
++ * @seg: segmentation parameters. See &v4l2_vp9_segmentation for more details
++ * @probs: probabilities. See &v4l2_vp9_probabilities for more details
++ */
++struct v4l2_ctrl_vp9_frame_decode_params {
++ __u32 flags;
++ __u16 compressed_header_size;
++ __u16 uncompressed_header_size;
++ __u8 profile;
++ __u8 reset_frame_context;
++ __u8 frame_context_idx;
++ __u8 bit_depth;
++ __u8 interpolation_filter;
++ __u8 tile_cols_log2;
++ __u8 tile_rows_log2;
++ __u8 tx_mode;
++ __u8 reference_mode;
++ __u8 padding[6];
++ __u16 frame_width_minus_1;
++ __u16 frame_height_minus_1;
++ __u16 render_width_minus_1;
++ __u16 render_height_minus_1;
++ __u64 refs[V4L2_REF_ID_CNT];
++ struct v4l2_vp9_loop_filter lf;
++ struct v4l2_vp9_quantization quant;
++ struct v4l2_vp9_segmentation seg;
++ struct v4l2_vp9_probabilities probs;
++};
++
++#define V4L2_VP9_NUM_FRAME_CTX 4
++
++/**
++ * struct v4l2_ctrl_vp9_frame_ctx - VP9 frame context control
++ *
++ * @probs: VP9 probabilities
++ *
++ * This control is accessed in both directions. The user should initialize the
++ * 4 contexts with default values just after starting the stream. Then before
++ * decoding a frame it should query the current frame context (the one passed
++ * through &v4l2_ctrl_vp9_frame_decode_params.frame_context_idx) to initialize
++ * &v4l2_ctrl_vp9_frame_decode_params.probs. The probs are then adjusted based
++ * on the bitstream info and passed to the kernel. The codec should update
++ * the frame context after the frame has been decoded, so that next time
++ * userspace queries this context it contains the updated probabilities.
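++ *
++ * A minimal sketch of that round trip, using only names defined in this
++ * header (idx being the frame context index signalled in the bitstream):
++ *
++ *   1. VIDIOC_S_EXT_CTRLS on V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(0..3)
++ *      with default probabilities, once at stream start;
++ *   2. VIDIOC_G_EXT_CTRLS on V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(idx),
++ *      copy the returned probs into
++ *      &v4l2_ctrl_vp9_frame_decode_params.probs and apply the per-frame
++ *      updates parsed from the bitstream;
++ *   3. decode, then, when V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX was set,
++ *      repeat the query of step 2 to pick up the probabilities the codec
++ *      wrote back.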
++ */
++struct v4l2_ctrl_vp9_frame_ctx {
++ struct v4l2_vp9_probabilities probs;
++};
++
++#endif /* _VP9_CTRLS_H_ */

From adb3a1a1690157962760d7d8c0723d769bf8a73c Mon Sep 17 00:00:00 2001
From: Matthias Reichl
Date: Thu, 9 Jul 2020 13:24:25 +0200
Subject: [PATCH 06/10] ffmpeg: create libreelec patch

Patch created using revisions 8e12af2..1485078 from branch
4.3-libreelec-misc of https://github.com/LibreELEC/FFmpeg

Signed-off-by: Matthias Reichl
---
 .../libreelec/ffmpeg-001-libreelec.patch | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 packages/multimedia/ffmpeg/patches/libreelec/ffmpeg-001-libreelec.patch

diff --git a/packages/multimedia/ffmpeg/patches/libreelec/ffmpeg-001-libreelec.patch b/packages/multimedia/ffmpeg/patches/libreelec/ffmpeg-001-libreelec.patch
new file mode 100644
index 0000000000..9db1e95648
--- /dev/null
+++ b/packages/multimedia/ffmpeg/patches/libreelec/ffmpeg-001-libreelec.patch
@@ -0,0 +1,73 @@
+From 823b70bfa0f451a0f8cd0539e1707f7bb7ff5891 Mon Sep 17 00:00:00 2001
+From: Lukas Rusak
+Date: Wed, 10 Apr 2019 13:39:21 -0700
+Subject: [PATCH 1/2] libavcodec/libdav1d: add libdav1d_get_format method to
+ call ff_get_format
+
+This will allow applications to properly init the decoder in
+cases where a hardware decoder is tried first and a software
+decoder is tried after by calling the get_format callback.
+
+Even though there are no hardware pixel formats available
+we still need to return the software pixel format.
+
+Tested with Kodi by checking if multithreaded software
+decoding is properly activated.
+---
+ libavcodec/libdav1d.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c
+index bbb3ec1e6c..d8a7555c29 100644
+--- a/libavcodec/libdav1d.c
++++ b/libavcodec/libdav1d.c
+@@ -55,6 +55,16 @@ static const enum AVPixelFormat pix_fmt_rgb[3] = {
+ AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12,
+ };
+
++static enum AVPixelFormat libdav1d_get_format(AVCodecContext *avctx, const Dav1dPicture *p)
++{
++ enum AVPixelFormat pix_fmts[2], *fmt = pix_fmts;
++
++ *fmt++ = pix_fmt[p->p.layout][p->seq_hdr->hbd];
++ *fmt = AV_PIX_FMT_NONE;
++
++ return ff_get_format(avctx, pix_fmts);
++}
++
+ static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl)
+ {
+ AVCodecContext *c = opaque;
+@@ -259,6 +269,7 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
+ c->profile = p->seq_hdr->profile;
+ c->level = ((p->seq_hdr->operating_points[0].major_level - 2) << 2)
+ | p->seq_hdr->operating_points[0].minor_level;
++ frame->format = c->pix_fmt = libdav1d_get_format(c, p);
+ frame->width = p->p.w;
+ frame->height = p->p.h;
+ if (c->width != p->p.w || c->height != p->p.h) {
+
+From 1485078472d107806d1d3f52f89e3ff47ae8715c Mon Sep 17 00:00:00 2001
+From: chewitt
+Date: Sun, 11 Aug 2019 07:08:19 +0000
+Subject: [PATCH 2/2] add long-term yuv2rgb logging patch
+
+---
+ libswscale/yuv2rgb.c | 4 ----
+ 1 file changed, 4 deletions(-)
+
+diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
+index 588462504e..20364ff318 100644
+--- a/libswscale/yuv2rgb.c
++++ b/libswscale/yuv2rgb.c
+@@ -688,10 +688,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
+ if (t)
+ return t;
+
+- av_log(c, AV_LOG_WARNING,
+- "No accelerated colorspace conversion found from %s to %s.\n",
+- av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
+-
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_BGR48BE:
+ case AV_PIX_FMT_BGR48LE: From
efccd337ea2e51a714500b903c783a89ec733efb Mon Sep 17 00:00:00 2001 From: Matthias Reichl Date: Thu, 9 Jul 2020 13:24:26 +0200 Subject: [PATCH 07/10] ffmpeg: create rpi patch Patch created using revisions f6ae50a..3d6229e from branch test/4.3/kodi_main of https://github.com/jc-kynesim/rpi-ffmpeg Signed-off-by: Matthias Reichl --- .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch | 54599 ++++++++++++++++ 1 file changed, 54599 insertions(+) create mode 100644 packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch new file mode 100644 index 0000000000..81673ca32a --- /dev/null +++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch @@ -0,0 +1,54599 @@ +diff --git a/.gitignore b/.gitignore +index 2450ee8fc5..4bcc3ae643 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -1,6 +1,7 @@ + *.a + *.o + *.o.* ++*.bin + *.d + *.def + *.dll +@@ -26,6 +27,7 @@ + .\#* + /.config + /.version ++/build/ + /ffmpeg + /ffplay + /ffprobe +diff --git a/configure b/configure +index 8569a60bf8..96b3527650 100755 +--- a/configure ++++ b/configure +@@ -274,6 +274,7 @@ External library support: + --enable-libtls enable LibreSSL (via libtls), needed for https support + if openssl, gnutls or mbedtls is not used [no] + --enable-libtwolame enable MP2 encoding via libtwolame [no] ++ --enable-libudev enable libudev [no] + --enable-libv4l2 enable libv4l2/v4l-utils [no] + --enable-libvidstab enable video stabilization using vid.stab [no] + --enable-libvmaf enable vmaf filter via libvmaf [no] +@@ -336,12 +337,16 @@ External library support: + --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] + --enable-libnpp enable Nvidia Performance Primitives-based code [no] + --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] ++ --enable-rpi enable other rpi specific stuff [no] ++ --enable-sand enable sand video formats [rpi] ++ --enable-vout-drm enable the vout_drm module - for internal testing only [no] + --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] + --disable-nvenc disable Nvidia video encoding code [autodetect] + --enable-omx enable OpenMAX IL code [no] + --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] + --enable-rkmpp enable Rockchip Media Process Platform code [no] + --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] ++ --enable-v4l2-request enable V4L2 request API code [no] + --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] + --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] + --disable-videotoolbox disable VideoToolbox code [autodetect] +@@ -1807,6 +1812,7 @@ EXTERNAL_LIBRARY_LIST=" + libtesseract + libtheora + libtwolame ++ libudev + libv4l2 + libvorbis + libvpx +@@ -1861,7 +1867,10 @@ HWACCEL_LIBRARY_LIST=" + mmal + omx + opencl ++ v4l2_request + vulkan ++ rpi4_8 ++ rpi4_10 + " + + DOCUMENT_LIST=" +@@ -1877,12 +1886,15 @@ FEATURE_LIST=" + gray + hardcoded_tables + omx_rpi ++ rpi + runtime_cpudetect + safe_bitstream_reader ++ sand + shared + small + static + swscale_alpha ++ vout_drm + " + + # this list should be kept in linking order +@@ -1923,6 +1935,7 @@ SUBSYSTEM_LIST=" + pixelutils + network + rdft ++ rpi + " + + # COMPONENT_LIST needs to come last to ensure correct dependency checking +@@ -2405,9 +2418,11 @@ CONFIG_EXTRA=" + rangecoder + riffdec + riffenc ++ rpi + rtpdec + 
rtpenc_chain + rv34dsp ++ sand + scene_sad + sinewin + snappy +@@ -2737,6 +2752,8 @@ hap_decoder_select="snappy texturedsp" + hap_encoder_deps="libsnappy" + hap_encoder_select="texturedspenc" + hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp" ++hevc_rpi_decoder_deps="rpi" ++hevc_rpi_decoder_select="hevc_decoder sand" + huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" + huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" + hymt_decoder_select="huffyuv_decoder" +@@ -2903,6 +2920,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" + dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" + ffnvcodec_deps_any="libdl LoadLibrary" + nvdec_deps="ffnvcodec" ++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" + vaapi_x11_deps="xlib" + videotoolbox_hwaccel_deps="videotoolbox pthreads" + videotoolbox_hwaccel_extralibs="-framework QuartzCore" +@@ -2920,6 +2938,8 @@ h264_dxva2_hwaccel_deps="dxva2" + h264_dxva2_hwaccel_select="h264_decoder" + h264_nvdec_hwaccel_deps="nvdec" + h264_nvdec_hwaccel_select="h264_decoder" ++h264_v4l2request_hwaccel_deps="v4l2_request" ++h264_v4l2request_hwaccel_select="h264_decoder" + h264_vaapi_hwaccel_deps="vaapi" + h264_vaapi_hwaccel_select="h264_decoder" + h264_vdpau_hwaccel_deps="vdpau" +@@ -2934,6 +2954,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" + hevc_dxva2_hwaccel_select="hevc_decoder" + hevc_nvdec_hwaccel_deps="nvdec" + hevc_nvdec_hwaccel_select="hevc_decoder" ++hevc_rpi4_10_hwaccel_deps="rpi" ++hevc_rpi4_10_hwaccel_select="hevc_decoder" ++hevc_rpi4_8_hwaccel_deps="rpi" ++hevc_rpi4_8_hwaccel_select="hevc_decoder" ++hevc_v4l2request_hwaccel_deps="v4l2_request" ++hevc_v4l2request_hwaccel_select="hevc_decoder" + hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" + hevc_vaapi_hwaccel_select="hevc_decoder" + hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" +@@ -2962,6 +2988,8 @@ mpeg2_dxva2_hwaccel_deps="dxva2" + mpeg2_dxva2_hwaccel_select="mpeg2video_decoder" + mpeg2_nvdec_hwaccel_deps="nvdec" + mpeg2_nvdec_hwaccel_select="mpeg2video_decoder" ++mpeg2_v4l2request_hwaccel_deps="v4l2_request mpeg2_v4l2_request" ++mpeg2_v4l2request_hwaccel_select="mpeg2video_decoder" + mpeg2_vaapi_hwaccel_deps="vaapi" + mpeg2_vaapi_hwaccel_select="mpeg2video_decoder" + mpeg2_vdpau_hwaccel_deps="vdpau" +@@ -2992,6 +3020,8 @@ vc1_vdpau_hwaccel_deps="vdpau" + vc1_vdpau_hwaccel_select="vc1_decoder" + vp8_nvdec_hwaccel_deps="nvdec" + vp8_nvdec_hwaccel_select="vp8_decoder" ++vp8_v4l2request_hwaccel_deps="v4l2_request" ++vp8_v4l2request_hwaccel_select="vp8_decoder" + vp8_vaapi_hwaccel_deps="vaapi" + vp8_vaapi_hwaccel_select="vp8_decoder" + vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9" +@@ -3002,6 +3032,8 @@ vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9" + vp9_dxva2_hwaccel_select="vp9_decoder" + vp9_nvdec_hwaccel_deps="nvdec" + vp9_nvdec_hwaccel_select="vp9_decoder" ++vp9_v4l2request_hwaccel_deps="v4l2_request" ++vp9_v4l2request_hwaccel_select="vp9_decoder" + vp9_vaapi_hwaccel_deps="vaapi VADecPictureParameterBufferVP9_bit_depth" + vp9_vaapi_hwaccel_select="vp9_decoder" + vp9_vdpau_hwaccel_deps="vdpau VdpPictureInfoVP9" +@@ -3403,6 +3435,9 @@ v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_indev_suggest="libv4l2" + v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_outdev_suggest="libv4l2" ++vout_drm_outdev_deps="libdrm vout_drm" ++vout_rpi_outdev_deps="rpi" ++vout_rpi_outdev_select="sand" + vfwcap_indev_deps="vfw32 vfwcap_defines" + 
xcbgrab_indev_deps="libxcb" + xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" +@@ -3618,6 +3653,8 @@ tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping" + tonemap_opencl_filter_deps="opencl const_nan" + transpose_opencl_filter_deps="opencl" + transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" ++unsand_filter_deps="rpi" ++unsand_filter_select="sand" + unsharp_opencl_filter_deps="opencl" + uspp_filter_deps="gpl avcodec" + vaguedenoiser_filter_deps="gpl" +@@ -6376,6 +6413,7 @@ enabled libtls && require_pkg_config libtls libtls tls.h tls_configur + enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame && + { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || + die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } ++enabled libudev && require_pkg_config libudev libudev libudev.h udev_new + enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl + enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit + enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 1.3.9" libvmaf.h compute_vmaf +@@ -6430,11 +6468,12 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt + check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto || + die "ERROR: mbedTLS not found"; } + enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; } +-enabled mmal && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || ++( enabled rpi || ++ enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || + { ! 
enabled cross_compile && + add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && + add_ldflags -L/opt/vc/lib/ && +- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } || ++ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } || + die "ERROR: mmal not found" && + check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } + enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do +@@ -6475,6 +6514,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r + { enabled libdrm || + die "ERROR: rkmpp requires --enable-libdrm"; } + } ++enabled v4l2_request && { enabled libdrm || ++ die "ERROR: v4l2-request requires --enable-libdrm"; } && ++ { enabled libudev || ++ die "ERROR: v4l2-request requires --enable-libudev"; } + enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init + + +@@ -6556,6 +6599,13 @@ if enabled v4l2_m2m; then + check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" + fi + ++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns ++check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" ++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" ++check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" ++check_cc vp8_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP8_FRAME;" ++check_cc vp9_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP9_FRAME;" ++ + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete + +diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c +index 2e9448ea2b..faa8501dd0 100644 +--- a/fftools/ffmpeg.c ++++ b/fftools/ffmpeg.c +@@ -2118,8 +2118,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) + ifilter->channel_layout != frame->channel_layout; + break; + case AVMEDIA_TYPE_VIDEO: +- need_reinit |= ifilter->width != frame->width || +- ifilter->height != frame->height; ++ need_reinit |= ifilter->width != av_frame_cropped_width(frame) || ++ ifilter->height != av_frame_cropped_height(frame); + break; + } + +@@ -2367,6 +2367,8 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + if (ist->dec_ctx->codec_id == AV_CODEC_ID_H264) { + ist->st->codecpar->video_delay = ist->dec_ctx->has_b_frames; + } else ++ { ++#if 0 + av_log(ist->dec_ctx, AV_LOG_WARNING, + "video_delay is larger in decoder than demuxer %d > %d.\n" + "If you want to help, upload a sample " +@@ -2374,6 +2376,8 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + "and contact the ffmpeg-devel mailing list. 
(ffmpeg-devel@ffmpeg.org)\n", + ist->dec_ctx->has_b_frames, + ist->st->codecpar->video_delay); ++#endif ++ } + } + + if (ret != AVERROR_EOF) +@@ -2400,8 +2404,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + decoded_frame->top_field_first = ist->top_field_first; + + ist->frames_decoded++; +- +- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { ++ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { + err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); + if (err < 0) + goto fail; +@@ -2913,6 +2916,15 @@ static int init_input_stream(int ist_index, char *error, int error_len) + return ret; + } + ++#if CONFIG_HEVC_RPI_DECODER ++ ret = -1; ++ if (strcmp(codec->name, "hevc_rpi") == 0 && ++ (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { ++ ist->dec = codec = avcodec_find_decoder_by_name("hevc"); ++ av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n"); ++ } ++ if (ret < 0) ++#endif + if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { + if (ret == AVERROR_EXPERIMENTAL) + abort_codec_experimental(codec, 0); +diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h +index 828cb2a4ff..55d4db293e 100644 +--- a/fftools/ffmpeg.h ++++ b/fftools/ffmpeg.h +@@ -61,6 +61,7 @@ enum HWAccelID { + HWACCEL_GENERIC, + HWACCEL_VIDEOTOOLBOX, + HWACCEL_QSV, ++ HWACCEL_RPI, + }; + + typedef struct HWAccel { +@@ -590,6 +591,7 @@ extern int video_sync_method; + extern float frame_drop_threshold; + extern int do_benchmark; + extern int do_benchmark_all; ++extern int no_cvt_hw; + extern int do_deinterlace; + extern int do_hex_dump; + extern int do_pkt_dump; +@@ -653,6 +655,7 @@ int ffmpeg_parse_options(int argc, char **argv); + + int videotoolbox_init(AVCodecContext *s); + int qsv_init(AVCodecContext *s); ++int rpi_init(AVCodecContext *s); + + HWDevice *hw_device_get_by_name(const char *name); + int hw_device_init_from_string(const char *arg, HWDevice **dev); +diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c +index 422e1268e9..deb89c076d 100644 +--- a/fftools/ffmpeg_filter.c ++++ b/fftools/ffmpeg_filter.c +@@ -1186,8 +1186,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) + + ifilter->format = frame->format; + +- ifilter->width = frame->width; +- ifilter->height = frame->height; ++ ifilter->width = av_frame_cropped_width(frame); ++ ifilter->height = av_frame_cropped_height(frame); + ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; + + ifilter->sample_rate = frame->sample_rate; +diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c +index 2eb4e1c973..98207be2e2 100644 +--- a/fftools/ffmpeg_opt.c ++++ b/fftools/ffmpeg_opt.c +@@ -130,12 +130,22 @@ static const char *opt_name_enc_time_bases[] = {"enc_time_base", NULL + }\ + } + ++#if CONFIG_RPI ++int rpi_init(AVCodecContext *avctx) { ++ return 0; ++} ++#endif ++ + const HWAccel hwaccels[] = { + #if CONFIG_VIDEOTOOLBOX + { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX }, + #endif + #if CONFIG_LIBMFX + { "qsv", qsv_init, HWACCEL_QSV, AV_PIX_FMT_QSV }, ++#endif ++#if CONFIG_RPI ++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 }, ++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 }, + #endif + { 0 }, + }; +@@ -155,6 +165,7 @@ float frame_drop_threshold = 0; + int do_deinterlace = 0; + int do_benchmark = 0; + int do_benchmark_all = 0; ++int no_cvt_hw = 0; + int do_hex_dump = 0; + int do_pkt_dump = 0; + 
int copy_ts = 0; +@@ -755,7 +766,9 @@ static AVCodec *choose_decoder(OptionsContext *o, AVFormatContext *s, AVStream * + st->codecpar->codec_id = codec->id; + return codec; + } else ++ { + return avcodec_find_decoder(st->codecpar->codec_id); ++ } + } + + /* Add all the streams from the given input file to the global +@@ -3460,6 +3473,8 @@ const OptionDef options[] = { + "add timings for benchmarking" }, + { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all }, + "add timings for each task" }, ++ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw }, ++ "do not auto-convert hw frames to sw" }, + { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress }, + "write program-readable progress information", "url" }, + { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 5a6ea59715..c9d056101d 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h \ + mediacodec.h \ + packet.h \ + qsv.h \ ++ rpi_zc.h \ + vaapi.h \ + vdpau.h \ + version.h \ +@@ -138,6 +139,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o + OBJS-$(CONFIG_QSVENC) += qsvenc.o + OBJS-$(CONFIG_RANGECODER) += rangecoder.o + OBJS-$(CONFIG_RDFT) += rdft.o ++OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o + OBJS-$(CONFIG_RV34DSP) += rv34dsp.o + OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o + OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o +@@ -153,6 +155,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o + OBJS-$(CONFIG_VP56DSP) += vp56dsp.o + OBJS-$(CONFIG_VP8DSP) += vp8dsp.o + OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o ++OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_request.o v4l2_phase.o + OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o + OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o + +@@ -381,6 +384,15 @@ OBJS-$(CONFIG_HCOM_DECODER) += hcom.o + OBJS-$(CONFIG_HEVC_DECODER) += hevcdec.o hevc_mvs.o \ + hevc_cabac.o hevc_refs.o hevcpred.o \ + hevcdsp.o hevc_filter.o hevc_data.o ++OBJS-$(CONFIG_RPI) += rpi_mem.o \ ++ rpi_mailbox.o rpi_zc.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \ ++ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \ ++ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \ ++ rpi_hevc_shader.o rpi_hevc_shader_template.o \ ++ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \ ++ rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o ++OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuvid.o + OBJS-$(CONFIG_HEVC_AMF_ENCODER) += amfenc_hevc.o + OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuviddec.o + OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o +@@ -902,6 +914,7 @@ OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o + OBJS-$(CONFIG_H264_DXVA2_HWACCEL) += dxva2_h264.o + OBJS-$(CONFIG_H264_NVDEC_HWACCEL) += nvdec_h264.o + OBJS-$(CONFIG_H264_QSV_HWACCEL) += qsvdec_h2645.o ++OBJS-$(CONFIG_H264_V4L2REQUEST_HWACCEL) += v4l2_request_h264.o + OBJS-$(CONFIG_H264_VAAPI_HWACCEL) += vaapi_h264.o + OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o + OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o +@@ -909,8 +922,11 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o + OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec_h2645.o ++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o + OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o ++OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o 
++OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o + OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o + OBJS-$(CONFIG_MJPEG_VAAPI_HWACCEL) += vaapi_mjpeg.o + OBJS-$(CONFIG_MPEG1_NVDEC_HWACCEL) += nvdec_mpeg12.o +@@ -921,6 +937,7 @@ OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL) += dxva2_mpeg2.o + OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o + OBJS-$(CONFIG_MPEG2_NVDEC_HWACCEL) += nvdec_mpeg12.o + OBJS-$(CONFIG_MPEG2_QSV_HWACCEL) += qsvdec_other.o ++OBJS-$(CONFIG_MPEG2_V4L2REQUEST_HWACCEL) += v4l2_request_mpeg2.o + OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o + OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o + OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o +@@ -936,10 +953,12 @@ OBJS-$(CONFIG_VC1_QSV_HWACCEL) += qsvdec_other.o + OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o + OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o + OBJS-$(CONFIG_VP8_NVDEC_HWACCEL) += nvdec_vp8.o ++OBJS-$(CONFIG_VP8_V4L2REQUEST_HWACCEL) += v4l2_request_vp8.o + OBJS-$(CONFIG_VP8_VAAPI_HWACCEL) += vaapi_vp8.o + OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL) += dxva2_vp9.o + OBJS-$(CONFIG_VP9_DXVA2_HWACCEL) += dxva2_vp9.o + OBJS-$(CONFIG_VP9_NVDEC_HWACCEL) += nvdec_vp9.o ++OBJS-$(CONFIG_VP9_V4L2REQUEST_HWACCEL) += v4l2_request_vp9.o + OBJS-$(CONFIG_VP9_VAAPI_HWACCEL) += vaapi_vp9.o + OBJS-$(CONFIG_VP9_VDPAU_HWACCEL) += vdpau_vp9.o + OBJS-$(CONFIG_VP8_QSV_HWACCEL) += qsvdec_other.o +@@ -1261,3 +1280,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h + $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h + $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h + endif ++ ++ifdef CONFIG_HEVC_RPI_DECODER ++QASM_PY := ../local/bin/qasm.py ++VASMVIDCORE := ../local/bin/vasmvidcore_std ++ ++ifneq ("$(wildcard $(QASM_PY))","") ++$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm ++ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ ++ ++$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm ++ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ ++endif ++ ++ifneq ("$(wildcard $(VASMVIDCORE))","") ++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ ++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ ++ ++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin ++ python pi-util/make_array.py $< ++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin ++ python pi-util/make_array.py $< ++endif ++ ++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h ++$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h ++endif +diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c +index 80f128cade..ac4cf9a90e 100644 +--- a/libavcodec/allcodecs.c ++++ b/libavcodec/allcodecs.c +@@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder; + extern AVCodec ff_hevc_decoder; + extern AVCodec ff_hevc_qsv_decoder; + extern AVCodec ff_hevc_rkmpp_decoder; ++extern AVCodec ff_hevc_rpi_decoder; + extern AVCodec ff_hevc_v4l2m2m_decoder; + extern AVCodec ff_hnm4_video_decoder; + extern AVCodec ff_hq_hqa_decoder; +@@ -890,6 +891,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id) + } + } + ++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) ++{ ++ const enum AVPixelFormat *pf = p->pix_fmts; ++ ++ // Assume good if we lack info ++ if (pf == NULL) ++ return 1; ++ if (fmt == AV_PIX_FMT_NONE) ++ return 0; ++ ++ for (; 
*pf != AV_PIX_FMT_NONE; ++pf) { ++ if (*pf == fmt) ++ return 1; ++ } ++ return 0; ++} ++ ++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt) ++{ ++ const AVCodec *p, *experimental = NULL; ++ void *i = 0; ++ ++ id= remap_deprecated_codec_id(id); ++ while ((p = av_codec_iterate(&i))) { ++ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) { ++ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) { ++ experimental = p; ++ } else ++ return (AVCodec *)p; ++ } ++ p = p->next; ++ } ++ return (AVCodec *)experimental; ++} ++ + static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *)) + { + const AVCodec *p, *experimental = NULL; +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index c6be814153..442d60efe4 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ + arm/sbrdsp_init_arm.o + OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o + OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \ ++ arm/rpi_hevcpred_init_arm.o + OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o + OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o + OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o +@@ -140,10 +142,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o + NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o + NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ ++ arm/hevcdsp_idct_neon.o \ + arm/hevcdsp_deblock_neon.o \ + arm/hevcdsp_idct_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o ++NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \ ++ arm/rpi_hevc_misc_neon.o \ ++ arm/rpi_hevcdsp_deblock_neon.o \ ++ arm/rpi_hevcdsp_idct_neon.o \ ++ arm/rpi_hevcdsp_res8_neon.o \ ++ arm/rpi_hevcdsp_res16_neon.o \ ++ arm/rpi_hevcdsp_sao_neon.o \ ++ arm/rpi_hevcpred_init_neon.o \ ++ arm/rpi_hevcpred_intra_angular_neon.o \ ++ arm/rpi_hevcpred_intra_dc_neon.o \ ++ arm/rpi_hevcpred_intra_filter_neon.o \ ++ arm/rpi_hevcpred_intra_hv_neon.o \ ++ arm/rpi_hevcpred_intra_planar_neon.o + NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o + NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ + arm/rv40dsp_neon.o +diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h +index fdbf86b45e..4755f20e2e 100644 +--- a/libavcodec/arm/cabac.h ++++ b/libavcodec/arm/cabac.h +@@ -26,83 +26,209 @@ + #include "libavutil/internal.h" + #include "libavcodec/cabac.h" + ++ + #define get_cabac_inline get_cabac_inline_arm + static av_always_inline int get_cabac_inline_arm(CABACContext *c, +- uint8_t *const state) ++ uint8_t *state) + { +- int bit; +- void *reg_b, *reg_c, *tmp; ++ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128; ++ int bit, ptr, low, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldr %[bit], [%[c], %[range_off]] \n\t" ++ "ldrb %[ptr], [%[state]] \n\t" ++ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t" ++ "and %[tmp2], %[bit], #0xc0 \n\t" ++ "add %[tmp1], %[tmp1], %[ptr] \n\t" ++ "ldr %[low], [%[c], %[low_off]] \n\t" ++ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t" ++ "sub %[bit], %[bit], %[tmp2] \n\t" ++ "mov %[tmp1], %[bit] \n\t" ++ "cmp %[low], %[bit], lsl #17 \n\t" ++ "itt ge \n\t" ++ "movge %[tmp1], %[tmp2] \n\t" ++ "mvnge %[ptr], %[ptr] \n\t" ++ "clz %[tmp2], %[tmp1] \n\t" ++ "it ge \n\t" ++ "subge %[low], 
%[low], %[bit], lsl #17 \n\t" ++ "sub %[tmp2], %[tmp2], #23 \n\t" ++ "and %[bit], %[ptr], #1 \n\t" ++ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t" ++ "lsl %[low], %[low], %[tmp2] \n\t" ++ "lsls %[ptr], %[low], #16 \n\t" ++ "bne 1f \n\t" ++ "ldr %[ptr], [%[c], %[ptr_off]] \n\t" ++ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t" ++#if UNCHECKED_BITSTREAM_READER ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "rbit %[state], %[low] \n\t" ++ "ldrh %[tmp1], [%[ptr]], #2 \n\t" ++#else ++ "ldr %[tmp1], [%[c], %[end_off]] \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "rbit %[state], %[low] \n\t" ++ "cmp %[tmp1], %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t" ++#else ++ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t" ++#endif ++#endif ++ "clz %[state], %[state] \n\t" ++ "movw %[mlps_tables], #0xffff \n\t" ++ "sub %[state], %[state], #16 \n\t" ++ "str %[tmp2], [%[c], %[range_off]] \n\t" ++ "rev %[tmp1], %[tmp1] \n\t" ++ "str %[ptr], [%[c], %[ptr_off]] \n\t" ++ "lsr %[tmp1], %[tmp1], #15 \n\t" ++ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t" ++#if CONFIG_THUMB ++ "lsl %[tmp1], %[tmp1], %[state] \n\t" ++ "add %[low], %[low], %[tmp1] \n\t" ++#else ++ "add %[low], %[low], %[tmp1], lsl %[state] \n\t" ++#endif ++ "str %[low], [%[c], %[low_off]] \n\t" ++ "b 2f \n\t" ++ "1: \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t" ++ "str %[low], [%[c], %[low_off]] \n\t" ++ "str %[tmp1], [%[c], %[range_off]] \n\t" ++ "2: \n\t" ++ : // Outputs ++ [state]"+r"(state), ++ [mlps_tables]"+r"(mlps_tables), ++ [bit]"=&r"(bit), ++ [ptr]"=&r"(ptr), ++ [low]"=&r"(low), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ return bit; ++} + +- __asm__ volatile( +- "ldrb %[bit] , [%[state]] \n\t" +- "add %[r_b] , %[tables] , %[lps_off] \n\t" +- "mov %[tmp] , %[range] \n\t" +- "and %[range] , %[range] , #0xC0 \n\t" +- "add %[r_b] , %[r_b] , %[bit] \n\t" +- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t" +- "add %[r_b] , %[tables] , %[norm_off] \n\t" +- "sub %[r_c] , %[tmp] , %[range] \n\t" +- "lsl %[tmp] , %[r_c] , #17 \n\t" +- "cmp %[tmp] , %[low] \n\t" +- "it gt \n\t" +- "movgt %[range] , %[r_c] \n\t" +- "itt cc \n\t" +- "mvncc %[bit] , %[bit] \n\t" +- "subcc %[low] , %[low] , %[tmp] \n\t" +- "add %[r_c] , %[tables] , %[mlps_off] \n\t" +- "ldrb %[tmp] , [%[r_b], %[range]] \n\t" +- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t" +- "lsl %[low] , %[low] , %[tmp] \n\t" +- "lsl %[range] , %[range] , %[tmp] \n\t" +- "uxth %[r_c] , %[low] \n\t" +- "strb %[r_b] , [%[state]] \n\t" +- "tst %[r_c] , %[r_c] \n\t" +- "bne 2f \n\t" +- "ldr %[r_c] , [%[c], %[byte]] \n\t" ++#define get_cabac_bypass get_cabac_bypass_arm ++static inline int get_cabac_bypass_arm(CABACContext * const c) ++{ ++ uint32_t low = c->low, range, ptr, tmp; ++ int rv; ++ __asm volatile ( ++ "ldr %[range] , [%[c], %[range_off]] \n\t" ++ "mov %[rv] , #0 \n\t" ++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "lsl %[low] , #1 \n\t" ++#if !UNCHECKED_BITSTREAM_READER ++ "ldr %[tmp] , [%[c], %[end_off]] \n\t" ++#endif ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "itt cs \n\t" ++ "subcs %[low] , %[low], %[range], lsl #17 \n\t" ++ "movcs %[rv] , #1 \n\t" + #if UNCHECKED_BITSTREAM_READER +- "ldrh 
%[tmp] , [%[r_c]] \n\t" +- "add %[r_c] , %[r_c] , #2 \n\t" +- "str %[r_c] , [%[c], %[byte]] \n\t" ++ "ldrh %[tmp] , [%[ptr]], #2 \n\t" ++#else ++ "cmp %[tmp] , %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" + #else +- "ldr %[r_b] , [%[c], %[end]] \n\t" +- "ldrh %[tmp] , [%[r_c]] \n\t" +- "cmp %[r_c] , %[r_b] \n\t" +- "itt lt \n\t" +- "addlt %[r_c] , %[r_c] , #2 \n\t" +- "strlt %[r_c] , [%[c], %[byte]] \n\t" ++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" ++#endif + #endif +- "sub %[r_c] , %[low] , #1 \n\t" +- "add %[r_b] , %[tables] , %[norm_off] \n\t" +- "eor %[r_c] , %[low] , %[r_c] \n\t" +- "rev %[tmp] , %[tmp] \n\t" +- "lsr %[r_c] , %[r_c] , #15 \n\t" +- "lsr %[tmp] , %[tmp] , #15 \n\t" +- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t" +- "movw %[r_b] , #0xFFFF \n\t" +- "sub %[tmp] , %[tmp] , %[r_b] \n\t" +- "rsb %[r_c] , %[r_c] , #7 \n\t" +- "lsl %[tmp] , %[tmp] , %[r_c] \n\t" +- "add %[low] , %[low] , %[tmp] \n\t" +- "2: \n\t" +- : [bit]"=&r"(bit), +- [low]"+&r"(c->low), +- [range]"+&r"(c->range), +- [r_b]"=&r"(reg_b), +- [r_c]"=&r"(reg_c), +- [tmp]"=&r"(tmp) +- : [c]"r"(c), +- [state]"r"(state), +- [tables]"r"(ff_h264_cabac_tables), +- [byte]"M"(offsetof(CABACContext, bytestream)), +- [end]"M"(offsetof(CABACContext, bytestream_end)), +- [norm_off]"I"(H264_NORM_SHIFT_OFFSET), +- [lps_off]"I"(H264_LPS_RANGE_OFFSET), +- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) +- : "memory", "cc" +- ); ++ "lsls %[range] , %[low], #16 \n\t" ++ "bne 1f \n\t" + +- return bit & 1; ++ "str %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "add %[low] , %[low], %[tmp], lsr #15 \n\t" ++ "movw %[tmp] , 0xFFFF \n\t" ++ "sub %[low] , %[tmp] \n\t" ++ "1: \n\t" ++ "str %[low] , [%[c], %[low_off]] \n\t" ++ : // Outputs ++ [rv]"=&r"(rv), ++ [low]"+r"(low), ++ [range]"=&r"(range), ++ [ptr]"=&r"(ptr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)) ++ : // Clobbers ++ "memory", "cc" ++ ); ++ return rv; + } ++ ++ ++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm ++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv) ++{ ++ uint32_t low = c->low, range, ptr, tmp; ++ __asm volatile ( ++ "ldr %[range] , [%[c], %[range_off]] \n\t" ++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "lsl %[low] , #1 \n\t" ++#if !UNCHECKED_BITSTREAM_READER ++ "ldr %[tmp] , [%[c], %[end_off]] \n\t" ++#endif ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "it cs \n\t" ++ "subcs %[low] , %[low], %[range], lsl #17 \n\t" ++ "it cc \n\t" ++ "rsbcc %[rv] , %[rv], #0 \n\t" ++#if UNCHECKED_BITSTREAM_READER ++ "ldrh %[tmp] , [%[ptr]], #2 \n\t" ++#else ++ "cmp %[tmp] , %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" ++#else ++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" ++#endif ++#endif ++ "lsls %[range] , %[low], #16 \n\t" ++ "bne 1f \n\t" ++ ++ "str %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "add %[low] , %[low], %[tmp], lsr #15 \n\t" ++ "movw %[tmp] , 0xFFFF \n\t" ++ "sub %[low] , %[tmp] \n\t" ++ "1: \n\t" ++ "str %[low] , [%[c], %[low_off]] \n\t" ++ : // Outputs ++ [rv]"+r"(rv), ++ [low]"+r"(low), ++ [range]"=&r"(range), ++ [ptr]"=&r"(ptr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, 
bytestream)),
++ [end_off]"J"(offsetof(CABACContext, bytestream_end))
++ : // Clobbers
++ "memory", "cc"
++ );
++ return rv;
++}
++
+ #endif /* HAVE_ARMV6T2_INLINE */
+
+ #endif /* AVCODEC_ARM_CABAC_H */
+diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h
+new file mode 100644
+index 0000000000..c88dec6eff
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_cabac.h
+@@ -0,0 +1,607 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVC_CABAC_H
++#define AVCODEC_ARM_HEVC_CABAC_H
++
++#include "config.h"
++#if HAVE_ARMV6T2_INLINE
++
++#define hevc_mem_bits32 hevc_mem_bits32_arm
++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
++{
++ unsigned int n;
++ __asm__ (
++ "rev %[n], %[x] \n\t"
++ : [n]"=r"(n)
++ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
++ :
++ );
++ return n << (bits & 7);
++}
++
++
++// ---------------------------------------------------------------------------
++//
++// Helper fns - little bits of code where ARM has an instruction that the
++// compiler doesn't know about / use
++
++#define trans_scale_sat trans_scale_sat_arm
++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{
++ int rv;
++ int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
++
++ __asm__ (
++ "ssat %[rv], #16, %[t], ASR #1 \n\t"
++ : [rv]"=r"(rv)
++ : [t]"r"(t)
++ :
++ );
++ return rv;
++}
++
++#define update_rice update_rice_arm
++static inline void update_rice_arm(uint8_t * const stat_coeff,
++ const unsigned int last_coeff_abs_level_remaining,
++ const unsigned int c_rice_param)
++{
++ int t = last_coeff_abs_level_remaining << 1;
++ __asm__ (
++ "lsrs %[t], %[t], %[shift] \n\t"
++
++ "it eq \n\t"
++ "subeq %[stat], %[stat], #1 \n\t"
++ "cmp %[t], #6 \n\t"
++ "adc %[stat], %[stat], #0 \n\t"
++ "usat %[stat], #8, %[stat] \n\t"
++ : [stat]"+r"(*stat_coeff),
++ [t]"+r"(t)
++ : [shift]"r"(c_rice_param)
++ : "cc"
++ );
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC get loops
++//
++// Where the loop is simple enough we can normally do 10-30% better than the
++// compiler
++
++// Get the residual greater than 1 bits
++
++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
++ uint8_t * const state0)
++{
++ unsigned int i, reg_b, st, tmp, bit, rv;
++ __asm__ (
++ "mov %[i] , #0 \n\t"
++ "mov %[rv] , #0 \n\t"
++ "1: \n\t"
++ "add %[i] , %[i] , #1 \n\t"
++ "cmp %[rv] , #0 \n\t"
++ "ite eq \n\t"
++ "usateq %[st] , #2 , %[i] \n\t"
++ "movne %[st] , #0 
\n\t" ++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" ++ "and %[tmp] , %[range] , #0xC0 \n\t" ++ ++ "ldrb %[bit] , [%[state0], %[st]] \n\t" ++ "add %[r_b] , %[r_b] , %[bit] \n\t" ++ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" ++ "sub %[range] , %[range] , %[tmp] \n\t" ++ ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "ittt ge \n\t" ++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" ++ "movge %[range] , %[tmp] \n\t" ++ "mvnge %[bit] , %[bit] \n\t" ++ ++ "clz %[tmp] , %[range] \n\t" ++ "sub %[tmp] , #23 \n\t" ++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" ++ "and %[bit] , %[bit] , #1 \n\t" ++ "strb %[r_b] , [%[state0], %[st]] \n\t" ++ "lsl %[low] , %[low] , %[tmp] \n\t" ++ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" ++ "lsl %[range] , %[range] , %[tmp] \n\t" ++ ++// There is a small speed gain from combining both conditions, using a single ++// branch and then working out what that meant later ++ "lsls %[tmp] , %[low] , #16 \n\t" ++ "it ne \n\t" ++ "cmpne %[n] , %[i] \n\t" ++ "bne 1b \n\t" ++ ++// If reload is not required then we must have run out of flags to decode ++ "tst %[tmp] , %[tmp] \n\t" ++ "bne 2f \n\t" ++ ++// Do reload ++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" ++ "rbit %[bit] , %[low] \n\t" ++ "movw %[r_b] , #0xFFFF \n\t" ++ "clz %[bit] , %[bit] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "sub %[bit] , %[bit] , #16 \n\t" ++ "cmp %[n] , %[i] \n\t" ++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" ++ ++#if CONFIG_THUMB ++ "lsl %[tmp] , %[tmp] , %[bit] \n\t" ++ "add %[low] , %[low] , %[tmp] \n\t" ++#else ++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" ++#endif ++ ++ "bne 1b \n\t" ++ "2: \n\t" ++ : [bit]"=&r"(bit), ++ [low]"+r"(c->low), ++ [range]"+r"(c->range), ++ [r_b]"=&r"(reg_b), ++ [bptr]"+r"(c->bytestream), ++ [i]"=&r"(i), ++ [tmp]"=&r"(tmp), ++ [st]"=&r"(st), ++ [rv]"=&r"(rv) ++ : [state0]"r"(state0), ++ [n]"r"(n), ++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : "memory", "cc" ++ ); ++ return rv; ++} ++ ++ ++// n must be > 0 on entry ++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm ++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0, ++ unsigned int n, ++ const uint8_t * ctx_map, ++ uint8_t * p) ++{ ++ unsigned int reg_b, tmp, st, bit; ++ __asm__ ( ++// Get bin from map ++#if CONFIG_THUMB ++ "add %[ctx_map] , %[n] \n\t" ++ "ldrb %[st] , [%[ctx_map]] \n\t" ++#else ++ "ldrb %[st] , [%[ctx_map], %[n]]! \n\t" ++#endif ++ "1: \n\t" ++ ++// Load state & ranges ++ "ldrb %[bit] , [%[state0], %[st]] \n\t" ++ "and %[tmp] , %[range] , #0xC0 \n\t" ++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" ++ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t" ++ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t" ++ "sub %[range] , %[range] , %[tmp] \n\t" ++ ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "ittt ge \n\t" ++ "mvnge %[bit] , %[bit] \n\t" ++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" ++ "movge %[range] , %[tmp] \n\t" ++ ++// Renorm ++ "clz %[tmp] , %[range] \n\t" ++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" ++ "sub %[tmp] , #23 \n\t" ++ "strb %[r_b] , [%[state0], %[st]] \n\t" ++ "tst %[bit] , #1 \n\t" ++ "ldrb %[st] , [%[ctx_map], #-1]! 
\n\t" ++ "lsl %[low] , %[low] , %[tmp] \n\t" ++// GCC asm seems to need strbne written differently for thumb and arm ++#if CONFIG_THUMB ++ "it ne \n\t" ++ "strbne %[n] , [%[idx]] , #1 \n\t" ++#else ++ "strneb %[n] , [%[idx]] , #1 \n\t" ++#endif ++ ++// There is a small speed gain from combining both conditions, using a single ++// branch and then working out what that meant later ++ "subs %[n] , %[n] , #1 \n\t" ++ "lsl %[range] , %[range] , %[tmp] \n\t" ++#if CONFIG_THUMB ++ "itt ne \n\t" ++ "lslsne %[tmp] , %[low] , #16 \n\t" ++#else ++ "lslnes %[tmp] , %[low] , #16 \n\t" ++#endif ++ "bne 1b \n\t" ++ ++// If we have bits left then n must be 0 so give up now ++ "lsls %[tmp] , %[low] , #16 \n\t" ++ "bne 2f \n\t" ++ ++// Do reload ++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" ++ "rbit %[bit] , %[low] \n\t" ++ "movw %[r_b] , #0xFFFF \n\t" ++ "clz %[bit] , %[bit] \n\t" ++ "cmp %[n] , #0 \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "sub %[bit] , %[bit] , #16 \n\t" ++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" ++ ++#if CONFIG_THUMB ++ "lsl %[tmp] , %[tmp] , %[bit] \n\t" ++ "add %[low] , %[low] , %[tmp] \n\t" ++#else ++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" ++#endif ++ ++// Check to see if we still have more to do ++ "bne 1b \n\t" ++ "2: \n\t" ++ : [bit]"=&r"(bit), ++ [low]"+r"(c->low), ++ [range]"+r"(c->range), ++ [r_b]"=&r"(reg_b), ++ [bptr]"+r"(c->bytestream), ++ [idx]"+r"(p), ++ [n]"+r"(n), ++ [tmp]"=&r"(tmp), ++ [st]"=&r"(st), ++ [ctx_map]"+r"(ctx_map) ++ : [state0]"r"(state0), ++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : "memory", "cc" ++ ); ++ ++ return p; ++} ++ ++// --------------------------------------------------------------------------- ++// ++// CABAC_BY22 functions ++ ++ ++#define get_cabac_by22_start get_cabac_by22_start_arm ++static inline void get_cabac_by22_start_arm(CABACContext * const c) ++{ ++ const uint8_t *ptr = c->bytestream; ++ register uint32_t low __asm__("r1"), range __asm__("r2"); ++ uint32_t m, range8, bits; ++#if !USE_BY22_DIV ++ uintptr_t inv; ++#endif ++ ++ av_assert2(offsetof (CABACContext, low) == 0); ++ av_assert2(offsetof (CABACContext, range) == 4); ++ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); ++ __asm__ volatile ( ++ "ldmia %[c], {%[low], %[range]} \n\t" ++ : // Outputs ++ [low]"=r"(low), ++ [range]"=r"(range) ++ : // Inputs ++ [c]"r"(c) ++ : // Clobbers ++ ); ++#if !USE_BY22_DIV ++ inv = (uintptr_t)cabac_by22_inv_range; ++#endif ++ __asm__ volatile ( ++ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" ++#if !USE_BY22_DIV ++ "uxtb %[range8], %[range] \n\t" ++#endif ++ "rbit %[bits], %[low] \n\t" ++ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t" ++ "clz %[bits], %[bits] \n\t" ++ "str %[ptr], [%[c], %[ptr_off]] \n\t" ++ "rev %[m], %[m] \n\t" ++ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" ++ "eor %[m], %[m], #0x80000000 \n\t" ++#if !USE_BY22_DIV ++ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" ++ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" ++ "str %[range], [%[c], %[bits_off]] \n\t" ++#else ++ "strh %[bits], [%[c], %[bits_off]] \n\t" ++#endif ++#if CONFIG_THUMB ++ "lsr %[m], %[ptr] \n\t" ++ "eor %[range], %[low], %[m] \n\t" ++#else ++ "eor %[range], %[low], %[m], lsr %[ptr] \n\t" ++#endif ++ : // Outputs ++ [ptr]"+&r"(ptr), ++ [low]"+&r"(low), ++ [range]"+&r"(range), ++#if !USE_BY22_DIV ++ [inv]"+&r"(inv), ++#endif ++ [m]"=&r"(m), ++ [range8]"=&r"(range8), ++ 
[bits]"=&r"(bits) ++ : // Inputs ++ [c]"r"(c), ++ [bits_off]"J"(offsetof (CABACContext, by22.bits)), ++ [ptr_off]"J"(offsetof (CABACContext, bytestream)) ++ : // Clobbers ++ "memory" ++ ); ++ c->low = range; ++#if !USE_BY22_DIV ++ c->range = inv; ++#endif ++} ++ ++#define get_cabac_by22_peek get_cabac_by22_peek_arm ++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) ++{ ++ uint32_t rv = c->low &~ 1, tmp; ++ __asm__ ( ++ "cmp %[inv] , #0 \n\t" ++ "it ne \n\t" ++ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" ++ : // Outputs ++ [rv]"+r"(rv), ++ [tmp]"=r"(tmp) ++ : // Inputs ++ [inv]"r"(c->range) ++ : // Clobbers ++ "cc" ++ ); ++ return rv << 1; ++} ++ ++#define get_cabac_by22_flush get_cabac_by22_flush_arm ++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) ++{ ++ uint32_t bits, ptr, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldrh %[bits], [%[cc], %[bits_off]] \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "rsb %[tmp1], %[n], #32 \n\t" ++ "add %[bits], %[bits], %[n] \n\t" ++ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" ++ "lsr %[tmp1], %[val], %[tmp1] \n\t" ++ "ldr %[val], [%[cc], %[low_off]] \n\t" ++#if CONFIG_THUMB ++ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t" ++ "ldr %[ptr], [%[ptr]] \n\t" ++#else ++ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" ++#endif ++ "mul %[tmp1], %[tmp2], %[tmp1] \n\t" ++ "and %[tmp2], %[bits], #7 \n\t" ++ "strh %[bits], [%[cc], %[bits_off]] \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[tmp1], %[tmp1], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[val], %[n] \n\t" ++ "sub %[val], %[tmp1] \n\t" ++#else ++ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" ++#endif ++ "lsl %[ptr], %[ptr], %[tmp2] \n\t" ++ "orr %[val], %[val], %[ptr], lsr #9 \n\t" ++ "str %[val], [%[cc], %[low_off]] \n\t" ++ : // Outputs ++ [val]"+r"(val), ++ [bits]"=&r"(bits), ++ [ptr]"=&r"(ptr), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [cc]"r"(c), ++ [n]"r"(n), ++ [bits_off]"J"(offsetof(CABACContext, by22.bits)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [range_off]"J"(offsetof(CABACContext, by22.range)), ++ [low_off]"J"(offsetof(CABACContext, low)) ++ : // Clobbers ++ "memory" ++ ); ++} ++ ++#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm ++static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) ++{ ++ uint32_t last_coeff_abs_level_remaining; ++ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldr %[remain], [%[cc], %[low_off]] \n\t" ++ "ldr %[prefix], [%[cc], %[range_off]] \n\t" ++ "bic %[remain], %[remain], #1 \n\t" ++ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "cmp %[prefix], #0 \n\t" ++ "it ne \n\t" ++ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" ++ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" ++ "lsl %[remain], %[remain], #1 \n\t" ++ "mvn %[prefix], %[remain] \n\t" ++ "clz %[prefix], %[prefix] \n\t" ++ "rsbs %[n1], %[prefix], #2 \n\t" ++ "bcc 1f \n\t" ++ "adc %[n1], %[rice], %[prefix] \n\t" ++ "add %[tmp2], %[tmp2], %[n1] \n\t" ++ "rsb %[n2], %[n1], #32 \n\t" ++ "and %[tmp1], %[tmp2], #7 \n\t" ++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "lsr %[tmp2], %[tmp2], #3 \n\t" ++ "lsr %[n2], %[remain], %[n2] \n\t" ++ "mul %[n2], %[range], %[n2] \n\t" ++ "ldr %[range], [%[cc], %[low_off]] \n\t" ++ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" ++ "rsb %[tmp2], %[rice], #31 \n\t" ++ "lsl %[remain], %[remain], %[prefix] \n\t" 
++ "lsl %[n2], %[n2], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[range], %[n1] \n\t" ++ "sub %[range], %[n2] \n\t" ++#else ++ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t" ++#endif ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[n2], %[prefix], %[rice] \n\t" ++#if CONFIG_THUMB ++ "lsr %[remain], %[tmp2] \n\t" ++ "add %[remain], %[n2] \n\t" ++#else ++ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" ++#endif ++ "b 3f \n\t" ++ "1: \n\t" ++ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" ++ "cmp %[n2], %[peek_bits_plus_2] \n\t" ++ "bhi 2f \n\t" ++ "sub %[n1], %[n2], #2 \n\t" ++ "add %[tmp2], %[tmp2], %[n1] \n\t" ++ "rsb %[n2], %[n1], #32 \n\t" ++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "lsr %[tmp1], %[tmp2], #3 \n\t" ++ "lsr %[n2], %[remain], %[n2] \n\t" ++ "mul %[n2], %[range], %[n2] \n\t" ++ "rsb %[range], %[rice], #34 \n\t" ++ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t" ++ "and %[tmp1], %[tmp2], #7 \n\t" ++ "lsl %[remain], %[remain], %[prefix] \n\t" ++ "ldr %[tmp2], [%[cc], %[low_off]] \n\t" ++ "rsb %[prefix], %[prefix], %[range] \n\t" ++ "orr %[remain], %[remain], #0x80000000 \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[n2], %[n2], #23 \n\t" ++ "mov %[range], #2 \n\t" ++#if CONFIG_THUMB ++ "lsl %[tmp2], %[n1] \n\t" ++ "sub %[tmp2], %[n2] \n\t" ++#else ++ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t" ++#endif ++ "lsl %[ptr], %[ptr], %[tmp1] \n\t" ++ "lsl %[rice], %[range], %[rice] \n\t" ++ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" ++#if CONFIG_THUMB ++ "lsr %[remain], %[prefix] \n\t" ++ "add %[remain], %[rice] \n\t" ++#else ++ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" ++#endif ++ "b 4f \n\t" ++ "2: \n\t" ++ "add %[n1], %[tmp2], %[prefix] \n\t" ++#if CONFIG_THUMB ++ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t" ++ "ldr %[tmp2], [%[tmp2]] \n\t" ++#else ++ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t" ++#endif ++ "rsb %[tmp1], %[prefix], #32 \n\t" ++ "push {%[rice]} \n\t" ++ "and %[rice], %[n1], #7 \n\t" ++ "lsr %[tmp1], %[remain], %[tmp1] \n\t" ++ "ldr %[ptr], [%[cc], %[low_off]] \n\t" ++ "mul %[remain], %[range], %[tmp1] \n\t" ++ "rev %[tmp2], %[tmp2] \n\t" ++ "rsb %[n2], %[prefix], %[n2] \n\t" ++ "ldr %[tmp1], [%[cc], %[range_off]] \n\t" ++ "lsl %[rice], %[tmp2], %[rice] \n\t" ++ "sub %[tmp2], %[n2], #2 \n\t" ++ "lsl %[remain], %[remain], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[ptr], %[prefix] \n\t" ++ "rsb %[remain], %[ptr] \n\t" ++#else ++ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t" ++#endif ++ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" ++ "add %[prefix], %[n1], %[tmp2] \n\t" ++ "bic %[n1], %[remain], #1 \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "cmp %[tmp1], #0 \n\t" ++ "rsb %[rice], %[tmp2], #32 \n\t" ++ "it ne \n\t" ++ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" ++ "and %[tmp1], %[prefix], #7 \n\t" ++#if CONFIG_THUMB ++ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t" ++ "ldr %[ptr], [%[ptr]] \n\t" ++#else ++ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t" ++#endif ++ "lsl %[n1], %[n1], #1 \n\t" ++ "lsr %[rice], %[n1], %[rice] \n\t" ++ "rsb %[n2], %[n2], #34 \n\t" ++ "mul %[range], %[range], %[rice] \n\t" ++ "pop {%[rice]} \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "orr %[n1], %[n1], #0x80000000 \n\t" ++ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" ++ "mov %[prefix], #2 \n\t" ++ "lsl %[range], %[range], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[remain], %[tmp2] \n\t" ++ "rsb %[range], %[remain] \n\t" ++#else ++ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t" ++#endif ++ "lsl %[remain], %[prefix], %[rice] \n\t" ++#if CONFIG_THUMB ++ "lsr %[n1], %[n2] \n\t" ++ 
"add %[remain], %[n1] \n\t" ++#else ++ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" ++#endif ++ "3: \n\t" ++ "lsl %[ptr], %[ptr], %[tmp1] \n\t" ++ "orr %[range], %[range], %[ptr], lsr #9 \n\t" ++ "4: \n\t" ++ "str %[range], [%[cc], %[low_off]] \n\t" ++ : // Outputs ++ [remain]"=&r"(last_coeff_abs_level_remaining), ++ [rice]"+r"(rice_param), ++ [prefix]"=&r"(prefix), ++ [n1]"=&r"(n1), ++ [range]"=&r"(range), ++ [n2]"=&r"(n2), ++ [ptr]"=&r"(ptr), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [cc]"r"(c), ++ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)), ++ [by22_range_off]"J"(offsetof(CABACContext, by22.range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ return last_coeff_abs_level_remaining; ++} ++ ++#endif /* HAVE_ARMV6T2_INLINE */ ++ ++#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S +new file mode 100644 +index 0000000000..978b7b6947 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S +@@ -0,0 +1,183 @@ ++/* ++ * ARM NEON optimised IDCT functions for HEVC decoding ++ * Copyright (c) 2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++@ Included multiple times from hevc_idct_neon.S ++@ Macros defined there ++ ++#define DC_SHIFT (15 - BIT_DEPTH) ++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) ++#define TRN_SHIFT (20 - BIT_DEPTH) ++ ++function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q0, r1 ++ vdup.16 q1, r1 ++ vst1.16 {q0, q1}, [r0] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vst1.16 {q8, q9}, [r0], r3 ++ vst1.16 {q8, q9}, [r2], r3 ++ vst1.16 {q8, q9}, [r0] ++ vst1.16 {q8, q9}, [r2] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ mov ip, #16*16 ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++1: vst1.16 {q8, q9}, [r0], r3 ++ subs ip, ip, #32 ++ vst1.16 {q8, q9}, [r2], r3 ++ bhi 1b ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ mov ip, #32*32 ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++1: vst1.16 {q8, q9}, [r0], r3 ++ subs ip, ip, #32 ++ vst1.16 {q8, q9}, [r2], r3 ++ bhi 1b ++ bx lr ++endfunc ++ ++ ++function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 ++ vldr.i32 s0, =0x00240053 // 36 and 83 ++ vld1.16 {q14, q15}, [r0 :256] // coeffs ++ ++ tr4_shift #7 ++ ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q14, q15 ++ ++ tr4_shift #TRN_SHIFT ++ ++ vst4.16 {q14, q15}, [r0 :256] ++ bx lr ++ ++ .ltorg ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++ vmov.i32 d0, #0x4a // 74 ++ vld1.16 {q14, q15}, [r0 :256] // coeffs ++ vmov.i32 d1, #0x1d // 29 ++ vmov.i32 d2, #0x37 // 55 ++ ++ tr4_luma_shift #7 ++ ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q14, q15 ++ ++ tr4_luma_shift #TRN_SHIFT ++ ++ vst4.16 {q14, q15}, [r0 :256] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 ++ add r2, r0, #16 ++ adr r3, tr4f ++ vpush {d8-d15} ++ vld1.16 {d0, d1}, [r3] ++ mov r3, #32 ++ ++ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \ ++ "sub r0, r0, #128-8", \ ++ "sub r2, r2, #128-8", \ ++ "cmp r1, #4" ++ ble 2f ++ ++ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \ ++ "sub r0, r0, #128+8", \ ++ "sub r2, r2, #128+8+16-32", \ ++ "mov r3, #64" ++ ++ vzip.16 d16, d17 ++ vzip.16 d18, d19 ++ ++ vzip.16 d20, d21 ++ vzip.16 d22, d23 ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q10, q11 ++ vzip.32 q14, q15 ++1: ++ vzip.16 d24, d25 ++ vzip.16 d26, d27 ++ vzip.32 q8, q9 ++ vzip.32 q12, q13 ++ ++ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT ++ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT ++ ++ vpop {d8-d15} ++ bx lr ++ ++2: vmov.i64 q10, #0 ++ sub r0, r0, #8 ++ vmov.i64 q11, #0 ++ sub r2, r2, #8+16-32 ++ vmov.i64 q14, #0 ++ mov r3, #64 ++ vmov.i64 q15, #0 ++ ++ vzip.16 d16, d17 ++ vzip.16 d18, d19 ++ ++ b 1b ++ ++endfunc ++ ++#undef DC_SHIFT ++#undef DC_ADD ++#undef TRN_SHIFT ++ +diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S 
b/libavcodec/arm/rpi_hevc_misc_neon.S +new file mode 100644 +index 0000000000..161bb0d7c9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_misc_neon.S +@@ -0,0 +1,267 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Written by John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ rpi_zap_coeff_vals_neon( ++@ uint16_t * buf, [r0] ++@ unsigned int log_n_m2) [r1] ++ ++function rpi_zap_coeff_vals_neon, export=1 ++ mov ip, #1 ++ vmov.i64 q0, #0 ++ teq r1, #0 ++ vmov.i64 q1, #0 ++ beq 2f ++ ++ lsl ip, r1 @ 2, 4 or 8 ++ add r2, r0, #32 ++ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero ++ mov r3, #64 ++1: vst1.8 {q0,q1}, [r0:256], r3 ++ subs ip, #2 ++ vst1.8 {q0,q1}, [r2:256], r3 ++ bne 1b ++ bx lr ++ ++2: vst1.8 {q0,q1}, [r0:256] ++ bx lr ++endfunc ++ ++@ PIC jump tables are more expensive than absolute for A32 code ++.set jent_pic, CONFIG_PIC || CONFIG_THUMB ++ ++@ Jump table entry - if in neon mode the bottom bit must be set ++@ ? There is probably a real asm instruction to do this but I haven't found it ++.macro jent lab ++.if jent_pic ++T .short ((0 + \lab) - (0 + 98b)) / 2 ++A .short (0 + \lab) - (4 + 98b) ++.else ++T .word 1 + \lab ++A .word \lab ++.endif ++.endm ++ ++.set expected_next, 0 ++ ++.macro cpy_compound val, p1, p2, drop_thru=0 ++.if \p1 + \p2 != \val ++.error "Bad addition! 
\p1 + \p2 != \val"
++.endif
++.if expected_next != 0 && expected_next != \val
++.error "Drop thru failure"
++.endif
++\val\():
++ push {r0-r3}
++ bl 100\p1\()b
++ pop {r0-r3}
++ add r0, #\p1
++ add r2, #\p1
++.if \drop_thru == 0
++ b \p2\()b
++.set expected_next, 0
++.else
++.set expected_next, \p2
++.endif
++.endm
++
++@ ff_hevc_cpy_blks8x4_neon(
++@ dst [r0]
++@ dst_stride [r1]
++@ src [r2]
++@ src_stride [r3]
++@ width [sp, #0] (bytes)
++@ height) [sp, #4]
++@
++@ Power of 2 widths are directly coded, all others are done in stripes
++@ We expect the vast majority of calls to be power of 2
++@
++@ Currently has min width of 8, but we could make that 4 without issue
++@ Min height is 4
++
++function ff_hevc_rpi_cpy_blks8x4_neon, export=1
++ ldr r12, [sp, #0]
++ push {r11, lr}
++.if jent_pic
++A adr lr, 98f - 2
++.else
++A adr lr, 98f - 4
++.endif
++ lsr r12, #3
++ ldr r11, [sp, #(8 + 4)]
++.if jent_pic
++A lsl r12, #1
++A ldrsh lr, [lr, r12]
++A add pc, lr
++T tbh [pc, r12, lsl #1]
++.else
++ @ A32 only, Thumb is always PIC
++ ldr pc, [lr, r12, lsl #2]
++.endif
++
++98:
++T .short 0 @ unused
++ jent 8f
++ jent 16f
++ jent 24f
++ jent 32f
++ jent 40f
++ jent 48f
++ jent 56f
++ jent 64f
++ jent 72f
++ jent 80f
++ jent 88f
++ jent 96f
++ jent 104f
++ jent 112f
++ jent 120f
++ jent 128f
++
++1008:
++ push {r11, lr}
++8:
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {d0 }, [r2], r3
++ vld1.32 {d1 }, [lr], r3
++ vld1.32 {d2 }, [r2], r3
++ vld1.32 {d3 }, [lr], r3
++ subs r11, #4
++ vst1.32 {d0 }, [r0], r1
++ vst1.32 {d1 }, [r12], r1
++ vst1.32 {d2 }, [r0], r1
++ vst1.32 {d3 }, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10016:
++ push {r11, lr}
++16:
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {q0 }, [r2], r3
++ vld1.32 {q1 }, [lr], r3
++ vld1.32 {q2 }, [r2], r3
++ vld1.32 {q3 }, [lr], r3
++ subs r11, #4
++ vst1.32 {q0 }, [r0], r1
++ vst1.32 {q1 }, [r12], r1
++ vst1.32 {q2 }, [r0], r1
++ vst1.32 {q3 }, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10032:
++ push {r11, lr}
++32:
++ add lr, r2, r3
++ lsl r3, #1
++ add r12, r0, r1
++ lsl r1, #1
++1:
++ vld1.32 {q8, q9 }, [r2], r3
++ vld1.32 {q10, q11}, [lr], r3
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #4
++ vst1.32 {q8, q9 }, [r0], r1
++ vst1.32 {q10, q11}, [r12], r1
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++10064:
++ push {r11, lr}
++64:
++ add lr, r2, #32
++ add r12, r0, #32
++1:
++ vld1.32 {q8, q9 }, [r2], r3
++ vld1.32 {q10, q11}, [lr], r3
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #2
++ vst1.32 {q8, q9 }, [r0], r1
++ vst1.32 {q10, q11}, [r12], r1
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r11, pc}
++
++128:
++ push {r4, r5}
++ @ We could do this with fewer registers if we jump around but I
++ @ have a primitive urge to load sequentially
++ mov r4, #64
++ add lr, r2, #32
++ add r12, r0, #32
++ sub r3, r4
++ sub r1, r4
++1:
++ vld1.32 {q8, q9 }, [r2], r4
++ vld1.32 {q10, q11}, [lr], r4
++ vld1.32 {q12, q13}, [r2], r3
++ vld1.32 {q14, q15}, [lr], r3
++ subs r11, #1
++ vst1.32 {q8, q9 }, [r0], r4
++ vst1.32 {q10, q11}, [r12], r4
++ vst1.32 {q12, q13}, [r0], r1
++ vst1.32 {q14, q15}, [r12], r1
++ bgt 1b
++ pop {r4, r5, r11, pc}
++
++@ Use drop_thru where we can
++cpy_compound 104, 64, 40, 1
++cpy_compound 40, 32, 8
++
++cpy_compound 112, 64, 48, 1
++cpy_compound 48, 32, 16
++
++cpy_compound 120, 64, 56, 1
++cpy_compound 
56, 32, 24, 1 ++cpy_compound 24, 16, 8 ++ ++cpy_compound 72, 64, 8 ++cpy_compound 80, 64, 16 ++cpy_compound 88, 64, 24 ++cpy_compound 96, 64, 32 ++ ++ ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h +new file mode 100644 +index 0000000000..9d21f6a882 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_misc_neon.h +@@ -0,0 +1,438 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H ++#define AVCODEC_ARM_RPI_HEVC_MISC_H ++ ++#include "config.h" ++#if HAVE_NEON_INLINE && !CONFIG_THUMB ++ ++static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_src) ++{ ++ const uint8_t *src2 = src + stride_src; ++ stride_src <<= 1; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {q0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {q1}, [%[dst]]! \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.32 {q0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.32 {q1}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.16 d0, d1 \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d0}, [%[dst]]! 
\n\t" ++ "beq 3f \n\t" ++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.16 d2, d3 \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d2}, [%[dst]]! \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vzip.16 d0, d1 \n\t" ++ "vst1.16 {d0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vzip.16 d2, d3 \n\t" ++ "vst1.16 {d2}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #8 \n\t" ++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.8 d0, d1 \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.8 d2, d3 \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d2}, [%[dst]]! \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vzip.8 d0, d1 \n\t" ++ "vst1.8 {d0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vzip.8 d2, d3 \n\t" ++ "vst1.8 {d2}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ } ++} ++ ++static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst) ++{ ++ uint8_t *dst2 = dst + stride_dst; ++ stride_dst <<= 1; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "subs %[height], #4 \n\t" ++ "vld1.32 {q0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.32 {q1}, [%[src]]! \n\t" ++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.32 {q0}, [%[src]]! 
\n\t" ++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d1[0]}, [%[dst]] \n\t" ++ "vst1.32 {d1[1]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d3[0]}, [%[dst]] \n\t" ++ "vst1.32 {d3[1]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "subs %[height], #4 \n\t" ++ "vld1.16 {d0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.16 {d2}, [%[src]]! \n\t" ++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.16 {d0}, [%[src]]! \n\t" ++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d0[2]}, [%[dst]] \n\t" ++ "vst1.16 {d0[3]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d2[2]}, [%[dst]] \n\t" ++ "vst1.16 {d2[3]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "subs %[height], #8 \n\t" ++ "vld1.8 {d0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.8 {d2}, [%[src]]! \n\t" ++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.8 {d0}, [%[src]]! 
\n\t" ++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[6]}, [%[dst]] \n\t" ++ "vst1.8 {d0[7]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[6]}, [%[dst]] \n\t" ++ "vst1.8 {d2[7]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ } ++} ++ ++static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int x, y; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "ldr %[x], [%[src]], %[stride_src] \n\t" ++ "ldr %[y], [%[src]], %[stride_src] \n\t" ++ "str %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldr %[x], [%[src]], %[stride_src] \n\t" ++ "str %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldr %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "str %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "str %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "ldrh %[x], [%[src]], %[stride_src] \n\t" ++ "ldrh %[y], [%[src]], %[stride_src] \n\t" ++ "strh %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldrh %[x], [%[src]], %[stride_src] \n\t" ++ "strh %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldrh %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "strh %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "strh %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "ldrb %[x], [%[src]], %[stride_src] \n\t" ++ "ldrb %[y], [%[src]], %[stride_src] \n\t" ++ "strb %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldrb %[x], [%[src]], %[stride_src] \n\t" ++ "strb %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldrb %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "strb %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "strb %[y], [%[dst]] \n\t" ++ : 
// Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ } ++} ++ ++#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon ++static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ if (stride_dst == 1 << pixel_shift) ++ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src); ++ else if (stride_src == 1 << pixel_shift) ++ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); ++ else ++ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src); ++} ++ ++#endif /* HAVE_NEON_INLINE */ ++ ++#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */ +diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h +new file mode 100644 +index 0000000000..325c26a49b +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_mv_arm.h +@@ -0,0 +1,93 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Written by John Cox, Ben Avison ++*/ ++ ++#ifndef AVCODEC_ARM_RPI_HEVC_MV_H ++#define AVCODEC_ARM_RPI_HEVC_MV_H ++ ++#if HAVE_ARMV6T2_INLINE ++static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) ++{ ++ MvXY r; ++ __asm__ ( ++ "sadd16 %[r], %[a], %[b] \n\t" ++ : [r]"=r"(r) ++ : [a]"r"(a), ++ [b]"r"(b) ++ : ++ ); ++ return r; ++} ++#define mvxy_add mvxy_add_arm ++#endif ++ ++#if HAVE_ARMV6T2_INLINE ++#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV)) ++static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) ++{ ++ int t; ++ __asm__ ( ++ "ssat %[td], #8, %[td] \n\t" ++ "ssat %[tb], #8, %[tb] \n\t" ++ "eor %[t], %[td], %[td], asr #31 \n\t" ++ "adds %[t], %[t], %[td], lsr #31 \n\t" ++ "asr %[t], #1 \n\t" ++ "add %[t], #0x4000 \n\t" ++ "it ne \n\t" ++ "sdivne %[t], %[t], %[td] \n\t" ++ "mov %[td], #32 \n\t" ++ "smlabb %[td], %[t], %[tb], %[td] \n\t" ++ "ssat %[td], #13, %[td], asr #6 \n\t" ++ "mov %[tb], #127 \n\t" ++ "smlatb %[t], %[xy], %[td], %[tb] \n\t" ++ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" ++// This takes the sign of x & y for rounding at the "wrong" point ++// (i.e. after adding 127) but for the range of values (-1,-127) ++// where it does the wrong thing you get the right answer (0) anyway ++ "add %[t], %[t], %[t], lsr #31 \n\t" ++ "add %[xy], %[tb], %[tb], lsr #31 \n\t" ++ "ssat %[t], #16, %[t], asr #8 \n\t" ++ "ssat %[xy], #16, %[xy], asr #8 \n\t" ++ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" ++ : ++ [t]"=&r"(t), ++ [xy]"+r"(xy), ++ [td]"+r"(td), ++ [tb]"+r"(tb) ++ : ++ : ++ "cc" ++ ); ++ return xy; ++} ++#define mv_scale_xy mv_scale_xy_arm ++#endif ++#endif ++ ++#endif // AVCODEC_ARM_RPI_HEVC_MV_H ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h +new file mode 100644 +index 0000000000..62b9326532 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_arm.h +@@ -0,0 +1,26 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVCDSP_ARM_H ++#define AVCODEC_ARM_HEVCDSP_ARM_H ++ ++#include "libavcodec/rpi_hevcdsp.h" ++ ++void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth); ++ ++#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ +diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S +new file mode 100644 +index 0000000000..88a3b4e5e7 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S +@@ -0,0 +1,1634 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. 
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
++        vsubl.u8   q0, \Q0a, \P0a
++        vsubl.u8   q1, \P1a, \Q1a
++        vdup.16    d4, r2
++        \I1
++        vshl.i16   q0, #2
++        \I2
++        vadd.i16   q0, q1
++        \I3
++        vmovl.u8   q2, d4
++        \I4
++        vneg.s16   q1, q2
++        \I5
++        vrshr.s16  q0, #3
++        \I6
++        \I7
++        \I8
++        vmin.s16   q0, q2
++        vmovl.u8   q2, \Q0a
++        vmax.s16   q0, q1
++        vaddw.u8   q1, q0, \P0a
++        vsub.i16   q0, q2, q0
++        vqmovun.s16 \P0a, q1
++        vqmovun.s16 \Q0a, q0
++.endm
++
++
++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
++        vsubl.u8   q0, \Q0a, \P0a   @ q0a - p0a
++        lsr        r12, r2, #16
++        vsubl.u8   q1, \Q0b, \P0b   @ q0b - p0b
++        vsubl.u8   q2, \P1a, \Q1a   @ p1a - q1a
++        vsubl.u8   q3, \P1b, \Q1b   @ p1b - q1b
++        vshl.i16   q0, #2           @ (q0a - p0a) * 4
++        vshl.i16   q1, #2           @ (q0b - p0b) * 4
++        vadd.i16   q0, q2           @ ((q0a - p0a) * 4) + p1a - q1a
++        vadd.i16   q1, q3           @ ((q0b - p0b) * 4) + p1b - q1b
++        vdup.16    d4, r2           @ tc0a, tc0b
++        vdup.16    d6, r12          @ tc1a, tc1b
++        vrshr.s16  q0, #3           @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++        \I1
++        vrshr.s16  q1, #3           @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++        \I2
++        vmovl.u8   q2, d4           @ tc0a, tc0b
++        \I3
++        vmovl.u8   q3, d6           @ tc1a, tc1b
++        \I4
++        vmin.s16   q0, q2
++        \I5
++        vneg.s16   q2, q2           @ -tc0a, -tc0b
++        \I6
++        vmin.s16   q1, q3
++        \I7
++        vneg.s16   q3, q3           @ -tc1a, -tc1b
++        vmax.s16   q0, q2           @ delta0a
++        vmovl.u8   q2, \Q0a
++        vmax.s16   q1, q3           @ delta0b
++        vaddw.u8   q3, q0, \P0a     @ p0a + delta0a
++        vsub.i16   q0, q2, q0       @ q0a - delta0a
++        vmovl.u8   q2, \Q0b
++        vsub.i16   q2, q1           @ q0b - delta0b
++        vaddw.u8   q1, \P0b         @ p0b + delta0b
++        vqmovun.s16 \Q0a, q0
++        vqmovun.s16 \P0a, q3
++        vqmovun.s16 \Q0b, q2
++        vqmovun.s16 \P0b, q1
++.endm
++
++
++@ Preserves r12
++@ Clobbers r2
++@ P0a et al all contain UVUVUVUV
++@ r2 (tc4) contains
++@   [0..7]   tc U a
++@   [8..15]  tc V a
++
++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
++        vsub.i16   q0, \Q0a, \P0a
++        vsub.i16   q1, \P1a, \Q1a
++        vdup.16    d4, r2
++        \I1
++        vshl.i16   q0, #2
++        \I2
++        vadd.i16   q0, q1
++        \I3
++        vshll.u8   q2, d4, #\bit_depth - 8
++        \I4
++        vneg.s16   q1, q2
++        \I5
++        vrshr.s16  q0, #3
++        \I6
++        \I7
++        \I8
++        vmin.s16   q0, q2
++        vmov.i16   q2, #0
++        vmax.s16   q0, q1
++        vadd.i16   \P0a, q0
++        vsub.i16   \Q0a, q0
++        vmov.i16   q1, #(1 << \bit_depth) - 1
++        vmax.s16   \P0a, q2
++        vmax.s16   \Q0a, q2
++        vmin.s16   \P0a, q1
++        vmin.s16   \Q0a, q1
++.endm
++
++@ Clobbers r2, r12
++@ P0a et al all contain UVUVUVUV
++@ r2 (tc4) contains
++@   [0..7]   tc U a
++@   [8..15]  tc V a
++@   [16..23] tc U b
++@   [24..31] tc V b
++
++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
++        vsub.i16   q0, \Q0a, \P0a   @ q0a - p0a
++        lsr        r12, r2, #16
++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b ++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a ++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b ++ \I3 ++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b ++ \I4 ++ vmin.s16 q0, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 ++ vmin.s16 q1, q3 ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vadd.i16 \P0a, q0 @ p0a + delta0a ++ vsub.i16 \Q0a, q0 @ q0a - delta0a ++ vmax.s16 q1, q3 @ delta0b ++ vadd.i16 \P0b, q1 @ p0b + delta0b ++ vsub.i16 \Q0b, q1 @ q0b - delta0b ++ vmov.i16 q2, #0 ++ vmov.i16 q3, #(1 << \bit_depth) - 1 ++ vmax.s16 \P0a, q2 ++ vmax.s16 \Q0a, q2 ++ vmax.s16 \P0b, q2 ++ vmax.s16 \Q0b, q2 ++ vmin.s16 \P0a, q3 ++ vmin.s16 \Q0a, q3 ++ vmin.s16 \P0b, q3 ++ vmin.s16 \Q0b, q3 ++.endm ++ ++ ++ ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++.macro hevc_loop_filter_luma_start ++ ldr r12, [r3] ++ ldr r3, [r3, #4] ++ orrs r3, r12, r3, lsl #16 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldrd r4, r5, [sp, #32] @ &_no_p ++ ldrb r4, [r4] ++ ldrb r5, [r5] ++ movs r10, r4 ++ it ne ++ movne r10, #1 ++ cmp r5, #0 ++ it ne ++ orrne r10, #2 ++.endm ++ ++@ Input: ++@ r2 beta (raw: needs shift for bitdepth > 8) ++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) ++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) ++@ ++@ Input & output ++@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3) ++@ 16-bit: q8-q15 ++@ ++@ r1 -r1 ++@ r10 b1->C, b0->N (r10 junk) ++@ ++@ Junks: ++@ r5, r6, r7, r8, r9 ++ ++.macro m_filter_luma bit_depth, Q11, Q15 ++.if \bit_depth == 8 ++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 ++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1 ++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 ++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 ++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 ++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 ++.endif ++ vadd.i16 q0, q9, \Q11 @ P2 + P0 ++.if \bit_depth > 8 ++ lsl r3, r3, #(\bit_depth - 8) ++.endif ++ vadd.i16 q1, q14, q12 @ Q2 + Q0 ++.if \bit_depth > 8 ++ lsl r2, r2, #(\bit_depth - 8) ++.endif ++ vsub.i16 q0, q10 @ P2 - P1 + P0 ++ lsr r5, r3, #16 ++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0 ++.if \bit_depth == 8 ++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3 ++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3 ++.endif ++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) ++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) ++ vmov.i64 q2, #0xffffffff0000 ++ vbic q0, q2 @ only dp0(') and dp3(') ++ vbic q1, q2 @ only dq0(') and dq3(') ++ vsra.u64 q0, #16 ++ vsra.u64 q1, #16 ++ vdup.16 q3, r2 @ beta ++ vdup.16 d14, r3 @ tC[0] ++ vdup.16 d15, r5 @ tC[1] ++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0) ++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 ++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 ++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 ++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) ++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... 
Q3-Q0) ++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 ++ vshl.s16 q6, q7, #2 @ tC[] * 4 ++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 ++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) ++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) ++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 ++ cmp r7, #0 ++ beq .Lbypasswrite ++ ++ vcgt.s16 q5, q6, q5 @ if < tc25 ++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) ++ vand q4, q5 ++ vbic d8, d4 ++ vbic d9, d4 ++ vshr.s16 q3, #2 @ beta_2 = beta >> 2 ++ vsra.u64 q4, #16 ++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 ++ vshl.i16 q7, #1 @ tc2 = tC[] << 1 ++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc ++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half ++ vand d6, d8 @ && beta_2 tests, prime in ms half ++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 ++ vneg.s16 q6, q7 @ -tc2 ++ vmovn.i32 d8, q3 ++ vshrn.i32 d6, q3, #16 ++ vand d6, d8 ++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 ++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) ++ vadd.i16 q0, \Q11, q12 @ p0 + q0 ++ ands r9, r7, r8 ++ beq 1f ++ ++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0 ++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1 ++ lsr r3, r9, #16 ++ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) ++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) ++ vadd.i16 q0, q8, q9 @ p3 + p2 ++ vadd.i16 q5, \Q15, q14 @ q2 + q3 ++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 ++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 ++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 ++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 ++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping) ++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) ++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) ++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) ++ vrshr.s16 q0, #3 @ scale, with rounding ++ vrshr.s16 q5, #3 ++ vrshr.s16 q1, #2 ++ vrshr.s16 q4, #2 ++ vrshr.s16 q2, #3 ++ vrshr.s16 q3, #3 ++ vsub.i16 q0, q9 @ find difference ++ vsub.i16 q5, q14 ++ vsub.i16 q1, q10 ++ vsub.i16 q4, q13 ++ vsub.i16 q2, \Q11 ++ vsub.i16 q3, q12 ++ vmax.s16 q0, q6 @ clip difference to -tc2 .. 
tc2 ++ vmax.s16 q5, q6 ++ vmax.s16 q1, q6 ++ vmax.s16 q4, q6 ++ vmax.s16 q2, q6 ++ vmax.s16 q3, q6 ++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure ++ vdup.16 d13, r3 ++ vmin.s16 q0, q7 ++ vmin.s16 q5, q7 ++ vmin.s16 q1, q7 ++ vmin.s16 q4, q7 ++ vmin.s16 q2, q7 ++ vmin.s16 q3, q7 ++ vadd.i16 q0, q9 @ apply difference ++ vadd.i16 q5, q14 ++ vadd.i16 q1, q10 ++ vadd.i16 q4, q13 ++ vadd.i16 q2, \Q11 ++ vadd.i16 q3, q12 ++ vbit q9, q0, q6 @ apply filtered values according to mask ++ vbit q14, q5, q6 ++ vbit q10, q1, q6 ++ vbit q13, q4, q6 ++ vbit \Q11, q2, q6 ++ vbit q12, q3, q6 ++ vneg.s16 q6, q7 @ restore -tc2 ++ ++1: ++ bics r9, r7, r8 ++ beq 2f ++ ++ vsub.i16 q0, q12, \Q11 @ q0 - p0 ++ vsub.i16 q1, q13, q10 @ q1 - p1 ++ lsr r3, r9, #16 ++ vshl.i16 q2, q0, #3 ++ lsr r7, r5, #16 ++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) ++ lsr r8, r6, #16 ++ vshl.i16 q2, q1, #1 ++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) ++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1 ++ vsub.i16 q5, q3, q4 ++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 ++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 ++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 ++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 ++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 ++ vmax.s16 q6, q5 @ ++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 ++ vdup.16 q0, r2 @ beta ++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] ++ vshr.s16 q4, #1 @ tc_2 = tc >> 1 ++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1 ++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 ++ vshr.s16 q2, q0, #1 @ beta >> 1 ++ vadd.i16 q2, q0 @ beta + (beta >> 1) ++ vneg.s16 q0, q4 @ -tc_2 ++ vabs.s16 q5, q5 @ abs(original delta0) ++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 ++ vmax.s16 q1, q0 ++ vmax.s16 q3, q0 ++ vshl.s16 q0, q7, #2 @ 8 * tc ++ vadd.i16 q7, q0 @ 10 * tc ++ vdup.16 d0, r9 ++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering ++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2) ++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 + delta0) >> 1, -tc_2, tc_2) ++ vdup.16 d8, r5 @ dp0 + dp3 ++ vdup.16 d9, r7 @ dp0' + dp3' ++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) ++ vdup.16 d10, r6 @ dq0 + dq3 ++ vdup.16 d11, r8 @ dq0' + dq3' ++ vand q7, q0 @ AND block and line masks ++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) ++ vadd.i16 q0, q1, q10 @ p1 + deltap1 ++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. 
if (nd_q > 1)
++        vadd.i16   q3, q3, q13     @ q1 + deltaq1
++        vadd.i16   q1, \Q11, q6    @ p0 + delta0
++        vsub.i16   q2, q12, q6     @ q0 - delta0
++        vand       q4, q7          @ AND nd_p test with block/line masks
++        vand       q5, q7          @ AND nd_q test with block/line masks
++        vbit       q10, q0, q4
++        vbit       \Q11, q1, q7
++        vbit       q12, q2, q7
++        vbit       q13, q3, q5
++
++2:
++.if \bit_depth == 8
++        vmovn.i16  d16, q8
++        vmovn.i16  d23, \Q15
++        neg        r1, r1
++        vqmovun.s16 d17, q9
++        vqmovun.s16 d18, q10
++        vqmovun.s16 d19, \Q11
++        lsls       r10, #31
++        vqmovun.s16 d20, q12
++        vqmovun.s16 d21, q13
++        vqmovun.s16 d22, q14
++.else
++        vmov.i16   q0, #0
++        vmov.i16   q1, #(1 << \bit_depth - 1)
++        @ q8 & q15 should be unaltered and so don't require clipping
++        neg        r1, r1
++        vmax.s16   q9, q0
++        vmax.s16   q10, q0
++        vmax.s16   q11, q0
++        vmax.s16   q12, q0
++        vmax.s16   q13, q0
++        vmax.s16   q14, q0
++        lsls       r10, #31
++        vmin.s16   q9, q1
++        vmin.s16   q10, q1
++        vmin.s16   q11, q1
++        vmin.s16   q12, q1
++        vmin.s16   q13, q1
++        vmin.s16   q14, q1
++.endif
++        bx         lr
++.endm
++
++function hevc_loop_filter_luma_body
++        m_filter_luma 8, q15, q11
++endfunc
++
++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
++@            uint8_t *_pix,     [r0]
++@            ptrdiff_t _stride, [r1]
++@            int _beta,         [r2]
++@            int *_tc,          [r3]
++@            uint8_t *_no_p,    [sp+0]
++@            uint8_t *_no_q)    [sp+4]
++
++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
++        hevc_loop_filter_luma_start
++
++        sub        r4, r0, #4
++        b          .Lv_loop_luma_common
++endfunc
++
++@ void ff_hevc_rpi_v_loop_filter_luma2_neon_8(
++@            uint8_t * pix_r,   [r0]
++@            ptrdiff_t _stride, [r1]
++@            int _beta,         [r2]
++@            int tc2,           [r3]
++@            int no_f,          [sp+0]
++@            uint8_t * pix_l)   [sp+4]
++
++function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1
++        cmp        r3, #0
++        it         eq
++        bxeq       lr
++        push       {r4-r10,lr}     @ 32 bytes
++        ldr        r4, [sp, #36]
++        ldr        r10, [sp, #32]
++
++.Lv_loop_luma_common:
++        vpush      {d8-d15}
++
++        @ It's slightly faster to do unlaned loads and transpose in the
++        @ 8-bit case, even though it needs more instructions, because
++        @ VLD4.8 is a really slow way to read from memory.
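++        @ The uzp/swp sequence below leaves each D reg holding one pixel
++        @ column across the 8 rows: d16..d19 = P3..P0, d20..d23 = Q0..Q3,
++        @ the order hevc_loop_filter_luma_body expects its input in.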
++ vld1.32 {d16[0]}, [r4:32], r1 ++ vld1.32 {d20[0]}, [r0:32], r1 ++ vld1.32 {d16[1]}, [r4:32], r1 ++ vld1.32 {d20[1]}, [r0:32], r1 ++ vld1.32 {d17[0]}, [r4:32], r1 ++ vld1.32 {d21[0]}, [r0:32], r1 ++ vld1.32 {d17[1]}, [r4:32], r1 ++ vld1.32 {d21[1]}, [r0:32], r1 ++ vld1.32 {d18[0]}, [r4:32], r1 ++ vld1.32 {d22[0]}, [r0:32], r1 ++ vld1.32 {d18[1]}, [r4:32], r1 ++ vld1.32 {d22[1]}, [r0:32], r1 ++ vld1.32 {d19[0]}, [r4:32], r1 ++ vld1.32 {d23[0]}, [r0:32], r1 ++ vld1.32 {d19[1]}, [r4:32] ++ vld1.32 {d23[1]}, [r0:32] ++ vuzp.16 q8, q9 ++ vuzp.16 q10, q11 ++ vuzp.8 q8, q9 ++ vuzp.8 q10, q11 ++ vswp d17, d18 ++ vswp d21, d22 ++ ++ bl hevc_loop_filter_luma_body ++ ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ ++ @ no_p[1] ++ bmi 1f ++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 ++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 ++ ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] ++1: ++ @ no_q[1] ++ bcs 1f ++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 ++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 ++ ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] ++1: ++ pop {r4-r10,pc} ++ ++.Lbypasswrite: ++ vpop {d8-d15} ++ pop {r4-r10,pc} ++endfunc ++ ++.macro m_filter_v_luma_16 bit_depth ++ vpush {d8-d15} ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. 
This also means that we can use the same code for
++        @ both split & unsplit deblock
++        vld4.16    {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
++        vld4.16    {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
++
++        vld4.16    {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
++        vld4.16    {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
++
++        vld4.16    {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
++        vld4.16    {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
++
++        vld4.16    {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
++        vld4.16    {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
++
++        vld4.16    {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
++        vld4.16    {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
++
++        vld4.16    {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
++        vld4.16    {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
++
++        vld4.16    {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
++        vld4.16    {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
++
++        vld4.16    {d17[3], d19[3], d21[3], d23[3]}, [r4]
++        vld4.16    {d25[3], d27[3], d29[3], d31[3]}, [r0]
++
++        bl         hevc_loop_filter_luma_body_\bit_depth
++
++        add        r6, r4, r1
++        add        r2, r0, r1
++        lsl        r1, #1
++
++        vpop       {d8-d15}
++
++        @ p[1]
++        bmi        1f
++        vst4.16    {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
++        vst4.16    {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
++        vst4.16    {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
++        vst4.16    {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
++        vst4.16    {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
++        vst4.16    {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
++        vst4.16    {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
++        vst4.16    {d16[0], d18[0], d20[0], d22[0]}, [r6]
++1:
++        @ q[1]
++        bcs        1f
++        vst4.16    {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
++        vst4.16    {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
++        vst4.16    {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
++        vst4.16    {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
++        vst4.16    {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
++        vst4.16    {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
++        vst4.16    {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
++        vst4.16    {d24[0], d26[0], d28[0], d30[0]}, [r2]
++1:
++        pop        {r4-r10,pc}
++.endm
++
++
++
++
++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix,    [r0]
++@                                 ptrdiff_t stride, [r1]
++@                                 int beta,         [r2]
++@                                 int32_t *tc,      [r3]
++@                                 uint8_t *no_p,    sp[0]
++@                                 uint8_t *no_q);   sp[4]
++@
++@ Src should always be on 8 byte boundary & all in the same slice
++
++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
++        hevc_loop_filter_luma_start
++        b          .Lh_loop_filter_luma_common_8
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1
++        cmp        r3, #0
++        it         eq
++        bxeq       lr
++        push       {r4-r10,lr}     @ 32 bytes
++        ldr        r10, [sp, #32]
++
++.Lh_loop_filter_luma_common_8:
++        sub        r4, r0, r1, lsl #2
++        add        r0, r4, r1
++        lsl        r1, #1
++        vpush      {d8-d15}
++
++        vld1.8     {d16}, [r4], r1
++        vld1.8     {d17}, [r0], r1
++        vld1.8     {d18}, [r4], r1
++        vld1.8     {d19}, [r0], r1
++        vld1.8     {d20}, [r4], r1
++        vld1.8     {d21}, [r0], r1
++        vld1.8     {d22}, [r4]
++        vld1.8     {d23}, [r0]
++
++        bl         hevc_loop_filter_luma_body
++
++        add        r0, r0, r1, lsl #1
++        add        r2, r4, r1, lsl #1
++        add        r6, r4, r1, asr #1
++        vpop       {d8-d15}
++
++        @ P2-P0
++        bcs        1f
++        vst1.8     {d22}, [r4], r1
++        vst1.8     {d21}, [r6]
++        vst1.8     {d20}, [r4]
++1:
++        @ Q0-Q2
++        bmi        1f
++        vst1.8     {d19}, [r0], r1
++        vst1.8     {d18}, [r2]
++        vst1.8     {d17}, [r0]
++1:
++        pop        {r4-r10,pc}
++endfunc
++
++
++.macro m_filter_h_luma_16 bit_depth
++        sub        r4, r0, r1, lsl #2
++        add        r0, r4, r1
++        lsl        r1, #1
++        vpush      {d8-d15}
++
++        vld1.16    { q8}, [r4], r1
++        vld1.16    { q9}, [r0], r1
++        vld1.16    {q10}, [r4], r1
++        vld1.16    {q11}, [r0], r1
++        vld1.16    {q12}, [r4], r1
++        vld1.16    {q13}, [r0], r1
++        vld1.16    {q14}, [r4]
++        vld1.16
{q15}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 ++ vpop {d8-d15} ++ ++ @ P2-P0 ++ bcs 1f ++ vst1.16 {q14}, [r4], r1 ++ vst1.16 {q13}, [r6] ++ vst1.16 {q12}, [r4] ++1: ++ bmi 1f ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r2] ++ vst1.16 { q9}, [r0] ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ ++@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no_f ++@ 0 tl P0 ++@ 1 tr P1 ++@ 2 bl Q0 ++@ 3 br Q1 ++@ ++@ Probably not worth having the P/Qa only special case in this direction ++@ Given layout we won't save any memory reads or avoid any cache dirtying ++@ We would save a bit of computation but I expect the partials to be less ++@ common in the H direction than V due to how we arrange deblock. ++ ++function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 ++ sub r12, r0, r1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ vld1.8 {d26,d27}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.8 {d18,d19}, [r12], r1 ++ vld1.8 {d16,d17}, [r0], r1 ++ vld1.8 {d28,d29}, [r12] ++ ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ ++ "sub r12, r0, r1, asr #1" ++ ++ lsls r3, #29 @ b2 -> N, b3 -> C ++ it pl ++ vstrpl d26, [r0, #0] ++ it cc ++ vstrcc d27, [r0, #8] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ it pl ++ vstrpl d18, [r12, #0] ++ it cc ++ vstrcc d19, [r12, #8] ++ bx lr ++ ++endfunc ++ ++ ++@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ ++@ Macro here actual function near bottom ++ ++.macro m_filter_h_uv_16 bit_depth ++ sub r12, r0, r1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ vld1.16 {q12, q13}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.16 {q10, q11}, [r12], r1 ++ vld1.16 {q8, q9 }, [r0], r1 ++ vld1.16 {q14, q15}, [r12] ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ ++ "sub r12, r0, r1, asr #1", \ ++ "cmp r3, #0" ++ ++ bne 1f ++ vst1.16 {q10, q11}, [r12] ++ vst1.16 {q12, q13}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: ++ lsls r3, #29 @ b2 -> N, b3 -> C ++ itt pl ++ vstrpl d24, [r0, #0] ++ vstrpl d25, [r0, #8] ++ itt cc ++ vstrcc d26, [r0, #16] ++ vstrcc d27, [r0, #24] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ itt pl ++ vstrpl d20, [r12, #0] ++ vstrpl d21, [r12, #8] ++ itt cc ++ vstrcc d22, [r12, #16] ++ vstrcc d23, [r12, #24] ++ bx lr ++.endm ++ ++ ++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no_f: ++@ 0 tl P0 ++@ 1 tr Q0 ++@ 2 bl P1 ++@ 3 br Q1 ++ ++function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ push {lr} ++ vld2.16 {d16[0], d18[0]}, [r3], r1 ++ vld2.16 {d20[0], d22[0]}, [r0], r1 ++ ++ cmp r2, #0x10000 ++ vld2.16 {d16[1], d18[1]}, [r3], r1 ++ vld2.16 {d20[1], d22[1]}, [r0], r1 ++ ++ vld2.16 {d16[2], d18[2]}, [r3], r1 ++ vld2.16 {d20[2], d22[2]}, [r0], r1 ++ ++ vld2.16 {d16[3], d18[3]}, [r3], r1 ++ vld2.16 {d20[3], d22[3]}, [r0], r1 ++ blo 10f ++ ++ vld2.16 {d17[0], d19[0]}, [r3], r1 ++ vld2.16 {d21[0], d23[0]}, [r0], r1 ++ ++ sub ip, r0, r3 ++ vld2.16 {d17[1], d19[1]}, [r3], r1 ++ vld2.16 {d21[1], d23[1]}, [r0], r1 ++ ++ cmp ip, #4 ++ vld2.16 {d17[2], 
d19[2]}, [r3], r1 ++ vld2.16 {d21[2], d23[2]}, [r0], r1 ++ ++ vld2.16 {d17[3], d19[3]}, [r3] ++ vld2.16 {d21[3], d23[3]}, [r0] ++ ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #2", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b ++ vst2.16 {d19[2], d21[2]}, [ip], r1 ++ vst2.16 {d19[1], d21[1]}, [r3], r1 ++ vst2.16 {d19[0], d21[0]}, [ip], r1 ++ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a ++ vst2.16 {d18[2], d20[2]}, [ip], r1 ++ vst2.16 {d18[1], d20[1]}, [r3] ++ vst2.16 {d18[0], d20[0]}, [ip] ++ pop {pc} ++ ++@ Either split or partial ++1: ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 ++ bcs 1f ++ @ Q0b ++ vst1.16 {d21[3]}, [r0], r1 ++ vst1.16 {d21[2]}, [r2], r1 ++ vst1.16 {d21[1]}, [r0], r1 ++ vst1.16 {d21[0]}, [r2], r1 ++1: ++ ittt mi ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f ++ @ P0b ++ vst1.16 {d19[3]}, [r3], r1 ++ vst1.16 {d19[2]}, [ip], r1 ++ vst1.16 {d19[1]}, [r3], r1 ++ vst1.16 {d19[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f ++ @ Q0a ++ vst1.16 {d20[3]}, [r0], r1 ++ vst1.16 {d20[2]}, [r2], r1 ++ vst1.16 {d20[1]}, [r0] ++ vst1.16 {d20[0]}, [r2] ++1: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.16 {d18[3]}, [r3], r1 ++ vst1.16 {d18[2]}, [ip], r1 ++ vst1.16 {d18[1]}, [r3] ++ vst1.16 {d18[0]}, [ip] ++ pop {pc} ++ ++@ Single lump (rather than double) ++10: ++ @ As we have post inced r0/r3 in the load the easiest thing to do is ++ @ to subtract and write forwards, rather than backwards (as above) ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #2", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ ++ "add ip, r3, r1", \ ++ "lsl r1, #1" ++ ++ bcs 3f ++ @ Q0a ++ vst1.16 {d20[0]}, [r0], r1 ++ vst1.16 {d20[1]}, [r2], r1 ++ vst1.16 {d20[2]}, [r0] ++ vst1.16 {d20[3]}, [r2] ++3: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.16 {d18[0]}, [r3], r1 ++ vst1.16 {d18[1]}, [ip], r1 ++ vst1.16 {d18[2]}, [r3] ++ vst1.16 {d18[3]}, [ip] ++ pop {pc} ++ ++endfunc ++ ++ ++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++ ++@ no_f ++@ 0 tl P0a ++@ 1 tr Q0a ++@ 2 bl P0b ++@ 3 br Q0b ++ ++@ P1: q8, q12 ++@ P0: q9, q13 ++@ Q0: q10, q14 ++@ Q1: q11, q15 ++ ++.macro m_filter_v_uv2_16 bit_depth ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ push {lr} ++ vld2.32 {d16[0], d18[0]}, [r3], r1 ++ vld2.32 {d20[0], d22[0]}, [r0], r1 ++ ++ cmp r2, #0x10000 ++ vld2.32 {d16[1], d18[1]}, [r3], r1 ++ vld2.32 {d20[1], d22[1]}, [r0], r1 ++ ++ vld2.32 {d17[0], d19[0]}, [r3], r1 ++ vld2.32 {d21[0], d23[0]}, [r0], r1 ++ ++ vld2.32 {d17[1], d19[1]}, [r3], r1 ++ vld2.32 {d21[1], d23[1]}, [r0], r1 ++ blo 10f ++ ++ vld2.32 {d24[0], d26[0]}, [r3], r1 ++ vld2.32 {d28[0], d30[0]}, [r0], r1 ++ ++ sub ip, r0, r3 ++ vld2.32 {d24[1], d26[1]}, [r3], r1 ++ vld2.32 {d28[1], d30[1]}, [r0], r1 ++ ++ cmp ip, #8 ++ vld2.32 {d25[0], d27[0]}, [r3], r1 ++ vld2.32 {d29[0], d31[0]}, [r0], r1 ++ ++ vld2.32 {d25[1], d27[1]}, [r3] ++ vld2.32 {d29[1], d31[1]}, [r0] 
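++
++        @ The quoted scalar instructions passed below are the body macro's
++        @ I1..I7 arguments; the macro interleaves them between its NEON
++        @ ops, presumably to overlap scalar setup with filter arithmetic.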
++ ++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #4", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 8 and no_f == 0 ++@ so it is worth having this special case ++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b ++ vst2.32 {d27[0], d29[0]}, [ip], r1 ++ vst2.32 {d26[1], d28[1]}, [r3], r1 ++ vst2.32 {d26[0], d28[0]}, [ip], r1 ++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a ++ vst2.32 {d19[0], d21[0]}, [ip], r1 ++ vst2.32 {d18[1], d20[1]}, [r3] ++ vst2.32 {d18[0], d20[0]}, [ip] ++ pop {pc} ++ ++@ Either split or partial ++1: ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 ++ bcs 1f ++ @ Q0b ++ vst1.32 {d29[1]}, [r0], r1 ++ vst1.32 {d29[0]}, [r2], r1 ++ vst1.32 {d28[1]}, [r0], r1 ++ vst1.32 {d28[0]}, [r2], r1 ++1: ++ ittt mi ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f ++ @ P0b ++ vst1.32 {d27[1]}, [r3], r1 ++ vst1.32 {d27[0]}, [ip], r1 ++ vst1.32 {d26[1]}, [r3], r1 ++ vst1.32 {d26[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f ++ @ Q0a ++ vst1.32 {d21[1]}, [r0], r1 ++ vst1.32 {d21[0]}, [r2], r1 ++ vst1.32 {d20[1]}, [r0] ++ vst1.32 {d20[0]}, [r2] ++1: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.32 {d19[1]}, [r3], r1 ++ vst1.32 {d19[0]}, [ip], r1 ++ vst1.32 {d18[1]}, [r3] ++ vst1.32 {d18[0]}, [ip] ++ pop {pc} ++ ++@ Single lump (rather than double) ++10: ++ @ As we have post inced r0/r3 in the load the easiest thing to do is ++ @ to subtract and write forwards, rather than backwards (as above) ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #4", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ ++ "add ip, r3, r1", \ ++ "lsl r1, #1" ++ ++ bcs 3f ++ @ Q0a ++ vst1.32 {d20[0]}, [r0], r1 ++ vst1.32 {d20[1]}, [r2], r1 ++ vst1.32 {d21[0]}, [r0] ++ vst1.32 {d21[1]}, [r2] ++3: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.32 {d18[0]}, [r3], r1 ++ vst1.32 {d18[1]}, [ip], r1 ++ vst1.32 {d19[0]}, [r3] ++ vst1.32 {d19[1]}, [ip] ++ pop {pc} ++.endm ++ ++ ++@ The NEON version is faster under ideal circumstances (i.e. everything in L1) ++@ But in real world testing it is ~20% slower, presumably due to code size ++ ++#if 0 // NEON version ++ ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc0, int in_inc1) ++ */ ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 ++ mov ip, sp ++ push {a1-a3,v1-v8,lr} ++ ldm ip, {v1-v6} ++ cmp a1, #2 ++ bls 2f ++ vpush {d8-d13} ++ sub v5, v5, #10 ++ sub v6, v6, #10 ++1: ++ vld2.32 {d0[0], d2[0]}, [a3]! ++ vld2.32 {d4[0], d6[0]}, [a4]! ++ vmov.u8 q12, #0 ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[0]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[0]}, [ip] ++ vld1.32 {d18[0]}, [v8] ++ vld1.32 {d22[0]}, [lr] ++ ++ vld2.32 {d0[1], d2[1]}, [a3]! ++ vld2.32 {d4[1], d6[1]}, [a4]! 
++ ldrb a2, [a3], #1 ++ vmov.u16 d12, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d13, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d27, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[2]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[1]}, [ip] ++ vld1.32 {d18[1]}, [v8] ++ vld1.32 {d22[1]}, [lr] ++ ++ vld2.32 {d1[0], d3[0]}, [a3]! ++ vld2.32 {d5[0], d7[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[4]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[4]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[0]}, [ip] ++ vld1.32 {d19[0]}, [v8] ++ vld1.32 {d23[0]}, [lr] ++ ++ vld2.32 {d1[1], d3[1]}, [a3]! ++ vld2.32 {d5[1], d7[1]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[6]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[6]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[1]}, [ip] ++ vld1.32 {d19[1]}, [v8] ++ vld1.32 {d23[1]}, [lr] ++ ++ @ So now we have: ++ @ q0.32[i] = curr[i].mv[0] ++ @ q1.32[i] = curr[i].mv[1] ++ @ q2.32[i] = neigh[i].mv[0] ++ @ q3.32[i] = neigh[i].mv[1] ++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d24.16[i] = curr[i].pred_flag ++ @ d25.16[i] = neigh[i].pred_flag ++ ++ vtst.16 d28, d24, d12 ++ vtst.16 d29, d24, d13 ++ vadd.i16 d8, d24, d12 ++ vadd.i16 d9, d25, d12 ++ vtst.16 d30, d25, d12 ++ vtst.16 d31, d25, d13 ++ veor d26, d8, d9 ++ ldr lr, [sp, 6*8 + 1*4] ++ vmovl.s16 q4, d28 ++ vmovl.s16 q5, d29 ++ teq lr, #1 ++ vmovl.s16 q14, d30 ++ it ne ++ lslne v1, lr, #1 ++ vmovl.s16 q15, d31 ++ it ne ++ rsbne v2, v1, #32 ++ vbif q0, q1, q4 ++ vbif q2, q3, q14 ++ vbif q1, q0, q5 ++ vbif q3, q2, q15 ++ vabd.s16 q12, q0, q2 ++ vabd.s16 q2, q1 ++ vabd.s16 q0, q3 ++ vabd.s16 q1, q3 ++ vbif q8, q9, q4 ++ vbif q10, q11, q14 ++ vbif q9, q8, q5 ++ vbif q11, q10, q15 ++ vclt.u16 d6, d24, d27 ++ vclt.u16 d8, d2, d27 ++ vclt.u16 d7, d25, d27 ++ vclt.u16 d9, d3, d27 ++ vclt.u16 d2, d0, d27 ++ vclt.u16 d0, d4, d27 ++ vclt.u16 d3, d1, d27 ++ vclt.u16 d1, d5, d27 ++ vceq.i32 q12, q10, q8 ++ vceq.i32 q10, q9 ++ vceq.i32 q8, q11 ++ vceq.i32 q9, q11 ++ vshrn.i32 d6, q3, #8 ++ vshrn.i32 d7, q4, #8 ++ vshrn.i32 d8, q1, #8 ++ vshrn.i32 d9, q0, #8 ++ vmovn.i32 d4, q12 ++ vmovn.i32 d2, q10 ++ vmovn.i32 d3, q8 ++ vmovn.i32 d5, q9 ++ vand q2, q3 ++ vrev16.8 q3, q3 ++ vand q2, q3 ++ vand q1, q4 ++ vrev16.8 q4, q4 ++ vand q1, q4 ++ vand d4, d5 ++ vand d2, d3 ++ vbic d0, d12, d4 ++ vshr.u16 d26, #2 ++ vbic d0, d2 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d26 ++ bne 10f ++ ++ @ Merge results into result word, no duplicates ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ lsl a2, #30 ++ lsl v8, #30 ++ lsl ip, #30 ++ lsl lr, #30 ++ orr a2, ip, a2, lsr #2 ++ orr v8, lr, v8, lsr #2 ++ orr a2, v8, a2, lsr #4 ++ subs a1, #4 ++ orr v7, a2, v7, lsr #8 ++ bhi 1b ++ ++ mov a1, #32 ++ ldr a3, [sp, #6*8] ++ vpop {d8-d13} ++ sub a1, a1, a3, lsl #1 ++ mov a1, v7, lsr a1 ++ pop {a2-a4,v1-v8,pc} ++10: ++ @ Merge results into result word, with duplicates ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ lsl a2, 
v2 ++ subs a1, #4 ++ lsl v8, v2 ++ lsl ip, v2 ++ lsl lr, v2 ++ ldr v2, [sp, #6*8 + 12*4 + 1*4] ++T lsr a2, v1 ++T orr a2, ip, a2 ++A orr a2, ip, a2, lsr v1 ++ lsl ip, v1, #1 ++T lsr v8, v1 ++T orr v8, lr, v8 ++A orr v8, lr, v8, lsr v1 ++ lsl lr, v1, #2 ++T lsr a2, ip ++T orr a2, v8, a2 ++A orr a2, v8, a2, lsr ip ++ ldr v1, [sp, #6*8 + 12*4] ++T lsr v7, lr ++T orr v7, a2, v7 ++A orr v7, a2, v7, lsr lr ++ bhi 1b ++ ++ mov a1, #32 ++ ldrd a3, a4, [sp, #6*8] ++ vpop {d8-d13} ++ mls a1, a3, a4, a1 ++ mls a1, a3, a4, a1 ++ mov a1, v7, lsr a1 ++ pop {a2-a4,v1-v8,pc} ++ ++ ++2: ++ sub v5, v5, #10 ++ sub v6, v6, #10 ++ vmov.u8 d16, #0 ++ blo 3f ++ vld2.32 {d0[0], d1[0]}, [a3]! ++ vld2.32 {d2[0], d3[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[4]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[0]}, [ip] ++ vld1.32 {d6[0]}, [v8] ++ vld1.32 {d7[0]}, [lr] ++ ++3: ++ vld2.32 {d0[1], d1[1]}, [a3]! ++ vld2.32 {d2[1], d3[1]}, [a4]! ++ ldrb a2, [a3], #1 ++ vmov.u16 d17, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d18, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d19, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[6]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[1]}, [ip] ++ vld1.32 {d6[1]}, [v8] ++ vld1.32 {d7[1]}, [lr] ++ ++ @ So now we have: ++ @ d0.32[i] = curr[i].mv[0] ++ @ d1.32[i] = curr[i].mv[1] ++ @ d2.32[i] = neigh[i].mv[0] ++ @ d3.32[i] = neigh[i].mv[1] ++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d16.16[i] = curr[i].pred_flag ++ @ d16.16[2+i] = neigh[i].pred_flag ++ ++ vtst.16 d20, d16, d17 ++ vtst.16 d22, d16, d18 ++ vadd.i16 d30, d16, d17 ++ vswp d2, d3 ++ ldr lr, [sp, #1*4] ++ vmovl.s16 q10, d20 ++ teq lr, #1 ++ vmovl.s16 q11, d22 ++ it ne ++ lslne v1, lr, #1 ++ vbif d0, d1, d20 ++ vbif d4, d6, d20 ++ vbif d3, d2, d21 ++ vbif d5, d7, d21 ++ vbif d1, d0, d22 ++ vbif d6, d4, d22 ++ vbif d2, d3, d23 ++ vbif d7, d5, d23 ++ vshr.u16 d30, #2 ++ vabd.s16 d24, d0, d3 ++ vabd.s16 d25, d1, d2 ++ vabd.s16 q0, q0, q1 ++ vceq.i32 d2, d4, d5 ++ vceq.i32 d20, d5, d6 ++ vceq.i32 d21, d4, d7 ++ vceq.i32 d3, d6, d7 ++ vclt.u16 d6, d24, d19 ++ vclt.u16 d7, d25, d19 ++ vclt.u16 d22, d1, d19 ++ vclt.u16 d23, d0, d19 ++ vshrn.i32 d6, q3, #8 ++ vmovn.i32 d2, q1 ++ vshrn.i32 d7, q11, #8 ++ vmovn.i32 d3, q10 ++ vand q0, q3, q1 ++ it ne ++ rsbne v2, v1, #32 ++ vrev16.8 q3, q3 ++ vand q0, q3 ++ vsra.u64 d30, #32 ++ vshr.u64 q1, q0, #32 ++ vand q0, q1 ++ vbic d0, d17, d0 ++ vand d30, d30, d17 ++ vbic d0, d1 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d30 ++ bne 10f ++ ++ @ Construct result word, no duplicates ++ cmp a1, #2 ++ vmov.u16 a1, d0[1] ++ vmov.u16 a2, d0[0] ++ it eq ++ orreq a1, a2, a1, lsl #2 ++ pop {a2-a4,v1-v8,pc} ++10: ++ @ Construct result word, with duplicates ++ cmp a1, #2 ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov.u16 a1, d0[1] ++ lsl a2, #16 ++ pkhbt a1, a1, a1, lsl #16 ++ lsr a2, v2 ++ lsr a1, v2 ++T itt eq ++T lsleq a1, v1 ++T orreq a1, a2, a1 ++A orreq a1, a2, a1, lsl v1 ++ pop {a2-a4,v1-v8,pc} ++endfunc ++ ++ ++ ++#else // non-NEON version ++ ++ ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, 
const HEVCRpiMvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc0, in_inc1) ++ */ ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 ++ add ip, sp, #4*4 ++ push {a2-a4,v1-v8,lr} ++ mov v6, #32 ++1: ldmdb ip, {v1-v4} ++ ldrsb v5, [a3, #8] @ curr->ref_idx ++ ldrsb v8, [a3, #9] ++ ldrsb ip, [a4, #8] @ neigh->ref_idx ++ ldrsb lr, [a4, #9] ++ ldr v1, [v1, v5, lsl #2] ++ ldrb v5, [a3, #10] @ curr->pred_flag ++ ldr v2, [v2, v8, lsl #2] ++ ldrb v8, [a4, #10] @ neigh->pred_flag ++ ldr v3, [v3, ip, lsl #2] ++ ldr v4, [v4, lr, lsl #2] ++ teq v5, #3 ++ beq 20f ++ teq v8, #3 ++ beq 90f ++ ++ tst v5, #1 ++ itee ne ++ ldrne v5, [a3, #0] @ curr->mv[0] ++ moveq v1, v2 ++ ldreq v5, [a3, #4] @ curr->mv[1] ++ tst v8, #1 ++ itee ne ++ ldrne v8, [a4, #0] @ neigh->mv[0] ++ moveq v3, v4 ++ ldreq v8, [a4, #4] @ neigh->mv[1] ++ teq v1, v3 ++ bne 10f ++ ldr lr, =0xFFFCFFFC ++ ssub16 ip, v8, v5 ++ ssub16 v5, v5, v8 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ @ drop through ++10: it ne ++ movne v5, #1<<30 ++11: ++ sub v6, v6, #2 ++T mov v7, v7, lsr #2 ++ subs a2, a2, #1 ++A orr v7, v5, v7, lsr #2 ++T orr v7, v5, v7 ++ bhi 11b ++ ++ ldrd v3, v4, [sp, #16*4] ++ ldr a2, [sp] ++ add ip, sp, #16*4 ++ subs a1, a1, #1 ++ add a3, a3, v3 ++ add a4, a4, v4 ++ bhi 1b ++ mov a1, v7, lsr v6 ++ pop {a2-a4,v1-v8,pc} ++ ++20: teq v8, #3 ++ bne 10b ++ ++ teq v1, v3 ++ it eq ++ teqeq v2, v4 ++ bne 40f ++ teq v1, v2 ++ bne 30f ++ ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr, =0xFFFCFFFC ++ ssub16 ip, v3, v1 ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 25f ++ ssub16 ip, v4, v2 ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ beq 11b ++ @ drop through ++25: ssub16 ip, v4, v1 ++ ssub16 v5, v1, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 10b ++ ssub16 ip, v3, v2 ++ ssub16 v5, v2, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ b 10b ++ ++30: ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr, =0xFFFCFFFC ++ ssub16 ip, v3, v1 ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 10b ++ ssub16 ip, v4, v2 ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ b 10b ++ ++40: teq v1, v4 ++ ite eq ++ teqeq v2, v3 ++ bne 10b ++ ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr, =0xFFFCFFFC ++ b 25b ++ ++90: ++ mov v5, #1<<30 ++ b 11b ++endfunc ++ ++ ++#endif ++ ++ ++@ ============================================================================= ++@ ++@ 10 bit ++ ++function hevc_loop_filter_luma_body_10 ++ m_filter_luma 10, q11, q15 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ b .Lh_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r10, [sp, #32] ++.Lh_loop_luma_common_10: ++ m_filter_h_luma_16 10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ sub r4, r0, #8 ++ b .Lv_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r4, [sp, #36] ++ ldr r10, [sp, #32] ++ ++.Lv_loop_luma_common_10: ++ m_filter_v_luma_16 10 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 ++ m_filter_h_uv_16 10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1 ++ m_filter_v_uv2_16 10 
++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S +new file mode 100644 +index 0000000000..7ed5c7dc52 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S +@@ -0,0 +1,184 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++/* uses registers q8 - q13 for temp values */ ++.macro tr4_luma_shift shift ++ vaddl.s16 q8, d28, d30 // c0 = src0 + src2 ++ vaddl.s16 q9, d30, d31 // c1 = src2 + src3 ++ vsubl.s16 q10, d28, d31 // c2 = src0 - src3 ++ vaddl.s16 q11, d28, d31 // src0 + src3 ++ ++ vmul.i32 q12, q8, d1[0] // 29 * c0 ++ vmul.i32 q13, q10, d2[0] // 55 * c2 ++ vmul.i32 q8, q8, d2[0] // 55 * c0 ++ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1 ++ ++ vsubw.s16 q11, q11, d30 // src0 - src2 + src3 ++ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1 ++ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1 ++ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2 ++ ++ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3) ++ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3 ++ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3 ++ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3 ++ ++ vqrshrn.s32 d28, q12, \shift ++ vqrshrn.s32 d29, q13, \shift ++ vqrshrn.s32 d30, q11, \shift ++ vqrshrn.s32 d31, q8, \shift ++.endm ++ ++/* uses registers q8 - q11 for temp values */ ++.macro tr4_shift shift ++ vmull.s16 q9, d29, d0[0] // 83 * src1 ++ vmull.s16 q8, d29, d0[1] // 36 * src1 ++ vshll.s16 q14, d28, #6 // 64 * src0 ++ vshll.s16 q10, d30, #6 // 64 * src2 ++ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0 ++ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1 ++ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0 ++ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1 ++ vadd.s32 q14, q11, q9 // e0 + o0 ++ vadd.s32 q15, q10, q8 // e1 + o1 ++ vsub.s32 q8, q10, q8 // e1 - o1 ++ vsub.s32 q9, q11, q9 // e0 - o0 ++ ++ vqrshrn.s32 d28, q14, \shift ++ vqrshrn.s32 d29, q15, \shift ++ vqrshrn.s32 d30, q8, \shift ++ vqrshrn.s32 d31, q9, \shift ++.endm ++ ++.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \ ++ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \ ++ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \ ++ shift, I1, I2, I3 ++ ++ vmull.s16 q4, \d1, d1[1] // 89 * src1 ++ \I1 ++ vmull.s16 q5, \d1, d1[0] // 75 * src1 ++ \I2 ++ vmull.s16 q6, \d1, d1[3] // 50 * src1 ++ \I3 ++ vmull.s16 q7, \d1, d1[2] // 18 * src1 ++ vmlal.s16 q4, \d3, d1[0] // 75 * src3 ++ vmlsl.s16 q5, \d3, d1[2] //-18 * src3 ++ vmlsl.s16 q6, \d3, d1[1] //-89 * src3 ++ vmlsl.s16 q7, \d3, d1[3] //-50 * src3 ++ ++ // tr4 ++ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2) ++ 
vmull.s16 q2, \d2, d0[1] // 36 * src(1*2) ++ ++ vmlal.s16 q4, \d5, d1[3] // 50 * src5 ++ vmlsl.s16 q5, \d5, d1[1] //-89 * src5 ++ vmlal.s16 q6, \d5, d1[2] // 18 * src5 ++ vmlal.s16 q7, \d5, d1[0] // 75 * src5 ++ ++ vshll.s16 q3, \d0, #6 // 64 * src(0*2) ++ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2) ++ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0 ++ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1 ++ vadd.i32 \tmp1, q3, \tmp0 // 64 * (src(0*2) + src(2*2)) e0 ++ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1 ++ ++ vmlal.s16 q4, \d7, d1[2] // 18 * src7 ++ vmlsl.s16 q5, \d7, d1[3] //-50 * src7 ++ vmlal.s16 q6, \d7, d1[0] // 75 * src7 ++ vmlsl.s16 q7, \d7, d1[1] //-89 * src7 ++ ++ vsub.i32 q3, \tmp1, q1 // e0 - o0 ++ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0 ++ vadd.i32 q1, \tmp0, q2 // e1 + o1 ++ vsub.i32 q2, \tmp0, q2 // e1 - o1 ++ ++ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0] ++ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7] ++ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4] ++ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3] ++ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1] ++ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6] ++ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5] ++ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2] ++ vqrshrn.s32 \d0, \tmp0, #\shift ++ vqrshrn.s32 \d4, \tmp1, #\shift ++ vqrshrn.s32 \d1, q3, #\shift ++ vqrshrn.s32 \d5, q1, #\shift ++ vqrshrn.s32 \d2, q6, #\shift ++ vqrshrn.s32 \d6, q5, #\shift ++ vqrshrn.s32 \d3, q7, #\shift ++ vqrshrn.s32 \d7, q4, #\shift ++.endm ++ ++.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3 ++ vld1.16 {\d0}, [r0 :64], r3 ++ vld1.16 {\d1}, [r2 :64], r3 ++ vld1.16 {\d2}, [r0 :64], r3 ++ vld1.16 {\d3}, [r2 :64], r3 ++ vld1.16 {\d4}, [r0 :64], r3 ++ vld1.16 {\d5}, [r2 :64], r3 ++ vld1.16 {\d6}, [r0 :64], r3 ++ vld1.16 {\d7}, [r2 :64], r3 ++ ++ tr8_process \ ++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ ++ \q01, \q23, 7, "\I1", "\I2", "\I3" ++.endm ++ ++.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift ++ tr8_process \ ++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ ++ \q01, \q23, \shift ++ ++ vzip.16 \d0, \d4 ++ vzip.16 \d1, \d5 ++ vzip.16 \d2, \d6 ++ vzip.16 \d3, \d7 ++ vst4.16 {\d0-\d3}, [r0 :128], r3 ++ vst4.16 {\d4-\d7}, [r2 :128], r3 ++.endm ++ ++#define BIT_DEPTH 8 ++#include "rpi_hevc_idct_fn_neon.S" ++ ++.text ++ ++.align 4 ++tr4f: ++.word 0x00240053 // 36 and d1[0] = 83 ++.word 0x00000000 ++tr8f: ++.word 0x0059004b // 89, d0[0] = 75 ++.word 0x00320012 // 50, d0[2] = 18 ++tr16: ++.word 0x005a0057 // 90, d2[0] = 87 ++.word 0x00500046 // 80, d2[2] = 70 ++.word 0x0039002b // 57, d2[0] = 43 ++.word 0x00190009 // 25, d2[2] = 9 ++ ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "rpi_hevc_idct_fn_neon.S" ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c +new file mode 100644 +index 0000000000..109fa98c29 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c +@@ -0,0 +1,32 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/rpi_hevcdsp.h" ++#include "rpi_hevcdsp_arm.h" ++ ++av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevcdsp_rpi_init_neon(c, bit_depth); ++} +diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c +new file mode 100644 +index 0000000000..9294ab8010 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c +@@ -0,0 +1,467 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "config.h" ++#include "libavutil/attributes.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/rpi_hevcdsp.h" ++#include "rpi_hevcdsp_arm.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/bit_depth_template.c" ++ ++// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but ++// have been removed from head as we never use them. 
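++
++// For reference, the scalar model behind the add_residual_* kernels below
++// is simply (a sketch, not part of the build; av_clip_uintp2() is the
++// libavutil clip-to-unsigned-n-bit-range helper):
++//
++//   for (y = 0; y != size; ++y, dst += stride, res += size)
++//       for (x = 0; x != size; ++x)
++//           dst[x] = av_clip_uintp2(dst[x] + res[x], bit_depth);
++//
++// The _dc variants add one constant instead of a coefficient block, and
++// the _u/_v/_c variants work on interleaved chroma, where a plane with no
++// coefficients still gets a saturating dc add.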
++ ++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, ++ uint8_t * _pix_l); ++void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, ++ uint8_t * _pix_l); ++void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs); ++ ++void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void 
ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void 
ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ ++void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t 
stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ ++void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++ ++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh, ++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ int in_inc0, int in_inc1); ++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); ++ ++ ++static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); ++} ++static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); ++} ++ ++static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, 
height); ++ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++ ++#if SAO_FILTER_N == 6 ++static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++} ++static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++} ++ ++static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++ ++static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++ ++static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); ++} ++static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int 
height) ++{ ++ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); ++} ++#endif ++ ++ ++ ++#if RPI_HEVC_SAO_BUF_STRIDE != 160 ++#error SAO edge src stride not 160 - value used in .S ++#endif ++ ++av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) ++{ ++ if (bit_depth == 8) { ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8; ++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; ++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8; ++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8; ++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8; ++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8; ++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8; ++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8; ++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8; ++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8; ++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_8; ++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8; ++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8; ++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8; ++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8; ++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8; ++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8; ++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8; ++ c->sao_edge_filter[5] = 
ff_hevc_rpi_sao_edge_24_neon_8; ++#endif ++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8; ++#endif ++ } ++ else if (bit_depth == 10) { ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10; ++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10; ++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10; ++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10; ++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10; ++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10; ++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10; ++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10; ++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10; ++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10; ++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10; ++#if SAO_FILTER_N == 6 ++ 
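++        // As in the 8-bit branch, the 24- and 48-wide entries are composed
++        // from the 16+8 and 32+16 kernels; at 10 bits the second call's
++        // byte offset doubles (samples are 2 bytes wide).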
c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10; ++#endif ++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10; ++#endif ++ } ++ ++ assert(offsetof(HEVCRpiMvField, mv) == 0); ++ assert(offsetof(HEVCRpiMvField, ref_idx) == 8); ++ assert(offsetof(HEVCRpiMvField, pred_flag) == 10); ++ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; ++ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; ++} +diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..93876d14c0 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S +@@ -0,0 +1,620 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ .arch_extension mp @ enable PLDW ++ ++#define BIT_DEPTH 10 ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ add_residual4x4( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r2 ++ vld1.16 {q10, q11}, [r1] ++ lsl r2, #1 ++ vld1.16 {d0}, [r0 :64], r2 ++ vld1.16 {d1}, [ip :64], r2 ++ vld1.16 {d2}, [r0 :64] ++ vld1.16 {d3}, [ip :64] ++ sub r0, r2 ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q11 ++ vmov.i16 q8, #0 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0 :64], r2 ++ vst1.16 {d1}, [ip :64], r2 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [ip :64] ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r1 ++ vdup.16 q15, r2 ++ lsl r1, #1 ++ vld1.16 {d0}, [r0 :64], r1 ++ vld1.16 {d1}, [ip :64], r1 ++ vld1.16 {d2}, [r0 :64] ++ vld1.16 {d3}, [ip :64] ++ sub r0, r1 ++ vqadd.s16 q0, q15 ++ sub ip, r1 ++ vqadd.s16 q1, q15 ++ vmov.i16 q8, #0 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0 :64], r1 ++ vst1.16 {d1}, [ip :64], r1 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [ip :64] ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual8x8( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1 ++ mov r3, #8 ++ vmov.i64 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vldm r1!, {q10-q13} ++ vld1.16 {q0}, [r0 :128], r2 ++ vld1.16 {q1}, [ip :128], r2 ++ vld1.16 {q2}, [r0 :128] ++ vld1.16 {q3}, [ip :128] ++ sub r0, r2 ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q11 ++ subs r3, #4 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0}, [r0 :128], r2 ++ vst1.16 {q1}, [ip :128], r2 ++ vst1.16 {q2}, [r0 :128], r2 ++ vst1.16 {q3}, [ip :128], r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #4 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual8x8_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r2 ++ mov r3, #8 ++9: ++ vmov.i16 q8, #0 ++ add ip, r0, r1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r1, #1 ++1: ++ vld1.16 {q0}, [r0 :128], r1 ++ vld1.16 {q1}, [ip :128], r1 ++ vld1.16 {q2}, [r0 :128] ++ vld1.16 {q3}, [ip :128] ++ sub r0, r1 ++ vqadd.s16 q0, q15 ++ sub ip, r1 ++ vqadd.s16 q1, q15 ++ subs r3, #4 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0}, [r0 :128], r1 ++ vst1.16 {q1}, [ip :128], r1 ++ vst1.16 {q2}, [r0 :128], r1 ++ vst1.16 {q3}, [ip :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual16x16( ++@ 
uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r2 ++ vmov.i16 q8, #0 ++ lsl r2, #1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ mov r3, #16 ++1: ++ vldm r1!, {q10-q13} ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. :128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0 :128] ++ subs r3, #2 ++ vld1.16 {q2, q3}, [ip :128] ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0 :128], r2 ++ vst1.16 {q2, q3}, [ip :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #8 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual16x16_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r3, #16 ++9: ++ vmov.i16 q8, #0 ++ add ip, r0, r1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r1, #1 ++1: ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. :128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0 :128] ++ subs r3, #2 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vld1.16 {q2, q3}, [ip :128] ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0 :128], r1 ++ vst1.16 {q2, q3}, [ip :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual32x32( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ mov r3, #32 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vldm r1!, {q10-q13} ++ vldm r0, {q0-q3} ++ vqadd.s16 q0, q10 ++ pldw [lr] ++ vqadd.s16 q1, q11 ++ add lr, r2 ++ vqadd.s16 q2, q12 ++ subs r3, #1 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0-q1}, [r0], r2 ++ vst1.16 {q2-q3}, [ip], r2 ++ bne 1b ++ pop {pc} ++ ++endfunc ++ ++@ add_residual16x16_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #16 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual32x32_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r2 ++ mov r3, #32 ++9: ++ vmov.i16 q8, #0 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vldm r0, {q0-q3} ++ vqadd.s16 q0, q15 ++ subs r3, #1 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0-q1}, [r0], r1 ++ vst1.16 {q2-q3}, [ip], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld1.16 {q10, q11}, [r1 :256] ++ lsl r2, #1 ++ vld2.16 {d0, d2}, [r0 :128], r2 ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip 
:128] ++ sub r0, r2 ++ vmov.i16 q8, #0 ++ sub ip, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ mov r3, #8 ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ subs r3, #2 ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ vdup.16 q15, r3 ++ mov r3, #16 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vqadd.s16 q0, q10 ++ pldw [lr] ++ vqadd.s16 q1, q15 ++ add lr, r2 ++ vqadd.s16 q2, q11 ++ subs r3, #1 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld1.16 {q10, q11}, [r1 :256] ++ lsl r2, #1 ++ vld2.16 {d0, d2}, [r0 :128], r2 ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip :128] ++ sub r0, r2 ++ vmov.i16 q8, #0 ++ sub ip, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ mov r3, #8 ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ subs r3, #2 ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! 
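++        @ vld2.16 above de-interleaved the CbCr rows: q0/q2 hold the U
++        @ lanes, which only get the saturating dc_u add from q15, while
++        @ q1/q3 hold the V lanes that take the residual in q10/q11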
++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ vdup.16 q15, r3 ++ mov r3, #16 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vqadd.s16 q0, q15 ++ pldw [lr] ++ vqadd.s16 q1, q10 ++ add lr, r2 ++ vqadd.s16 q2, q15 ++ subs r3, #1 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++ vldm r1, {q10-q13} ++ vld2.16 {d0, d2}, [r0 :128], r2 ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip :128] ++ ++ sub r0, r2 ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ add ip, r0, r2 ++ lsl r2, #1 ++ vmov.i16 q8, #0 ++ add r3, r1, #(8*8*2) @ Offset to V ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ mov lr, #8 ++1: ++ vld1.16 {q10, q11}, [r1 :256]! ++ subs lr, #2 ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q12, q13}, [r3 :256]! ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ add_residual16x16_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ push {r4, lr} ++ vmov.i16 q8, #0 ++ add r3, r1, #(16*16*2) @ Offset to V ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++ add r4, r0, r2 ++ mov lr, #16 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vld1.16 {q12, q13}, [r3 :256]! ++ vqadd.s16 q0, q10 ++ pldw [r4] ++ vqadd.s16 q1, q12 ++ add r4, r2 ++ vqadd.s16 q2, q11 ++ subs lr, #1 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {r4,pc} ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S +new file mode 100644 +index 0000000000..d9a1d7d98c +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S +@@ -0,0 +1,741 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. 
++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ .arch_extension mp @ enable PLDW ++ ++@ General notes: ++@ ++@ Residual is generally only guaranteed to be clipped to 16 bits. ++@ This means that we do need to do vmovl, vqadd, vqmovun ++@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away ++@ with this). ++@ ++@ There is an exception for the DC case because its transform is guaranteed ++@ to be small enough that overflow cannot occur during the first add. ++ ++@ ============================================================================ ++@ Y add ++ ++function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1] ++ lsl r2, #1 ++ vld1.32 d4[0], [r0], r2 ++ rsb r3, r2, #0 ++ vld1.32 d4[1], [ip], r2 ++ vld1.32 d5[0], [r0], r3 ++ vld1.32 d5[1], [ip], r3 ++ vmovl.u8 q8, d4 ++ vmovl.u8 q9, d5 ++ vqadd.s16 q0, q8 ++ vqadd.s16 q1, q9 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r2 ++ vst1.32 d0[1], [ip], r2 ++ vst1.32 d1[0], [r0] ++ vst1.32 d1[1], [ip] ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 ++ push {r4, lr} ++ vld1.16 {q0, q1}, [r1]! ++ add ip, r0, r2 ++ vld1.8 {d6}, [r0] ++ add r4, r0, r2, lsl #1 ++ vld1.8 {d7}, [ip] ++ add lr, ip, r2, lsl #1 ++ lsl r2, #1 ++ mov r3, #8-2 ++ vmovl.u8 q2, d6 ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++1: ++ vld1.16 {q0, q1}, [r1]! ++ subs r3, #2 ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vld1.8 {d6}, [r4], r2 ++ vld1.8 {d7}, [lr], r2 ++ vst1.8 {d4}, [r0], r2 ++ vst1.8 {d5}, [ip], r2 ++ vmovl.u8 q2, d6 ++ pldw [r4] ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++ bne 1b ++ ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vst1.8 {d4}, [r0] ++ vst1.8 {d5}, [ip] ++ pop {r4, pc} ++endfunc ++ ++function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 ++ vld1.16 {q0, q1}, [r1]! 
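++        @ As in the 8x8 function above, the first row is fetched ahead of
++        @ the loop so each iteration overlaps the widen/add/narrow of one
++        @ row with the loads (and pldw prefetch) for the next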
++ add ip, r0, r2 ++ vld1.8 {q3}, [r0] ++ mov r3, #16-1 ++ vmovl.u8 q2, d6 ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++1: ++ vld1.16 {q0, q1}, [r1]! ++ subs r3, #1 ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vld1.8 {q3}, [ip], r2 ++ vst1.8 {q2}, [r0], r2 ++ vmovl.u8 q2, d6 ++ pldw [ip] ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++ bne 1b ++ ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vst1.8 {q2}, [r0] ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 ++ vldm r1!, {q0-q3} ++ vld1.8 {q8, q9}, [r0] ++ add ip, r0, r2 ++ vmovl.u8 q10, d16 ++ mov r3, #32-1 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q11, q1 ++ vqadd.s16 q12, q2 ++ vqadd.s16 q13, q3 ++1: ++ vldm r1!, {q0-q3} ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q11 ++ vqmovun.s16 d22, q12 ++ vqmovun.s16 d23, q13 ++ vld1.8 {q8, q9}, [ip], r2 ++ subs r3, #1 ++ vst1.8 {q10, q11}, [r0], r2 ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q11, q1 ++ vqadd.s16 q12, q2 ++ vqadd.s16 q13, q3 ++ bne 1b ++ ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q11 ++ vqmovun.s16 d22, q12 ++ vqmovun.s16 d23, q13 ++ vst1.8 {q10, q11}, [r0] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_4x4_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 ++ add ip, r0, r1 ++ vdup.16 q15, r2 ++ lsl r1, #1 ++ vld1.32 d4[0], [r0], r1 ++ rsb r3, r1, #0 ++ vld1.32 d4[1], [ip], r1 ++ vld1.32 d5[0], [r0], r3 ++ vld1.32 d5[1], [ip], r3 ++ vaddw.u8 q0, q15, d4 ++ vaddw.u8 q1, q15, d5 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r1 ++ vst1.32 d0[1], [ip], r1 ++ vst1.32 d1[0], [r0] ++ vst1.32 d1[1], [ip] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ DC Y or C add ++ ++@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 ++ mov r3, #4-2 ++ vdup.32 q15, r2 ++ b 1f ++endfunc ++ ++@ ff_hevc_rpi_add_residual_8x8_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #8-2 ++1: vld1.8 d16, [r0] ++ add ip, r0, r1 ++ push {r4, lr} ++ vld1.8 d17, [ip] ++ add r4, r0, r1, lsl #1 ++ vaddw.u8 q0, q15, d16 ++ lsl r1, #1 ++ vaddw.u8 q1, q15, d17 ++ add lr, ip, r1 ++1: ++ vld1.8 {d16}, [r4], r1 ++ vld1.8 {d17}, [lr], r1 ++ subs r3, #2 ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vst1.8 {d4}, [r0], r1 ++ vst1.8 {d5}, [ip], r1 ++ bne 1b ++ ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vst1.8 {d4}, [r0] ++ vst1.8 {d5}, [ip] ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 ++ mov r3, #8-1 ++ vdup.32 q15, r2 ++ b 1f ++endfunc ++ ++@ ff_hevc_rpi_add_residual_16x16_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #16-1 ++1: vld1.8 {q8}, [r0] ++ add ip, r0, r1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, 
d17 ++1: ++ vld1.8 {q8}, [ip], r1 ++ subs r3, #1 ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vst1.8 {q2}, [r0], r1 ++ bne 1b ++ ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vst1.8 {q2}, [r0] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 ++ mov r3, #16-1 ++ vdup.32 q15, r2 ++ b 1f ++endfunc ++ ++@ ff_hevc_rpi_add_residual_32x32_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #32-1 ++1: vld1.8 {q8, q9}, [r0] ++ add ip, r0, r1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++1: ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d23, q3 ++ vld1.8 {q8, q9}, [ip], r1 ++ subs r3, #1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vst1.8 {q10, q11}, [r0], r1 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d23, q3 ++ vst1.8 {q10, q11}, [r0] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1] ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.8 {d18}, [r0 :64] ++ sub r0, r2 ++ vld1.8 {d19}, [ip :64] ++ sub ip, r2 ++ vdup.16 q2, r3 ++ vdup.16 q3, r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ push {r4, lr} ++ vld2.8 {d16, d17}, [r0 :128] ++ lsl r2, #1 ++ vld2.8 {d18, d19}, [ip :128] ++ mov r3, #8-2 ++ vld1.16 {q0, q1}, [r1 :256]! ++ add r4, r0, r2 ++ vmovl.u8 q10, d16 ++ add lr, ip, r2 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++1: ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q2 ++ vld2.8 {d16, d17}, [r4 :128], r2 ++ subs r3, #2 ++ vqmovun.s16 d22, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vld2.8 {d18, d19}, [lr :128], r2 ++ vst2.8 {d22, d23}, [ip :128], r2 ++ vld1.16 {q0, q1}, [r1 :256]! 
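++        @ vld2.8 left the U bytes in d16/d18 and the V bytes in d17/d19:
++        @ U is widened with vmovl for the full 16-bit residual add while
++        @ V takes the dc in a single vaddw (the dc add cannot overflow -
++        @ see the notes at the top of this file)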
++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q2 ++ vqmovun.s16 d22, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20, d21}, [r0 :128] ++ vst2.8 {d22, d23}, [ip :128] ++ pop {r4, pc} ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld2.8 {q8, q9}, [r0 :256] ++ mov r3, #16-1 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q11, d16 ++ vmovl.u8 q12, d17 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d18 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d19 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs r3, #1 ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q11 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q12 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vst2.8 {q10, q11}, [r0 :256], r2 ++ vmovl.u8 q11, d16 ++ pldw [ip] ++ vmovl.u8 q12, d17 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d18 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d19 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q11 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q12 ++ vst2.8 {q10, q11}, [r0 :256] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q2, q3}, [r1] ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.8 {d18}, [r0 :64] ++ sub r0, r2 ++ vld1.8 {d19}, [ip :64] ++ sub ip, r2 ++ vdup.16 q0, r3 ++ vdup.16 q1, r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ push {r4, lr} ++ vld2.8 {d16, d17}, [r0 :128] ++ lsl r2, #1 ++ vld2.8 {d18, d19}, [ip :128] ++ mov r3, #8-2 ++ vld1.16 {q0, q1}, [r1 :256]! ++ add r4, r0, r2 ++ vmovl.u8 q10, d17 ++ add lr, ip, r2 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d16 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d18 ++1: ++ vqmovun.s16 d20, q2 ++ vqmovun.s16 d21, q0 ++ vld2.8 {d16, d17}, [r4 :128], r2 ++ subs r3, #2 ++ vqmovun.s16 d22, q3 ++ vqmovun.s16 d23, q1 ++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vld2.8 {d18, d19}, [lr :128], r2 ++ vst2.8 {d22, d23}, [ip :128], r2 ++ vld1.16 {q0, q1}, [r1 :256]! 
++ vmovl.u8 q10, d17 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d16 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d18 ++ bne 1b ++ ++ vqmovun.s16 d20, q2 ++ vqmovun.s16 d21, q0 ++ vqmovun.s16 d22, q3 ++ vqmovun.s16 d23, q1 ++ vst2.8 {d20, d21}, [r0 :128] ++ vst2.8 {d22, d23}, [ip :128] ++ pop {r4, pc} ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld2.8 {q8, q9}, [r0 :256] ++ mov r3, #16-1 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q11, d18 ++ vmovl.u8 q12, d19 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d16 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d17 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs r3, #1 ++ vqmovun.s16 d20, q11 ++ vqmovun.s16 d22, q0 ++ vqmovun.s16 d21, q12 ++ vqmovun.s16 d23, q1 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vst2.8 {q10, q11}, [r0 :256], r2 ++ vmovl.u8 q11, d18 ++ pldw [ip] ++ vmovl.u8 q12, d19 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d16 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d17 ++ bne 1b ++ ++ vqmovun.s16 d20, q11 ++ vqmovun.s16 d22, q0 ++ vqmovun.s16 d21, q12 ++ vqmovun.s16 d23, q1 ++ vst2.8 {q10, q11}, [r0 :256] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1]! @ all of U ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ rsb r3, r2, #0 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.16 {q2, q3}, [r1] @ all of V ++ vld1.8 {d18}, [r0 :64], r3 ++ vld1.8 {d19}, [ip :64], r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 ++ vld2.8 {d16, d17}, [r0 :128] ++ add r3, r1, #(8*8*2) @ Offset to V ++ vld1.16 {q0}, [r1 :128]! ++ add ip, r0, r2 ++ vld1.16 {q1}, [r3 :128]! ++ vmovl.u8 q10, d16 ++ push {lr} ++ vmovl.u8 q8, d17 ++ mov lr, #8-1 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q1, q8 ++1: ++ vld2.8 {d16, d17}, [ip :128], r2 ++ subs lr, #1 ++ vld1.16 {q0}, [r1 :128]! ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q1 ++ vld1.16 {q1}, [r3 :128]! ++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q8, d17 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q1, q8 ++ bne 1b ++ ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q1 ++ vst2.8 {d20, d21}, [r0 :128] ++ pop {pc} ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 ++ vld2.8 {q8, q9}, [r0 :256] ++ add r3, r1, #(16*16*2) @ Offset to V ++ vld1.16 {q0, q1}, [r1 :256]! ++ add ip, r0, r2 ++ vld1.16 {q2, q3}, [r3 :256]! 
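++@ Note: the vld2.8 above de-interleaves the CbCr destination row (U bytes to q8, V bytes to q9) so the separate U (r1) and V (r3) residual planes can be added independently; the vst2 stores below re-interleave on the way out.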
++ vmovl.u8 q10, d16 ++ push {lr} ++ vmovl.u8 q8, d17 ++ mov lr, #16-1 ++ vmovl.u8 q11, d18 ++ vmovl.u8 q9, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q8 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q9 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs lr, #1 ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q3 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vst2.8 {d20-d23}, [r0 :256], r2 ++ vld1.16 {q2, q3}, [r3 :256]! ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q8, d17 ++ vmovl.u8 q11, d18 ++ vmovl.u8 q9, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q8 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q9 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20-d23}, [r0 :256] ++ pop {pc} ++endfunc ++ ++@ 32x32 chroma never occurs so NIF ++ ++@ ============================================================================ +diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S +new file mode 100644 +index 0000000000..b56e0f9644 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S +@@ -0,0 +1,2245 @@ ++/* ++ * Copyright (c) 2014 - 2015 Seppo Tomperi ++ * 2017 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.set EDGE_SRC_STRIDE, 160 ++ ++@ PIC jump tables are fractionally more expensive than absolute in our code ++.set jent_pic, CONFIG_PIC ++ ++ ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 ++ vshr.u8 q12, q8, #3 ++ \I1 ++ vadd.i8 q8, \Q_K128 ++ \I2 ++ vshr.u8 q13, q9, #3 ++ \I3 ++ vadd.i8 q9, \Q_K128 ++ \I4 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ ++ vqadd.s8 q8, q12 ++ vshr.u8 q12, q10, #3 ++ vadd.i8 q10, \Q_K128 ++ vqadd.s8 q9, q13 ++ vshr.u8 q13, q11, #3 ++ vadd.i8 q11, \Q_K128 ++ ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vsub.i8 q8, \Q_K128 ++ vqadd.s8 q11, q13 ++ vsub.i8 q9, \Q_K128 ++ vsub.i8 q10, \Q_K128 ++ vsub.i8 q11, \Q_K128 ++.endm ++ ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 ++ \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vadd.i8 q12, q8, \Q_K128 ++ vshr.u8 q8, #3 ++ vtbl.8 d16, \XLAT0, d16 ++ vtbl.8 d17, \XLAT1, d17 ++ vqadd.s8 q12, q8 ++ bmi 2f ++1: \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vsub.i8 q13, q12, \Q_K128 ++ vadd.i8 q12, q8, \Q_K128 ++ vshr.u8 q8, #3 ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++ vtbl.8 d16, \XLAT0, d16 ++ vtbl.8 d17, \XLAT1, d17 ++ vqadd.s8 q12, q8 ++ bpl 1b ++2: vsub.i8 q13, q12, \Q_K128 ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++.endm ++ ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, 
\Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ \I1 ++ vtbl.8 d24, \XLAT0, d24 ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d25, \XLAT1, d25 ++ \I2 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm ++ ++@ Clobbers q10, q11, q12 ++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 ++ \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vshrn.i16 d24, \Q0, #\bit_depth - 5 ++ vshrn.i16 d25, \Q1, #\bit_depth - 5 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 q10, \Q0, d24 ++ vaddw.s8 q11, \Q1, d25 ++ bmi 2f ++1: \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vmax.s16 q10, \Q_MIN ++ vmax.s16 q11, \Q_MIN ++ vshrn.i16 d24, \Q0, #\bit_depth - 5 ++ vshrn.i16 d25, \Q1, #\bit_depth - 5 ++ vmin.s16 q10, \Q_MAX ++ vmin.s16 q11, \Q_MAX ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 q10, \Q0, d24 ++ vaddw.s8 q11, \Q1, d25 ++ bpl 1b ++2: vmax.s16 q10, \Q_MIN ++ vmax.s16 q11, \Q_MIN ++ vmin.s16 q10, \Q_MAX ++ vmin.s16 q11, \Q_MAX ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++.endm ++ ++ ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma ++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of ++@ precision ++ ++@ This, somewhat nasty, bit of code builds the {d0-d3} translation ++@ array via the stack ++@ Given that sao_left_class > 28 can cause wrap we can't just poke ++@ all 4 bytes in at once ++@ ++@ It also loads other common regs ++ ++@ Beware that the offset read here overreads by 6 bytes so source must be sized appropriately ++function band_load_y ++ ldr ip, [sp, #16] @ &sao_offset_val[0] ++ ldr r4, [sp, #20] @ sao_left_class ++ vmov.i64 d4, #0 ++ vmov.i64 q0, #0 ++ pld [r1] ++ vld2.8 {q8}, [ip] ++ sub ip, sp, #8*5 ++ vmov.i64 q1, #0 ++ add r4, ip, r4 ++ vpush {d0-d4} @ Put zero array on stack ++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] ++ ldr ip, [ip, #8*5 + 28] @ height ++ vst1.32 {d16[0]}, [r4] ++ add r4, r1, r3 ++ vpop {d0-d4} @ Pop modified array ++ sub ip, ip, #1 ++ vorr d0, d0, d4 ++ bx lr ++endfunc ++ ++@ Beware that offset reads here overread by 6 bytes so source must be sized appropriately ++function band_load_c ++ ldr ip, [sp, #16] @ &sao_offset_val1[0] ++ ldr r4, [sp, #20] @ sao_left_class1 ++ vmov.i64 d24, #0 ++ vmov.i64 q10, #0 ++ pld [r1] ++ vld2.8 {q8}, [ip] ++ sub ip, sp, #8*5 ++ vmov.i64 q11, #0 ++ add r4, ip, r4 ++ ldr ip, [sp, #24] @ &sao_offset_val2[0] ++ vpush {d20-d24} @ Put zero array on stack ++ vld2.8 {q9}, [ip] ++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] ++ ldr ip, [sp, #8*5 + 28] @ sao_left_class2 ++ vst1.32 {d16[0]}, [r4] ++ add ip, sp, ip ++ vshr.u64 d18, d18, #8 @ 1st interesting val is [1] ++ vldmia sp, {d0-d3} @ Load modified array ++ vldr d16, [sp, #8*4] ++ add r4, r1, r3 ++ vstmia sp, {d20-d24} @ Put zero array on stack (again) ++ vst1.32 {d18[0]}, [ip] ++ vorr d0, d0, d16 ++ vldmia sp, {d4-d7} @ Load modified array ++ vldr d18, [sp, #8*4] ++ 
ldr ip, [sp, #8*5 + 36] @ height ++ add sp, sp, #8*5 ++ vorr d4, d4, d18 ++ sub ip, ip, #1 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_64_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_64_neon_8, export=1 ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vldmia r1, {q8-q11} ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \ ++ "pld [r4]", \ ++ "subs ip, #1", \ ++ "it ne; addne r4, r3", \ ++ "add r1, r3" ++ vstmia r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_32_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vld1.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #2 ++ vld1.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_16_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vld1.8 { q8}, [r1, :128], r3 ++ subs ip, #4 ++ vld1.8 { q9}, [r6, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 { q9}, [r5, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_8_neon_8, export=1 ++ ldr ip, [sp, #8] @ width ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ cmp ip, #8 ++ bl band_load_y ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ blt 4f ++ ++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ ++ "vld1.8 {d16}, [r1, :64], r3", \ ++ "subs ip, #2", \ ++ "vld1.8 {d17}, [r6, :64], r3", \ ++ "", \ ++ "", \ ++ "vst1.8 {d26}, [r0, :64], r2", \ ++ "vst1.8 {d27}, [r5, :64], r2" ++ pop {r4-r6, pc} ++4: ++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ ++ "vld1.32 {d16[0]}, [r1, :32], r3", \ ++ "subs ip, #4", \ ++ "vld1.32 {d16[1]}, [r6, :32], r3", \ ++ "vld1.32 {d17[0]}, [r1, :32], r3", \ ++ "vld1.32 {d17[1]}, [r6, :32], r3", \ ++ "vst1.32 {d26[0]}, [r0, :32], r2", \ ++ "vst1.32 {d26[1]}, [r5, :32], r2", \ ++ "vst1.32 {d27[0]}, [r0, :32], r2", \ ++ "vst1.32 {d27[1]}, [r5, :32], r2" ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_32_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ 
const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ vmov.u8 q15, #128 ++ bl band_load_c ++ ++1: vld2.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #1 ++ vld2.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \ ++ "pld [r4]", \ ++ "it ne; addne r4, r3" ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_c ++ ++1: vld2.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #2 ++ vld2.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 ++ ldr ip, [sp, #16] @ width ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ cmp ip, #8 ++ bl band_load_c ++ blt 4f ++ ++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ ++ "vld2.8 {d16-d17}, [r1, :128], r3", \ ++ "subs ip, #1", \ ++ "", \ ++ "", \ ++ "", \ ++ "vst2.8 {d26-d27}, [r0, :128], r2" ++ pop {r4-r6, pc} ++4: ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ ++ "vld1.8 {d16}, [r1, :64], r3", \ ++ "subs ip, #2", \ ++ "vld1.8 {d17}, [r6, :64], r3", \ ++ "vuzp.8 d16, d17", \ ++ "", \ ++ "vzip.8 d26, d27", \ ++ "vst1.8 {d26}, [r0, :64], r2", \ ++ "vst1.8 {d27}, [r5, :64], r2" ++ pop {r4-r6, pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4-r6, lr} ++ vmov.i64 q2, #0 ++ vmov.i16 q3, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ vpush {q4-q7} ++ ++1: vldm r1, {q4-q11} ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ ++ "subs ip, #1", \ ++ "add r1, r3" ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth ++ vstm r0, {q4-q11} ++ add r0, r2 ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_64_neon_10, export=1 ++ band_64_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_32_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_32_16 bit_depth ++ push {r4-r6, lr} ++ vmov.i64 q2, #0 ++ vmov.i16 q3, 
#(1 << \bit_depth) - 1 ++ bl band_load_y ++ ++1: vldm r1, {q8-q11} ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ ++ "subs ip, #1", \ ++ "add r1, r3" ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_32_neon_10, export=1 ++ band_32_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_16_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_16_16 bit_depth ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.i64 q14, #0 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ ++1: vld1.16 { q8, q9 }, [r1, :128], r3 ++ subs r12, #2 ++ vld1.16 {q10, q11}, [r6, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth ++ vst1.16 { q8, q9 }, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_16_neon_10, export=1 ++ band_16_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_8_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_8_16 bit_depth ++ ldr ip, [sp, #8] @ width ++ push {r4-r6, lr} ++ vmov.i64 q14, #0 ++ cmp ip, #8 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ blt 4f ++ ++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ ++ "vld1.16 {q8}, [r1, :128], r3", \ ++ "subs ip, #2", \ ++ "vld1.16 {q9}, [r6, :128], r3", \ ++ "", \ ++ "", \ ++ "vst1.16 {q10}, [r0, :128], r2", \ ++ "vst1.16 {q11}, [r5, :128], r2" ++ pop {r4-r6, pc} ++4: ++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ ++ "vld1.16 {d16}, [r1, :64], r3", \ ++ "subs ip, #4", \ ++ "vld1.16 {d17}, [r6, :64], r3", \ ++ "vld1.16 {d18}, [r1, :64], r3", \ ++ "vld1.16 {d19}, [r6, :64], r3", \ ++ "vst1.16 {d20}, [r0, :64], r2", \ ++ "vst1.16 {d21}, [r5, :64], r2", \ ++ "vst1.16 {d22}, [r0, :64], r2", \ ++ "vst1.16 {d23}, [r5, :64], r2" ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_8_neon_10, export=1 ++ band_8_16 10 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_c_32_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_32_16 bit_depth ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ sub r2, #64 ++ sub r3, #64 ++ vmov.i64 q14, #0 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ mov lr, #64 ++ vpush {q4-q7} ++ ++1: vld2.16 { q4, q5 }, [r1, :128], lr ++ subs ip, #1 ++ vld2.16 { q6, q7 }, [r6, :128], lr ++ vld2.16 { q8, q9 }, [r1, :128], r3 ++ vld2.16 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "pld [r4]", \ ++ "it ne; addne r4, r3" ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth ++ ++ vst2.16 { q4, q5 }, [r0, :128], lr ++ vst2.16 { q6, q7 }, [r5, :128], lr ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ vst2.16 {q10, q11}, [r5, :128], r2 ++ ++ bpl 1b ++ ++ 
vpop {q4-q7} ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_c_32_neon_10, export=1 ++ band_c_32_16 10 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_c_16_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_16_16 bit_depth ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ vmov.i64 q14, #0 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ ++1: vld2.16 { q8, q9 }, [r1, :128], r3 ++ subs ip, #1 ++ vld2.16 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ vst2.16 {q10, q11}, [r5, :128], r2 ++ ++ bpl 1b ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_c_16_neon_10, export=1 ++ band_c_16_16 10 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_c_8_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_8_16 bit_depth ++ ldr ip, [sp, #16] @ width ++ push {r4-r6, lr} ++ vmov.i64 q14, #0 ++ cmp ip, #8 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ blt 4f ++ ++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "vld2.16 {q8,q9}, [r1, :128], r3", \ ++ "subs ip, #1", \ ++ "", \ ++ "", \ ++ "", \ ++ "vst2.16 {q10,q11}, [r0, :128], r2" ++ pop {r4-r6, pc} ++4: ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "vld2.16 {d16,d18}, [r1, :128], r3", \ ++ "subs ip, #2", \ ++ "vld2.16 {d17,d19}, [r6, :128], r3", \ ++ "", \ ++ "", \ ++ "vst2.16 {d20,d22}, [r0, :128], r2", \ ++ "vst2.16 {d21,d23}, [r5, :128], r2" ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 ++ band_c_8_16 10 ++endfunc ++ ++ ++@ ============================================================================= ++@ SAO EDGE ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_8 ++ ++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 ++ ++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q5 ++ vcgt.u8 q2, q6 ++ vcgt.u8 q3, q7 ++ ++ vsub.s8 q0, q12 @ a = sign(c-a) ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 ++ ++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 ++ ++ vsub.s8 q0, q12 ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 ++ ++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 ++ ++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q14 ++ vadd.s8 q3, q15 ++ ++ vadd.s8 q0, q12 ++ vadd.s8 q1, q12 ++ ++ vld1.8 {d26, d27}, [r5] ++ ++ vadd.s8 q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) ++ ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 ++ ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 ++ ++ vtbl.8 d3, {d27}, d3 ++ ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 ++ ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q12 ++ vqadd.s8 q1, q14 ++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d6, {d27}, d6 ++ vtbl.8 d7, {d27}, d7 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q15 ++ vqadd.s8 q2, q12 ++ vqadd.s8 q3, q14 ++ vsub.s8 q1, q15 ++ vsub.s8 q2, q15 ++ vsub.s8 q3, q15 ++ ++ bx lr ++endfunc ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ r4 upper clip value ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_16 ++ ++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q1 ++ vcgt.u16 q14, q6, q2 ++ vcgt.u16 q15, q7, q3 ++ ++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u16 q1, q1, q5 ++ vcgt.u16 q2, q2, q6 ++ vcgt.u16 q3, q3, q7 ++ ++ vsub.s16 q0, q0, q12 // a = sign(c-a) ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q9 ++ vcgt.u16 q14, q6, q10 ++ vcgt.u16 q15, q7, q11 ++ ++ vsub.s16 q0, q0, q12 ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u16 q13, q9, q5 ++ vcgt.u16 q14, q10, q6 ++ vcgt.u16 q15, q11, q7 ++ ++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s16 q1, q1, q13 ++ vadd.s16 q2, q2, q14 ++ vadd.s16 q3, q3, q15 ++ ++ vmov.u8 q12, #2 ++ ++ vmovn.s16 d0, q0 ++ vmovn.s16 d1, q1 ++ vmovn.s16 d2, q2 ++ vmovn.s16 d3, q3 ++ ++ vldr d26, [r5] ++ ++ vuzp.8 q0, q1 ++ ++ vldr d27, [r5, #8] ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ ++ vmov.i64 q12, #0 ++ ++ vtbl.8 d0, {d26}, d0 ++ vtbl.8 d1, {d26}, d1 ++ vtbl.8 d2, {d27}, d2 ++ vtbl.8 d3, {d27}, d3 ++ ++ vdup.i16 q13, r4 ++ ++ vzip.8 q0, q1 ++ ++ @ Avoid overwrite whilst widening ++ vaddw.s8 q2, q6, d2 ++ vaddw.s8 q3, q7, d3 ++ vaddw.s8 q1, q5, d1 ++ vaddw.s8 q0, q4, d0 ++ ++ @ now clip ++ clip16_4 q2, q3, q1, q0, q12, q13 ++ ++ bx lr ++endfunc ++ ++ ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3, q9, q10 ++@ ++@ d16, d17 (q8) xlat U, V ++@ q14.u8 #2 ++@ q15.u8 #128 ++ ++function edge_16b_body_8 ++ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0 ++ vadd.u8 q9, q14, q9 ++ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0 ++ vsub.u8 q9, q9, q0 ++ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.u8 q9, q9, q0 ++ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.u8 q0, q9, q0 ++ ++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add ++ ++ vuzp.8 d0, d1 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ vqadd.s8 q0, q3 ++ vsub.s8 q0, q15 ++ ++ bx lr ++endfunc ++ ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3 ++@ ++@ q12, #0 ++@ d16, d17 xlat U, V ++@ q14.u8 #2 ++@ q15.u16 max ++function edge_16b_body_16 ++ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0 ++ vadd.u16 q9, q14, q9 ++ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0 ++ vsub.u16 q9, q9, q0 ++ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.u16 q9, q9, q0 ++ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.u16 q0, q9, q0 ++ ++ vmovn.s16 d0, q0 ++ @ d1 will have random contents that we transform but ++ @ that doesn't matter as we then discard them ++ vuzp.8 d0, d1 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ ++ vaddw.s8 q0, q1, d0 ++ ++ @ now clip ++ vmax.s16 q0, q12 ++ vmin.s16 q0, q15 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_edge_[c_]xx_neon( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only ++@ int eo, [sp, #sp_base + 0] ++@ int width, [sp, #sp_base + 4] ++@ int height) [sp, #sp_base + 8] ++ ++@ Jumps via jump_tab with ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ EDGE_SRC_STRIDE [r3] ++@ (1 << \bit_depth) - 1 [r4] ++@ * xlat_table [r5] // setup_64b only ++@ int height [r12] ++@ ++@ 0 [q12] // > 8 bit ++@ 
2 [q14] ++@ 128 [q15] // = 8 bit ++@ r4 [q15] // > 8 bit ++ ++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0 ++ ++@ Build translate registers ++@ As translate values can only be 0-4 we don't care about junk in the rest ++@ of the register ++.if \is_chroma ++ ldr ip, [sp, #0] ++ push {r4-r6, lr} @ 16 bytes ++ vld1.8 {d16[2]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[2]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[0]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[0]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[1]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[1]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[3]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[3]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[4]}, [r3] ++ vld1.8 {d17[4]}, [ip] ++ movw r3, EDGE_SRC_STRIDE ++.set sp_base, 20 ++.else ++ add ip, r3, #4 ++ vld1.8 {d16[1]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[0]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[0]}, [r3] ++ add r3, r3, #6 ++ vld1.8 {d17[1]}, [ip] ++ vld1.8 {d16[2]}, [r3] ++ movw r3, EDGE_SRC_STRIDE ++ push {r4-r6, lr} @ 16 bytes ++ vzip.8 d16, d17 ++ vmov d17, d16 ++.set sp_base, 16 ++.endif ++ ++@ If setup_64b we need the xlat table on the stack ++.if \setup_64b ++ sub r5, sp, #16 ++.endif ++ ++@ Get jump address ++@ We have a special case for width 4 as the calling code doesn't detect it ++@ If we may have w4 then we add a 2nd jump table after the 1st ++.if \check_w4 ++ ldr r12, [sp, #sp_base + 4] @ width ++ adr r6, \jump_tab ++ ldr lr, [sp, #sp_base + 0] @ eo ++ cmp r12, #8 ++ it lt ++ addlt r6, #16 ++.else ++ ldr lr, [sp, #sp_base + 0] @ eo ++ adr r6, \jump_tab ++.endif ++ ++ ldr r12, [sp, #sp_base + 8] @ height ++ ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++ vmov.u16 q14, #2 ++.else ++ vmov.u8 q15, #128 ++ vmov.u8 q14, #2 ++.endif ++.endif ++ ++@ If setup_64b we need q4-q7 saved.
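++@ Note that the vpush below also spills q8 - the xlat bytes assembled above - to [r5] (set earlier to sp - 16), which is exactly where the 64b edge body functions reload the translate table from; hence the "spurious but harmless" reload of q8 at the matching vpop.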
++.if \setup_64b ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++ ldr r6, [r6, lr, lsl #2] ++ ++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes ++.if \do2 ++ push {r0, r1, r6, r12} ++.if jent_pic ++ bl 98f ++.else ++ blx r6 ++.endif ++ pop {r0, r1, r6, r12} ++ ++ add r0, #64 ++ add r1, #64 ++.endif ++ ++.if jent_pic ++ bl 98f ++.else ++ blx r6 ++.endif ++ ++@ Tidy up & return ++.if \setup_64b ++ vpop {q4-q8} @ spurious but harmless load of q8 ++.endif ++ pop {r4-r6, pc} ++ ++.if jent_pic && !\xjump ++@ Magic label - used as 98b in jent macro ++98: ++ add pc, r6 ++.endif ++.endm ++ ++ ++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 ++.endm ++ ++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0 ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump ++.endm ++ ++ ++.macro edge_64b_e0, body_fn, pb ++ sub r1, #8 ++ mov r6, lr ++1: vldm r1, {d7-d16} ++ // load a ++ vext.8 q0, q3, q4, #(16 - \pb) ++ add r1, r3 ++ vext.8 q1, q4, q5, #(16 - \pb) ++ subs r12, #1 ++ vext.8 q2, q5, q6, #(16 - \pb) ++ vext.8 q3, q6, q7, #(16 - \pb) ++ pld [r1] ++ // load b ++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ pld [r1, #64] ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vext.8 q10, q6, q7, #\pb ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_32bx2_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ sub r1, #8 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: vldmia r1, {d7-d12} ++ // load a ++ vext.8 q0, q3, q4, #16 - \pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q1, q4, q5, #16 - \pb ++ subs r12, #2 ++ // load b ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vldr d25, [r6, #-8] ++ vldmia r6, {d12-d15} ++ vldr d26, [r6, #32] ++ // load a ++ vext.8 q2, q12, q6, #16 - \pb ++ add r6, r6, r3, lsl #1 ++ vext.8 q3, q6, q7, #16 - \pb ++ // load b ++ vext.8 q10, q6, q7, #\pb ++ vext.8 q11, q7, q13, #\pb ++ bl \body_fn ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ bgt 1b ++ pop {r7,pc} ++.endm ++ ++.macro edge_16b_e0, body_fn, pb ++ sub r1, #8 ++ mov r6, lr ++1: vldmia r1, {d1-d4} ++ add r1, r3 ++ subs r12, #1 ++ vext.8 q0, q0, q1, #16 - \pb ++ vext.8 q2, q1, q2, #\pb ++ ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ sub r1, #8 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: vldmia r1, {d1-d2} ++ vldmia r6, {d3-d4} ++ vldr d6, [r1, #16] ++ subs r12, #2 ++ vldr d7, [r6, #-8] ++ add r1, r1, r3, lsl #1 ++ vext.8 d0, d1, d2, #8 - \pb ++ add r6, r6, r3, lsl #1 ++ vext.8 d5, d3, d4, #\pb ++ vext.8 d4, d2, d6, #\pb ++ vext.8 d1, d7, d3, #8 - \pb ++ ++ bl \body_fn ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ bgt 1b ++ pop {r7,pc} ++.endm ++ ++.macro edge_4bx4_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ ++ tst r1, #4 ++ bne 2f ++1: // r1 (and assumed r6) are 64-bit aligned ++ vldr d2, [r1] ++ vldr d0, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ vldr d20, [r6] ++ subs r12, #4 ++ vldr d18, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d3, [r1] ++ vshr.u64 d4, d2, #\pb * 8 ++ vldr d1, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ vldr d21, [r6] ++ vext.8 d0, d0, d2, #8 - \pb ++ vldr d19, [r6,#-8] ++ add r6, r6, r3, lsl #1 ++ vshr.u64 d22, d20, #\pb * 8 ++ vext.8 d18, d18, d20, #8 - \pb ++ vshr.u64 d5, d3, #\pb * 8 
++ vext.8 d1, d1, d3, #8 - \pb ++ vshr.u64 d23, d21, #\pb * 8 ++ vext.8 d19, d19, d21, #8 - \pb ++ vsli.64 q1, q10, #32 ++ vsli.64 q2, q11, #32 ++ vsli.64 q0, q9, #32 ++ ++ bl \body_fn ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 1b ++ pop {r7,pc} ++ ++2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned ++ vldr d20, [r1, #-4] ++ vldr d22, [r1, #4] ++ add r1, r1, r3, lsl #1 ++ vldr d2, [r6, #-4] ++ subs r12, #4 ++ vldr d4, [r6, #4] ++ add r6, r6, r3, lsl #1 ++ vldr d21, [r1, #-4] ++ vshl.i64 d18, d20, #\pb * 8 ++ vldr d23, [r1, #4] ++ add r1, r1, r3, lsl #1 ++ vldr d3, [r6, #-4] ++ vext.8 d22, d20, d22, #\pb ++ vldr d5, [r6, #4] ++ add r6, r6, r3, lsl #1 ++ vshl.i64 d0, d2, #\pb * 8 ++ vext.8 d4, d2, d4, #\pb ++ vshl.i64 d19, d21, #\pb * 8 ++ vext.8 d23, d21, d23, #\pb ++ vshl.i64 d1, d3, #\pb * 8 ++ vext.8 d5, d3, d5, #\pb ++ vsri.64 q1, q10, #32 ++ vsri.64 q0, q9, #32 ++ vsri.64 q2, q11, #32 ++ ++ bl \body_fn ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 2b ++ pop {r7,pc} ++.endm ++ ++ ++.macro edge_64b_e1, body_fn ++ sub r1, r3 ++ push {lr} ++ add r6, r1, #32 ++ // load a ++ vld1.8 {q0-q1}, [r1, :256], r3 ++ vld1.8 {q2-q3}, [r6, :256], r3 ++ // load c ++ vld1.8 {q4-q5}, [r1, :256], r3 ++ vld1.8 {q6-q7}, [r6, :256], r3 ++1: // load b ++ vld1.8 {q8-q9}, [r1, :256], r3 ++ subs r12, #1 ++ vld1.8 {q10-q11}, [r6, :256], r3 ++ bl \body_fn ++ vstm r0, {q0-q3} ++ // copy c to a ++ vmov.64 q0, q4 ++ pld [r1, r3] ++ vmov.64 q1, q5 ++ it le ++ pople {lr} ++ vmov.64 q2, q6 ++ it le ++ bxle lr ++ vmov.64 q3, q7 ++ add r0, r0, r2 ++ // copy b to c ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ b 1b ++.endm ++ ++.macro edge_32bx2_e1, body_fn ++ sub r6, r1, r3 ++ vld1.8 {q2-q3}, [r1, :256], r3 ++ vld1.8 {q0-q1}, [r6, :256] ++ mov r6, lr ++ ++1: @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.8 {q8-q9}, [r1, :256], r3 ++ subs r12, #2 ++ vmov q4, q2 ++ vmov q5, q3 ++ vld1.8 {q10-q11}, [r1, :256], r3 ++ vmov q6, q8 ++ vmov q7, q9 ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ // copy b to a ++ vmov q0, q8 ++ vmov q1, q9 ++ vst1.8 {q2-q3}, [r0, :256], r2 ++ vmov q2, q10 ++ it le ++ bxle r6 ++ vmov q3, q11 ++ b 1b ++.endm ++ ++.macro edge_16b_e1, body_fn ++ sub r6, r1, r3 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++ // load a ++ vld1.8 {q0}, [r6, :128] ++ mov r6, lr ++1: // load b ++ vld1.8 {q2}, [r1, :128], r3 ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ subs r12, #1 ++ // copy c to a ++ vmov.64 q0, q1 ++ it le ++ bxle r6 ++ // copy b to c ++ vmov.64 q1, q2 ++ b 1b ++.endm ++ ++.macro edge_8bx2_e1, body_fn ++ sub r6, r1, r3 ++ lsl r3, #1 ++ push {r7, lr} ++ vld1.8 {d1}, [r1, :64], r3 ++ vld1.8 {d0}, [r6, :64], r3 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.8 {d4}, [r6, :64], r3 ++ vmov d2, d1 ++ vld1.8 {d5}, [r1, :64], r3 ++ subs r12, #2 ++ vmov d3, d4 ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ++ // copy b to a ++ vmov q0, q2 ++ bgt 1b ++ pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e1, body_fn ++ sub r6, r1, r3 ++ lsl r3, #1 ++ push {r7, lr} ++ vld1.32 {d0[1]}, [r1, :32], r3 ++ add r7, r0, 
r2 ++ vld1.32 {d0[0]}, [r6, :32], r3 ++ lsl r2, #1 ++ vld1.32 {d4[1]}, [r1, :32], r3 ++ vld1.32 {d4[0]}, [r6, :32], r3 ++ vld1.32 {d5[1]}, [r1, :32], r3 ++ vld1.32 {d5[0]}, [r6, :32], r3 ++ vmov d1, d4 ++ vext.32 d2, d0, d4, #1 ++ subs r12, #4 ++ vmov d22, d5 ++ vext.32 d3, d4, d5, #1 ++ b 2f ++ ++1: vst1.32 {d0[0]}, [r0, :32], r2 ++ vext.32 d2, d22, d4, #1 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vmov d0, d22 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vext.32 d3, d4, d5, #1 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ vmov d1, d4 ++ vmov d22, d5 ++2: @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ bl \body_fn ++ ble 3f ++ vld1.32 {d4[0]}, [r6, :32], r3 ++ subs r12, #4 ++ vld1.32 {d4[1]}, [r1, :32], r3 ++ vld1.32 {d5[0]}, [r6, :32], r3 ++ vld1.32 {d5[1]}, [r1, :32], r3 ++ b 1b ++ ++3: vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32] ++ vst1.32 {d1[1]}, [r7, :32] ++ pop {r7, pc} ++.endm ++ ++.macro edge_64b_e2, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ // load c and a ++ vld1.8 {q4-q5}, [r1, :128] ++ vldr d25, [r6, #-8] ++ vldmia r6, {d16-d23} ++ vext.8 q0, q12, q8, #16 - \pb ++ add r6, r1, #32 ++ vext.8 q1, q8, q9, #16 - \pb ++ add r1, r1, r3 ++ vext.8 q2, q9, q10, #16 - \pb ++ vld1.8 {q6-q7}, [r6, :128] ++ sub r6, r1, r3 ++ vext.8 q3, q10, q11, #16 - \pb ++ ++1: // load b ++ vldmia r1, {d16-d24} ++ vext.8 q8, q8, q9, #\pb ++ pld [r1, r3] ++ vext.8 q9, q9, q10, #\pb ++ subs r12, #1 ++ vext.8 q10, q10, q11, #\pb ++ vext.8 q11, q11, q12, #\pb ++ bl \body_fn ++ // next a is mostly available in c ++ vldr d25, [r6, #-8] ++ vstmia r0, {q0-q3} ++ vext.8 q3, q6, q7, #16 - \pb ++ it le ++ pople {lr} ++ vext.8 q2, q5, q6, #16 - \pb ++ it le ++ bxle lr ++ vext.8 q1, q4, q5, #16 - \pb ++ add r6, r6, r3 ++ vext.8 q0, q12, q4, #16 - \pb ++ add r0, r0, r2 ++ // next c is mostly available in b ++ vldr d8, [r1] ++ vext.8 d9, d16, d17, #8 - \pb ++ vext.8 q5, q8, q9, #16 - \pb ++ add r1, r1, r3 ++ vext.8 q6, q9, q10, #16 - \pb ++ pld [r6, #-8] ++ vext.8 q7, q10, q11, #16 - \pb ++ b 1b ++.endm ++ ++.macro edge_32bx2_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ // load a and first 32b of c ++ vld1.8 {q4-q5}, [r1, :256] ++ vldr d25, [r6, #-8] ++ vld1.8 {q13-q14}, [r6, :256] ++ vldr d31, [r1, #-8] ++ add r6, r6, r3, lsl #1 ++ vext.8 q0, q12, q13, #16 - \pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q1, q13, q14, #16 - \pb ++ vext.8 q2, q15, q4, #16 - \pb ++ vext.8 q3, q4, q5, #16 - \pb ++1: ++ // load second 32b of c and second 32b of b ++ vldmia r6, {d12-d16} ++ vldmia r1, {d20-d24} ++ // first 32b of b is mostly available in second 32b of c ++ vext.8 q9, q7, q8, #\pb ++ subs r12, #2 ++ vext.8 q8, q6, q7, #\pb ++ vext.8 q10, q10, q11, #\pb ++ vext.8 q11, q11, q12, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ ble 2f ++ ++ vldr d25, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d8, [r1] ++ vext.8 d9, d20, d21, #8 - \pb ++ vldr d31, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ // first 32b of a is mostly available in second 32b of c ++ vext.8 q1, q6, q7, #16 - \pb ++ vext.8 q0, q12, q6, #16 - \pb ++ // first 32b of c is mostly available in second 32b of b ++ vext.8 q5, q10, q11, #16 - \pb ++ // second 32b of a is mostly available in first 32b of c ++ vext.8 q2, q15, q4, #16 - \pb ++ vext.8 q3, q4, q5, #16 - \pb ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_16b_e2, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 
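++@ e2 is the 135 degree diagonal: a is the up-left neighbour of c and b the down-right one, hence the row above is read with a -\pb byte offset and the row below with +\pb.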
++ vld1.8 {q1}, [r1, :128], r3 ++ vldr d19, [r6, #-8] ++ vld1.8 {q10}, [r6, :128], r3 ++ ++1: vldmia r1, {d4-d6} ++ vext.8 q0, q9, q10, #16 - \pb ++ subs r12, #1 ++ vext.8 q2, q2, q3, #\pb ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ ble 2f ++ vmov q10, q1 ++ vldr d2, [r1] ++ add r1, r1, r3 ++ vldr d19, [r6, #-8] ++ add r6, r6, r3 ++ vext.8 d3, d4, d5, #8 - \pb ++ b 1b ++ ++2: pop {pc} ++.endm ++ ++.macro edge_8bx2_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ vldr d18, [r6, #-8] ++ vldr d19, [r6] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #-8] ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldmia r6, {d3-d4} ++ vld1.8 {d21-d22}, [r1, :128] ++ ++1: vext.8 d0, d18, d19, #8 - \pb ++ vext.8 d4, d3, d4, #\pb ++ vext.8 d1, d20, d2, #8 - \pb ++ subs r12, #2 ++ vext.8 d5, d21, d22, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ble 2f ++ ++ vldr d18, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #-8] ++ vmov d19, d3 ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldmia r6, {d3-d4} ++ vld1.8 {d21-d22}, [r1, :128] ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7-r9, lr} ++ add r8, r1, r3 ++ sub r6, r6, #\pb ++ add r8, r8, #\pb ++ add r7, r0, r2 ++ lsl r2, #1 ++ ++1: vld1.32 {d0[0]}, [r6], r3 ++ subs r12, #4 ++ vld1.32 {d2[0]}, [r1], r3 ++ vld1.32 {d4[0]}, [r8], r3 ++ vld1.32 {d0[1]}, [r6], r3 ++ vld1.32 {d2[1]}, [r1], r3 ++ vld1.32 {d4[1]}, [r8], r3 ++ vld1.32 {d1[0]}, [r6], r3 ++ vld1.32 {d3[0]}, [r1], r3 ++ vld1.32 {d5[0]}, [r8], r3 ++ vld1.32 {d1[1]}, [r6], r3 ++ vld1.32 {d3[1]}, [r1], r3 ++ vld1.32 {d5[1]}, [r8], r3 ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 1b ++ ++ pop {r7-r9,pc} ++.endm ++ ++.macro edge_64b_e3, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ // load c and a ++ vld1.8 {q4-q5}, [r1, :128] ++ vldmia r6, {d16-d24} ++ vext.8 q0, q8, q9, #\pb ++ add r6, r1, #32 ++ vext.8 q1, q9, q10, #\pb ++ add r1, r1, r3 ++ vext.8 q2, q10, q11, #\pb ++ vld1.8 {q6-q7}, [r6, :128] ++ sub r6, r1, r3 ++ vext.8 q3, q11, q12, #\pb ++ ++1: // load b ++ vldr d17, [r1, #-8] ++ vldmia r1, {d18-d25} ++ vext.8 q8, q8, q9, #16 - \pb ++ pld [r1, r3] ++ vext.8 q9, q9, q10, #16 - \pb ++ subs r12, #1 ++ vext.8 q10, q10, q11, #16 - \pb ++ vext.8 q11, q11, q12, #16 - \pb ++ bl \body_fn ++ // next a is mostly available in c ++ vldr d24, [r6, #64] ++ vstmia r0, {q0-q3} ++ vext.8 q0, q4, q5, #\pb ++ it le ++ pople {lr} ++ vext.8 q1, q5, q6, #\pb ++ it le ++ bxle lr ++ vext.8 q2, q6, q7, #\pb ++ add r6, r6, r3 ++ vext.8 q3, q7, q12, #\pb ++ add r0, r0, r2 ++ // next c is mostly available in b ++ vext.8 d14, d22, d23, #\pb ++ vldr d15, [r1, #56] ++ vext.8 q4, q8, q9, #\pb ++ add r1, r1, r3 ++ vext.8 q5, q9, q10, #\pb ++ vext.8 q6, q10, q11, #\pb ++ b 1b ++.endm ++ ++.macro edge_32bx2_e3, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ // load a and first 32b of c ++ vldmia r1, {d8-d12} ++ vldmia r6, {d24-d28} ++ vext.8 q2, q4, q5, #\pb ++ add r6, r6, r3, lsl #1 ++ vext.8 q3, q5, q6, #\pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q0, q12, q13, #\pb ++ vext.8 q1, q13, q14, #\pb ++1: ++ // load second 32b of c and second 32b of b ++ vldr d25, [r6, #-8] ++ subs r12, #2 ++ vldmia r6, {d12-d15} ++ vldr d27, [r1, #-8] ++ vldmia r1, {d20-d23} ++ // first 32b of b is mostly available in second 32b of c ++ vext.8 q8, q12, q6, #16 - \pb ++ 
vext.8 q9, q6, q7, #16 - \pb ++ vext.8 q11, q10, q11, #16 - \pb ++ vext.8 q10, q13, q10, #16 - \pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ ble 2f ++ ++ vldr d24, [r6, #32] ++ add r6, r6, r3, lsl #1 ++ vldr d11, [r1, #24] ++ vext.8 d10, d22, d23, #\pb ++ vldr d30, [r1, #32] ++ add r1, r1, r3, lsl #1 ++ // first 32b of a is mostly available in second 32b of c ++ vext.8 q0, q6, q7, #\pb ++ vext.8 q1, q7, q12, #\pb ++ // first 32b of c is mostly available in second 32b of b ++ vext.8 q4, q10, q11, #\pb ++ // second 32b of a is mostly available in first 32b of c ++ vext.8 q3, q5, q15, #\pb ++ vext.8 q2, q4, q5, #\pb ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_16b_e3, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ vld1.8 {q1}, [r1, :128], r3 ++ vldmia r6, {d18-d20} ++ add r6, r6, r3 ++ ++1: vldr d5, [r1, #-8] ++ vld1.8 {q3}, [r1, :128] ++ subs r12, #1 ++ vext.8 q0, q9, q10, #\pb ++ vext.8 q2, q2, q3, #16 - \pb ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ ble 2f ++ vmov q9, q1 ++ vldr d3, [r1, #8] ++ add r1, r1, r3 ++ vldr d20, [r6, #16] ++ add r6, r6, r3 ++ vext.8 d2, d4, d5, #\pb ++ b 1b ++ ++2: pop {pc} ++.endm ++ ++.macro edge_8bx2_e3, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ vld1.8 {d18-d19}, [r6] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #8] ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldr d4, [r6, #-8] ++ vldr d3, [r6] ++ vldr d21, [r1, #-8] ++ vldr d22, [r1] ++ ++1: vext.8 d0, d18, d19, #\pb ++ vext.8 d4, d4, d3, #8 - \pb ++ vext.8 d1, d2, d20, #\pb ++ subs r12, #2 ++ vext.8 d5, d21, d22, #8 - \pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ble 2f ++ ++ vldr d19, [r6, #8] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #8] ++ vmov d18, d3 ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldr d4, [r6, #-8] ++ vldr d3, [r6] ++ vldr d21, [r1, #-8] ++ vldr d22, [r1] ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_4bx4_e2 \body_fn, (-\pb) ++.endm ++ ++@ Jump table entry - if in neon mode the bottom bit must be set ++@ ? 
There is probably a real asm instruction to do this but I haven't found it ++.macro jent lab ++.if jent_pic ++@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is ++@ simpler and clearer in the code to stick with .word ++T .word (0 + \lab) - (4 + 98b) ++A .word (0 + \lab) - (8 + 98b) ++.else ++T .word 1 + \lab ++A .word \lab ++.endif ++.endm ++ ++.macro edge_64b_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_64b_e0 \body_fn, \pb ++10: edge_64b_e1 \body_fn ++20: edge_64b_e2 \body_fn, \pb ++30: edge_64b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_32bx2_e0 \body_fn, \pb ++10: edge_32bx2_e1 \body_fn ++20: edge_32bx2_e2 \body_fn, \pb ++30: edge_32bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_16b_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_32bx2_e0 \body_fn_64b, \pb ++10: edge_32bx2_e1 \body_fn_64b ++20: edge_32bx2_e2 \body_fn_64b, \pb ++30: edge_32bx2_e3 \body_fn_64b, \pb ++5: edge_16b_e0 \body_fn_16b, \pb ++15: edge_16b_e1 \body_fn_16b ++25: edge_16b_e2 \body_fn_16b, \pb ++35: edge_16b_e3 \body_fn_16b, \pb ++.endm ++ ++.macro edge_16b_8bx2_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++5: edge_8bx2_e0 \body_fn, \pb ++15: edge_8bx2_e1 \body_fn ++25: edge_8bx2_e2 \body_fn, \pb ++35: edge_8bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_8bx2_4bx4_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_8bx2_e0 \body_fn, \pb ++10: edge_8bx2_e1 \body_fn ++20: edge_8bx2_e2 \body_fn, \pb ++30: edge_8bx2_e3 \body_fn, \pb ++5: edge_4bx4_e0 \body_fn, \pb ++15: edge_4bx4_e1 \body_fn ++25: edge_4bx4_e2 \body_fn, \pb ++35: edge_4bx4_e3 \body_fn, \pb ++.endm ++ ++@ void ff_hevc_rpi_sao_edge_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_8_neon_8, export=1 ++ edge_16b_init 8, 0, 1, 99f ++99: ++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_16_neon_8, export=1 ++ edge_16b_init 8, 0, 0, 99f ++99: ++ edge_16b_bodies edge_16b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_32_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_64_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] 
++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_64_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 1 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1 ++ edge_16b_init 8, 1, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_8, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_8_neon_10, export=1 ++ edge_16b_init 10, 0, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_16_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_64_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++@ We simply split the 64 case into 2 vertical stripes ++@ and call the fns for w32 ++@ ++@ Calling code will always have src != dst so we don't have to worry ++@ about edge effects ++ ++function ff_hevc_rpi_sao_edge_64_neon_10, export=1 ++ edge_64b_init 10, 0, 1, 99f, xjump=1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_32_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1 ++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 ++99: ++ edge_32bx2_16b_bodies edge_64b_body_16, 
edge_16b_body_16, 4 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1 ++ edge_64b_init 10, 1, 1, 99f, xjump=1 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_edge_c_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1 ++ edge_64b_init 10, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 4 ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h +new file mode 100644 +index 0000000000..36a23a5bf9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_arm.h +@@ -0,0 +1,28 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H ++#define AVCODEC_ARM_HEVCPRED_ARM_H ++ ++#include "libavcodec/rpi_hevcpred.h" ++ ++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); ++ ++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c +new file mode 100644 +index 0000000000..80724d4cf3 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/cpu.h" ++#include "libavutil/arm/cpu.h" ++ ++#include "libavcodec/rpi_hevcpred.h" ++#include "rpi_hevcpred_arm.h" ++ ++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevc_rpi_pred_init_neon(c, bit_depth); ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c +new file mode 100644 +index 0000000000..21e7700174 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c +@@ -0,0 +1,210 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcpred_arm.h" ++ ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; ++ ++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t 
*left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void 
ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, 
const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) ++{ ++ switch (bit_depth) ++ { ++ case 8: ++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; ++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; ++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; ++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; ++ break; ++ case 10: ++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; ++ ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; ++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; ++ c->pred_angular_c[2] = 
ff_hevc_rpi_pred_angular_c_16_neon_10; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; ++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10; ++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; ++ break; ++ default: ++ break; ++ } ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +new file mode 100644 +index 0000000000..fa8f67cf03 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +@@ -0,0 +1,2984 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++/* ++ * General angular pred ++ * ++ * Horizontal (10) & Vertical (26) cases have their own file ++ * and are not dealt with properly here (luma filtering is missing) ++ * ++ * The inv_angle calculations are annoying - if it wasn't for the +128 ++ * rounding step then the result would simply be the loop counter :-( ++ */ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.text ++ ++@ Horizontal Patch functions ++@ These need a transpose before store so exist as smaller patches ++@ Patches can be called repeatedly without any intermediate setup ++@ to generate a horizontal block ++@ ++@ It is almost certainly the case that larger patch fns can be built ++@ and they would be a little faster, but we would still need the small ++@ fns and code size (or at least instruction cache size) is an issue ++@ given how much code we already have here ++ ++@ Generate 8x8 luma 8 patch ++@ ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) ++@ r2 Left ptr - updated ++@ r10 Inv angle accumulator (_up only) ++@ r12 32 - angle frac (_down) or angle frac (_up) ++@ d0 Older reference samples ++@ d1=r8+r9 Newer reference samples ++@ d2 32 - angle frac ++@ d3 Angle frac ++@ q2 Partially computed next result (_up only) ++@ ++@ Temps ++@ r5 Loop counter ++@ r6 ++@ r7 (_down only) ++@ r11 (_up only) ++@ q2, q8-q11 ++ ++patch_h_down_8x8_8: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ lsr r8, #8 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #24 ++ ldr r9, [r2, #5]! ++ vmov d1, r8, r9 ++ // drop through... ++patch_h_down_8x8_8_continue: ++ mov r5, #8 ++1: ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.8 q8, q8, q9, #8 ++ itt mi ++ lsrmi r7, r8, #8 ++ vmovmi d0, r8, r9 ++ vdup.8 d2, r12 ++ vext.8 q9, q9, q10, #8 ++ it mi ++ orrmi r8, r7, r9, lsl #24 ++ vext.8 q10, q10, q11, #8 ++ it mi ++ ldrmi r9, [r2, #1]! ++ vmov d22, d23 ++ vrshrn.u16 d23, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_8x8_8: ++ vzip.8 d16, d17 ++ add r6, r0, r3 ++ vzip.8 d18, d19 ++ lsl r3, #1 ++ vzip.8 d20, d21 ++ add r5, r0, r3 ++ vzip.8 d22, d23 ++ vzip.16 q8, q9 ++ vzip.16 q10, q11 ++ vzip.32 q8, q10 ++ vzip.32 q9, q11 ++ vst1.8 {d16}, [r0]! ++ vst1.8 {d17}, [r6], r3 ++ vst1.8 {d20}, [r5], r3 ++ vst1.8 {d21}, [r6], r3 ++ vst1.8 {d18}, [r5], r3 ++ vst1.8 {d19}, [r6], r3 ++ vst1.8 {d22}, [r5] ++ asr r3, #1 ++ vst1.8 {d23}, [r6] ++ ++ bx lr ++ ++patch_h_up_8x8_8: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r4 ++ lsr r11, r8, #24 ++ vdup.8 d2, r6 ++ ldr r8, [r2, #-1]! 
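++@ Build d1 = left[-1..6], i.e. d0 shifted one sample towards the
++@ top-left: r8 has just been reloaded as left[-1..2], and the orr
++@ below rebuilds left[3..6] from the byte saved earlier in r11.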
++ orr r9, r11, r9, lsl #8 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++patch_h_up_8x8_8_continue: ++ mov r5, #8 ++1: ++ add r12, r4 ++ mov r11, #0 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ vext.8 q8, q8, q9, #8 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #8 ++ it cs ++ vmovcs d0, r8, r9 ++ vdup.8 d2, r6 ++ it cs ++ lsrcs r6, r8, #24 ++ vext.8 q9, q9, q10, #8 ++ itt cs ++ orrcs r9, r6, r9, lsl #8 ++ ldrbcs r11, [r1, r11] ++ vdup.8 d3, r12 ++ vext.8 q10, q10, q11, #8 ++ it hi ++ ldrbhi r11, [r2, #-1]! ++ vmov d22, d23 ++ vrshrn.u16 d23, q2, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #8 ++ vmovcs d1, r8, r9 ++ vmull.u8 q2, d0, d2 ++ subs r5, #1 ++ vmlal.u8 q2, d1, d3 ++ bne 1b ++ ++ b store_tran_8x8_8 ++ ++ ++.macro ADRT reg, val ++@ adr in T32 has enough range but not in A32 ++A adrl \reg, \val ++T adr \reg, \val ++.endm ++ ++@ ff_hevc_rpi_pred_angular_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r8, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ ldr lr, [r2], #1 @ Top ++ rsb r12, r6, #32 ++ vmov s0, lr ++ vdup.8 d3, r6 ++ ldr lr, [r2], #1 ++ vdup.8 d2, r12 ++ vmov s2, lr ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r2], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ vdup.8 d3, r6 ++ mov r5, #2 ++1: ++ vrshrn.u16 d20, q2, #5 ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.64 q8, q8, q9, #1 ++ it mi ++ vmovmi s0, lr ++ vext.64 q9, q9, q10, #1 ++ it mi ++ ldrmi lr, [r2], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ ++ vrshrn.u16 d20, q2, #5 ++ vmull.u8 q2, d0, d2 ++ add r12, r0, r3 ++ vmlal.u8 q2, d1, d3 ++ lsl r3, #1 ++ vext.64 q8, q8, q9, #1 ++ vext.64 q9, q9, q10, #1 ++ vrshrn.u16 d20, q2, #5 ++ ++98: ++ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3 ++ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3 ++ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0] ++ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12] ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ rsb r12, r6, #32 ++ ldr lr, [r2] @ Left ++ ldrb r2, [r2, #-1] @ Top-left ++ vmov s0, lr ++ vdup.8 d2, r12 ++ vdup.8 d3, r6 ++ orr lr, r2, lr, lsl #8 ++ vmov s2, lr ++ sub r8, r7, #128 ++ mov r5, #3 ++2: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++T it mi ++ addmi r12, #32 ++T asr r6, r8, #8 ++T it mi ++T ldrbmi r2, [r1, r6] ++A ldrbmi r2, [r1, r8, asr #8] ++ rsb r6, r12, #32 ++ vdup.8 d2, r12 ++ ittt mi ++ vmovmi s0, lr ++ orrmi lr, r2, lr, lsl #8 ++ vmovmi s2, lr ++ vrshrn.u16 d20, q2, #5 ++ vdup.8 d3, r6 ++ it mi ++ addmi r8, r7 ++ subs r5, #1 ++ vext.64 q8, q8, q9, #1 ++ vext.64 q9, q9, q10, #1 ++ bne 2b ++ ++ vmull.u8 q2, d0, d2 ++ add r12, r0, r3 ++ vmlal.u8 q2, d1, d3 ++ lsl r3, #1 ++ vrshrn.u16 d20, q2, #5 ++ b 98b ++ ++@ Left of vertical - works down left ++18: ++ ldrh r7, [r7] ++ rsb r12, r6, #32 ++ ldr lr, [r1] @ Top ++ ldrb r1, [r2, #-1] @ Top-left ++ vmov s0, lr ++ 
vdup.8 d2, r12 ++ vdup.8 d3, r6 ++ orr lr, r1, lr, lsl #8 ++ vmov s2, lr ++ sub r8, r7, #128 ++ mov r5, #3 ++2: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++T it mi ++ addmi r12, #32 ++T asr r6, r8, #8 ++T it mi ++T ldrbmi r1, [r2, r6] ++A ldrbmi r1, [r2, r8, asr #8] ++ rsb r6, r12, #32 ++ vdup.8 d2, r12 ++ ittt mi ++ vmovmi s0, lr ++ orrmi lr, r1, lr, lsl #8 ++ vmovmi s2, lr ++ vrshrn.u16 d4, q2, #5 ++ vdup.8 d3, r6 ++ it mi ++ addmi r8, r7 ++ subs r5, #1 ++ vst1.32 {d4[0]}, [r0], r3 ++ bne 2b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.32 {d4[0]}, [r0] ++ ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldr lr, [r1], #1 @ Top ++ rsb r12, r6, #32 ++ vmov s0, lr ++ vdup.8 d3, r6 ++ ldr lr, [r1], #1 ++ vdup.8 d2, r12 ++ vmov s2, lr ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r1], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ vdup.8 d3, r6 ++ mov r5, #2 ++1: ++ vrshrn.u16 d6, q2, #5 ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vst1.32 {d6[0]}, [r0], r3 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r1], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ ++ vrshrn.u16 d6, q2, #5 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vst1.32 {d6[0]}, [r0], r3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.32 {d6[0]}, [r0] ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_8x8_8 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_8x8_8 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrb lr, [r2, #-1] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl r9, r9, #8 ++ vdup.8 d2, r12 ++ orr r9, r9, r8, lsr #24 ++ orr r8, lr, r8, lsl #8 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #7 ++1: ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ subs r12, r12, r4 ++ vmlal.u8 q2, d1, d3 ++ ittt mi ++ addmi lr, r2, r1, asr #8 ++ addmi r12, r12, #32 ++ vmovmi d0, r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #8 ++ ldrbmi lr, [lr] ++ vdup.8 d2, r12 ++ vrshrn.u16 d4, q2, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #24 ++ orrmi r8, lr, r8, lsl #8 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.8 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.8 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ mov r5, #7 ++ lsr r8, #8 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #24 ++ ldr r9, [r1, #5]! 
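++@ d0 holds top[0..7] and the vmov below sets d1 = top[1..8]; r1 now
++@ points at top[5] so the conditional ldr in the loop can pull in one
++@ further reference byte each time the angle fraction wraps.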
++ vmov d1, r8, r9 ++1: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #8 ++ vdup.8 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #24 ++ ldrmi r9, [r1, #1]! ++ vrshrn.u16 d6, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.8 d3, r6 ++ subs r5, #1 ++ vst1.8 {d6}, [r0], r3 ++ bne 1b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.8 {d6}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ ++ add r2, r1, #8 @ restore r2, but 8 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #3 ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #3 ++ sub r10, r10, r7, lsl #3 ++ ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q9}, [r1] ++ sub r1, r2, #1 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.8 d6, r6 ++ vext.8 q8, q9, q9, #15 ++ sub r8, r7, #128 ++ vld1.8 {d16[0]}, [r1] ++ vdup.8 d7, r12 ++ mov r5, #15 ++1: ++ vmull.u8 q0, d18, d7 ++ subs r12, r4 ++ vmlal.u8 q0, d16, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d19, d7 ++ it cc ++ addcc r1, r2, r8, asr #8 ++ vmlal.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vext.8 q10, q8, q8, #15 ++ sub r5, #1 ++ vld1.8 {d20[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d22, d7 ++ subs r12, r4 ++ vmlal.u8 q0, d20, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d23, d7 ++ it cc ++ addcc r1, r2, r8, asr #8 ++ vmlal.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vext.8 q8, q10, q10, #15 ++ sub r5, #1 ++ vld1.8 {d16[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d22, d7 ++ vmlal.u8 q0, d20, d6 ++ vmull.u8 q1, d23, d7 ++ vmlal.u8 q1, d21, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d18, d7 ++ vmlal.u8 q0, d16, d6 ++ vmull.u8 q1, d19, d7 ++ vmlal.u8 q1, d17, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vext.8 q8, q9, q9, #1 ++ vld1.8 {d17[7]}, [r1]! 
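++@ q9 = top[0..15] and q8 = top[1..16]: the pair of reference rows
++@ blended by the (32-frac)/frac multiplies in the loop below.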
++ mov r5, #15 ++1: ++ vmull.u8 q0, d16, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d18, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d19, d7 ++ sub r5, #1 ++ vext.8 q10, q8, q8, #1 ++ teq r5, #0 ++ vld1.8 {d21[7]}, [r1] ++ it cc ++ addcc r1, #1 ++ vmov q11, q8 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d20, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d22, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d23, d7 ++ sub r5, #1 ++ vext.8 q8, q10, q10, #1 ++ teq r5, #0 ++ vld1.8 {d17[7]}, [r1] ++ it cc ++ addcc r1, #1 ++ vmov q9, q10 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d20, d6 ++ vmlal.u8 q0, d22, d7 ++ vmull.u8 q1, d21, d6 ++ vmlal.u8 q1, d23, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d16, d6 ++ vmlal.u8 q0, d18, d7 ++ vmull.u8 q1, d17, d6 ++ vmlal.u8 q1, d19, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ bl patch_h_down_8x8_8_continue ++ bl patch_h_down_8x8_8_continue ++ ++ add r2, r1, #8 @ restore r2, but 8 rows further down left ++ add r1, r1, #8 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #3 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 ++1: ++ push {r2,r10} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ bl patch_h_up_8x8_8_continue ++ bl patch_h_up_8x8_8_continue ++ pop {r2,r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #3 ++ sub r10, r10, r7, lsl #3 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q0-q1}, [r1] ++ sub r9, r2, #1 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ mov r5, #32 ++1: ++ vld1.8 {d17[7]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ add r9, r2, r8, asr #8 ++ vext.8 q1, q0, q1, #15 ++ vext.8 q0, q8, q0, #15 ++2: ++ vmull.u8 q10, d4, d19 ++ subs r12, r4 ++ vmlal.u8 q10, d0, d18 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d5, d19 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d1, d18 ++ sub r5, #1 ++ vmull.u8 q12, d6, d19 ++ teq r5, #0 ++ vmlal.u8 q12, d2, d18 ++ vmull.u8 q13, d7, d19 ++ vmlal.u8 q13, d3, d18 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.8 {q0-q1}, [r1]! 
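++@ q0-q1 = top[0..31]; r5 was set to point at top[32] so the d16[0]
++@ load below can prime the extra sample needed for the shifted row.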
++ rsb r12, r6, #32 ++ vld1.8 {d16[0]}, [r5] ++ mov r5, #32 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++1: ++ vmov q2, q0 ++ add r1, #1 ++ vmov q3, q1 ++ vext.8 q0, q0, q1, #1 ++ vext.8 q1, q1, q8, #1 ++2: ++ vmull.u8 q10, d0, d18 ++ subs r12, r4 ++ vmlal.u8 q10, d4, d19 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d1, d18 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d5, d19 ++ sub r5, #1 ++ vmull.u8 q12, d2, d18 ++ teq r5, #0 ++ vmlal.u8 q12, d6, d19 ++ vmull.u8 q13, d3, d18 ++ vmlal.u8 q13, d7, d19 ++ vld1.8 {d16[0]}, [r1] ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ Chroma 8 bit 4x4 patch fns ++ .text ++ ++patch_h_down_c_4x4_8: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ lsr r8, #16 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r2, #6]! ++ vmov d1, r8, r9 ++ // drop through... ++patch_h_down_c_4x4_8_continue: ++ mov r5, #4 ++1: ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.8 q8, q8, q9, #8 ++ it mi ++ lsrmi r7, r8, #16 ++ vmov d18, d19 ++ it mi ++ vmovmi d0, r8, r9 ++ vdup.8 d2, r12 ++ it mi ++ orrmi r8, r7, r9, lsl #16 ++ vrshrn.u16 d19, q2, #5 ++ itt mi ++ ldrmi r9, [r2, #2]! ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_c_4x4_8: ++ vzip.16 d16, d17 ++ add r6, r0, r3 ++ vzip.16 d18, d19 ++ lsl r3, #1 ++ vzip.32 q8, q9 ++ add r5, r0, r3 ++ vst1.16 {d16}, [r0]! ++ vst1.16 {d17}, [r6], r3 ++ vst1.16 {d18}, [r5] ++ asr r3, #1 ++ vst1.16 {d19}, [r6] ++ ++ bx lr ++ ++patch_h_up_c_4x4_8: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r4 ++ lsr r11, r8, #16 ++ vdup.8 d2, r6 ++ ldr r8, [r2, #-2]! ++ orr r9, r11, r9, lsl #16 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++patch_h_up_c_4x4_8_continue: ++ mov r5, #4 ++1: ++ add r12, r4 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ mov r11, #0 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #7 ++ it cs ++ vmovcs d0, r8, r9 ++ it eq ++ biceq r11, #1 ++ vdup.8 d2, r6 ++ it cs ++ lsrcs r6, r8, #16 ++ vdup.8 d3, r12 ++ vext.8 q8, q8, q9, #8 ++ itt cs ++ orrcs r9, r6, r9, lsl #16 ++ ldrhcs r11, [r1, r11] ++ vmov d18, d19 ++ it hi ++ ldrhhi r11, [r2, #-2]! 
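++@ r11 now holds the next chroma (U,V) pair. r10 is the inv-angle
++@ accumulator (inv_angle is roughly 8192/angle, tabled below): when
++@ the angle fraction wraps (cs) the pair is fetched either from the
++@ projected second edge at byte offset r10 >> 7 forced even (eq), or,
++@ while the accumulator is still negative (hi), by stepping back
++@ along the main edge.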
++ vrshrn.u16 d19, q2, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #16 ++ vmovcs d1, r8, r9 ++ vmull.u8 q2, d0, d2 ++ subs r5, #1 ++ vmlal.u8 q2, d1, d3 ++ bne 1b ++ ++ b store_tran_c_4x4_8 ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_c_4x4_8 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_c_4x4_8 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrh lr, [r2, #-2] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl r9, r9, #16 ++ vdup.8 d2, r12 ++ orr r9, r9, r8, lsr #16 ++ orr r8, lr, r8, lsl #16 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #3 ++1: ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ subs r12, r12, r4 ++ vmlal.u8 q2, d1, d3 ++ itttt mi ++ addmi lr, r2, r1, asr #7 ++ bicmi lr, #1 ++ addmi r12, r12, #32 ++ vmovmi d0, r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #16 ++ ldrhmi lr, [lr] ++ vdup.8 d2, r12 ++ vrshrn.u16 d4, q2, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #16 ++ orrmi r8, lr, r8, lsl #16 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ mov r5, #3 ++ lsr r8, #16 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r1, #6]! ++ vmov d1, r8, r9 ++1: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #16 ++ vdup.8 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #16 ++ ldrmi r9, [r1, #2]! 
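++@ mi: the angle fraction underflowed, so the reference window slides
++@ on by one (U,V) pair - d0 takes the old d1 and a new pair is
++@ shifted into d1.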
++ vrshrn.u16 d6, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.8 d3, r6 ++ subs r5, #1 ++ vst1.16 {d6}, [r0], r3 ++ bne 1b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.16 {d6}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q9}, [r1] ++ sub r1, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.8 d6, r6 ++ vext.8 q8, q9, q9, #14 ++ sub r8, r7, #128 ++ vld1.16 {d16[0]}, [r1] ++ vdup.8 d7, r12 ++ mov r5, #7 ++1: ++ subs r12, r4 ++ vmull.u8 q0, d18, d7 ++ it cc ++ asrcc r1, r8, #8 ++ vmlal.u8 q0, d16, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d19, d7 ++ it cc ++ addcc r1, r2, r1, lsl #1 ++ vmlal.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vext.8 q10, q8, q8, #14 ++ sub r5, #1 ++ vld1.16 {d20[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ subs r12, r4 ++ vmull.u8 q0, d22, d7 ++ it cc ++ asrcc r1, r8, #8 ++ vmlal.u8 q0, d20, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d23, d7 ++ it cc ++ addcc r1, r2, r1, lsl #1 ++ vmlal.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vext.8 q8, q10, q10, #14 ++ sub r5, #1 ++ vld1.16 {d16[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d22, d7 ++ vmlal.u8 q0, d20, d6 ++ vmull.u8 q1, d23, d7 ++ vmlal.u8 q1, d21, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d18, d7 ++ vmlal.u8 q0, d16, d6 ++ vmull.u8 q1, d19, d7 ++ vmlal.u8 q1, d17, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vext.8 q8, q9, q9, #2 ++ vld1.16 {d17[3]}, [r1]! 
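++@ q9 = chroma pairs 0..7 and q8 = pairs 1..8 (the vext moves by two
++@ bytes, i.e. one interleaved U,V pair).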
++ mov r5, #7 ++1: ++ vmull.u8 q0, d16, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d18, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d19, d7 ++ sub r5, #1 ++ vext.8 q10, q8, q8, #2 ++ teq r5, #0 ++ vld1.16 {d21[3]}, [r1] ++ it cc ++ addcc r1, #2 ++ vmov q11, q8 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d20, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d22, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d23, d7 ++ sub r5, #1 ++ vext.8 q8, q10, q10, #2 ++ teq r5, #0 ++ vld1.16 {d17[3]}, [r1] ++ it cc ++ addcc r1, #2 ++ vmov q9, q10 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d20, d6 ++ vmlal.u8 q0, d22, d7 ++ vmull.u8 q1, d21, d6 ++ vmlal.u8 q1, d23, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d16, d6 ++ vmlal.u8 q0, d18, d7 ++ vmull.u8 q1, d17, d6 ++ vmlal.u8 q1, d19, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ bl patch_h_down_c_4x4_8_continue ++ bl patch_h_down_c_4x4_8_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 ++1: ++ push {r2, r10} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ bl patch_h_up_c_4x4_8_continue ++ bl patch_h_up_c_4x4_8_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q0-q1}, [r1] ++ sub r9, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ mov r5, #16 ++1: ++ vld1.16 {d17[3]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r9, r8, #8 ++ vext.8 q1, q0, q1, #14 ++ add r9, r2, r9, lsl #1 ++ vext.8 q0, q8, q0, #14 ++2: ++ vmull.u8 q10, d4, d19 ++ subs r12, r4 ++ vmlal.u8 q10, d0, d18 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d5, d19 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d1, d18 ++ sub r5, #1 ++ vmull.u8 q12, d6, d19 ++ teq r5, #0 ++ vmlal.u8 q12, d2, d18 ++ vmull.u8 q13, d7, d19 ++ vmlal.u8 q13, d3, d18 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - 
left unused ++26: ++ add r5, r1, #32 ++ vld1.8 {q0-q1}, [r1]! ++ rsb r12, r6, #32 ++ vld1.16 {d16[0]}, [r5] ++ mov r5, #16 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++1: ++ vmov q2, q0 ++ add r1, #2 ++ vmov q3, q1 ++ vext.8 q0, q0, q1, #2 ++ vext.8 q1, q1, q8, #2 ++2: ++ vmull.u8 q10, d0, d18 ++ subs r12, r4 ++ vmlal.u8 q10, d4, d19 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d1, d18 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d5, d19 ++ sub r5, #1 ++ vmull.u8 q12, d2, d18 ++ teq r5, #0 ++ vmlal.u8 q12, d6, d19 ++ vmull.u8 q13, d3, d18 ++ vmlal.u8 q13, d7, d19 ++ vld1.16 {d16[0]}, [r1] ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++@------------------------------------------------------------------------------ ++@ Data ++ ++ .text ++ .balign 64 ++angle_2: ++ .byte 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Sign inverted from standards table ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Standard sign ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ ++ .balign 2 ++ ++ @ Sign inverted from standards table ++inv_angle: ++ .short 4096, 1638, 910, 630, 482, 390, 315 ++ .short 256 ++ .short 315, 390, 482, 630, 910, 1638, 4096 ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 bit fns ++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code ++@ but runs out of register width for 12+ bit ++ ++ .text ++ .balign 64 ++ ++patch_h_down_4x4_10: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r6 ++ lsr r8, #16 ++ vdup.16 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r2, #6]! ++ vmov d1, r8, r9 ++ // drop through... ++patch_h_down_4x4_10_continue: ++ mov r5, #4 ++1: ++ subs r12, r4 ++ vmul.u16 d4, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmla.u16 d4, d1, d3 ++ rsb r6, r12, #32 ++ vext.16 q8, q8, q9, #4 ++ it mi ++ lsrmi r7, r8, #16 ++ vmov d18, d19 ++ it mi ++ vmovmi d0, r8, r9 ++ vdup.16 d2, r12 ++ it mi ++ orrmi r8, r7, r9, lsl #16 ++ vrshr.u16 d19, d4, #5 ++ itt mi ++ ldrmi r9, [r2, #2]! ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.16 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_4x4_10: ++ vzip.16 d16, d17 ++ add r6, r0, r3 ++ vzip.16 d18, d19 ++ lsl r3, #1 ++ vzip.32 q8, q9 ++ add r5, r0, r3 ++ vst1.16 {d16}, [r0]! ++ vst1.16 {d17}, [r6], r3 ++ vst1.16 {d18}, [r5] ++ asr r3, #1 ++ vst1.16 {d19}, [r6] ++ ++ bx lr ++ ++patch_h_up_4x4_10: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r4 ++ lsr r11, r8, #16 ++ vdup.16 d2, r6 ++ ldr r8, [r2, #-2]! ++ orr r9, r11, r9, lsl #16 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmul.u16 d4, d0, d2 ++ vmla.u16 d4, d1, d3 ++patch_h_up_4x4_10_continue: ++ mov r5, #4 ++1: ++ add r12, r4 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ mov r11, #0 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #7 ++ it cs ++ vmovcs d0, r8, r9 ++ it eq ++ biceq r11, #1 ++ vdup.16 d2, r6 ++ it cs ++ lsrcs r6, r8, #16 ++ vdup.16 d3, r12 ++ vext.16 q8, q8, q9, #4 ++ itt cs ++ orrcs r9, r6, r9, lsl #16 ++ ldrhcs r11, [r1, r11] ++ vmov d18, d19 ++ it hi ++ ldrhhi r11, [r2, #-2]! 
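++@ Same reference fetch as the 8-bit patch fns, but each sample is
++@ 16 bits: r11 is either the inv-angle byte offset into the second
++@ edge (eq) or the next sample stepping back along the main edge (hi).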
++ vrshr.u16 d19, d4, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #16 ++ vmovcs d1, r8, r9 ++ vmul.u16 d4, d0, d2 ++ subs r5, #1 ++ vmla.u16 d4, d1, d3 ++ bne 1b ++ ++ b store_tran_4x4_10 ++ ++ ++@ ff_hevc_rpi_pred_angular_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_4x4_10 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_4x4_10 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrh lr, [r2, #-2] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl r9, r9, #16 ++ vdup.16 d2, r12 ++ orr r9, r9, r8, lsr #16 ++ orr r8, lr, r8, lsl #16 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #3 ++1: ++ sel lr, lr, lr @ force pipeline 0 on Cortex-A53 ++ vdup.16 d3, r6 ++ vmul.u16 d4, d0, d2 ++ subs r12, r12, r4 ++ vmla.u16 d4, d1, d3 ++ itttt mi ++ addmi lr, r2, r1, asr #7 ++ bicmi lr, #1 ++ addmi r12, r12, #32 ++ vmovmi d0, r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #16 ++ ldrhmi lr, [lr] ++ vdup.16 d2, r12 ++ vrshr.u16 d4, d4, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #16 ++ orrmi r8, lr, r8, lsl #16 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.16 d3, r6 ++ nop @ force next insn into pipeline 0 to enable ++ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53 ++ vmla.u16 d4, d1, d3 ++ vrshr.u16 d4, d4, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r6 ++ lsr r8, #16 ++ vdup.16 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r1, #6]! ++ vmov d1, r8, r9 ++ mov r5, #3 ++1: ++ vmul.u16 d4, d0, d2 ++ subs r12, r4 ++ vmla.u16 d4, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #16 ++ vdup.16 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #16 ++ ldrmi r9, [r1, #2]! 
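++@ mi: fraction underflowed - slide the reference window on by one
++@ 16-bit sample (d0 takes the old d1, d1 gains the next top sample).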
++ vrshr.u16 d4, d4, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.16 d3, r6 ++ subs r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vmul.u16 d4, d0, d2 ++ vmla.u16 d4, d1, d3 ++ vrshr.u16 d4, d4, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q9}, [r1] ++ sub r1, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.16 q2, r6 ++ vext.16 q8, q9, q9, #7 ++ sub r8, r7, #128 ++ vld1.16 {d16[0]}, [r1] ++ vdup.16 q3, r12 ++ mov r5, #7 ++1: ++ vmul.u16 q0, q9, q3 ++ subs r12, r4 ++ vmla.u16 q0, q8, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #1 ++ vext.16 q10, q8, q8, #7 ++ rsb r6, r12, #32 ++ vmov q11, q8 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.16 {d20[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q11, q3 ++ subs r12, r4 ++ vmla.u16 q0, q10, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #1 ++ vext.16 q8, q10, q10, #7 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.16 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q11, q3 ++ vmla.u16 q0, q10, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q9, q3 ++ vmla.u16 q0, q8, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vext.16 q8, q9, q9, #1 ++ vld1.16 {d17[3]}, [r1]! 
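++@ q9 = top[0..7] and q8 = top[1..8] in 16-bit samples: the row pair
++@ combined by the vmul/vmla weighting below.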
++ mov r5, #7 ++1: ++ vmul.u16 q0, q8, q2 ++ subs r12, r4 ++ vmla.u16 q0, q9, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q10, q8, q8, #1 ++ rsb r6, r12, #32 ++ vld1.16 {d21[3]}, [r1] ++ sub r5, #1 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #2 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q10, q2 ++ subs r12, r4 ++ vmla.u16 q0, q11, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q8, q10, q10, #1 ++ rsb r6, r12, #32 ++ vld1.16 {d17[3]}, [r1] ++ sub r5, #1 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #2 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q10, q2 ++ vmla.u16 q0, q11, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q8, q2 ++ vmla.u16 q0, q9, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 ++1: ++ push {r2, r10} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q0-q1}, [r1] ++ sub r9, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ mov r5, #16 ++1: ++ vld1.16 {d17[3]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r9, r8, #8 ++ vext.16 q1, q0, q1, #7 ++ add r9, r2, r9, lsl #1 ++ vext.16 q0, q8, q0, #7 ++2: ++ vmul.u16 q11, q2, q10 ++ subs r12, r4 ++ vmla.u16 q11, q0, q9 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q3, q10 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q1, q9 ++ sub r5, #1 ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q0-q1}, [r1]! 
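++ @ q0-q1 hold top[0..15]; d16[0] below keeps the next incoming pel so
++ @ the two-quad window can slide one pel per fraction wrap.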
++ rsb r12, r6, #32 ++ vld1.16 {d16[0]}, [r5] ++ mov r5, #16 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++1: ++ vmov q2, q0 ++ add r1, #2 ++ vmov q3, q1 ++ vext.16 q0, q0, q1, #1 ++ vext.16 q1, q1, q8, #1 ++2: ++ vmul.u16 q11, q0, q9 ++ subs r12, r4 ++ vmla.u16 q11, q2, q10 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q1, q9 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q3, q10 ++ sub r5, #1 ++ vld1.16 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ vpush {d8} ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ add sp, #8 ++ mov r10, #8 ++ mov r1, r2 ++1: ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ add sp, #8 ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<6 ++1: ++ push {r2, r10} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #64 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ add r5, r1, #32 ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ sub r9, r2, #2 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vmov d0, d9 ++ vmov s2, r12 ++ add r10, r0, #32 ++ mov r5, #32 ++1: ++ vld1.16 {d1[3]}, [r9] ++ add r8, r7 ++ vmov q11, q4 ++ vmov q10, q3 ++ asr r9, r8, #8 ++ vmov q9, q2 ++ add r9, r2, r9, lsl #1 ++ vmov q8, q1 ++ vext.16 q4, q3, q4, #7 ++ vext.16 q3, q2, q3, #7 ++ vext.16 q2, q1, q2, #7 ++ vext.16 q1, q0, q1, #7 ++2: ++ vmul.u16 q12, q8, d1[1] ++ adds r12, r4 ++ vmla.u16 q12, q1, d1[0] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q9, d1[1] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q2, d1[0] ++ sub r5, #1 ++ vmul.u16 q14, q10, d1[1] ++ teq r5, #0 ++ vmla.u16 q14, q3, d1[0] ++ vmul.u16 q15, q11, d1[1] ++ vmla.u16 q15, q4, d1[0] ++ vmov s2, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d0 ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, 
[r5] ++ add r1, r1, #64 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ vmov d1, d9 ++ vmov s1, r12 ++ add r10, r0, #32 ++ mov r5, #32 ++1: ++ vld1.16 {d0[0]}, [r1]! ++ vmov q8, q1 ++ vmov q9, q2 ++ vmov q10, q3 ++ vmov q11, q4 ++ vext.16 q1, q1, q2, #1 ++ vext.16 q2, q2, q3, #1 ++ vext.16 q3, q3, q4, #1 ++ vext.16 q4, q4, q0, #1 ++2: ++ vmul.u16 q12, q1, d0[2] ++ adds r12, r4 ++ vmla.u16 q12, q8, d0[3] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q2, d0[2] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q9, d0[3] ++ sub r5, #1 ++ vmul.u16 q14, q3, d0[2] ++ teq r5, #0 ++ vmla.u16 q14, q10, d0[3] ++ vmul.u16 q15, q4, d0[2] ++ vmla.u16 q15, q11, d0[3] ++ vmov s1, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d1 ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++ ++@ Generate 4x4 chroma patch ++@ ++@ In (const) ++@ r1 Up ptr (_up only) ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) ++@ r2 Left ptr - updated ++@ r6 Angle frac (init to r4 + 32) ++@ r8 Inv angle accumulator ++@ q2 Cur Line - load before 1st call for down - set by _up ++@ q8 Cur Line - load before 1st call for up - set by _down ++@ ++@ Temps ++@ r5 Loop counter ++@ r12 ++@ d0, q1, q12-q15 ++ ++patch_h_down_c_4x4_10: ++ vld1.16 {q12}, [r2]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ mov r5, #4 ++1: ++ vmov q13, q12 ++ vext.16 q12, q12, q12, #2 ++ vld1.32 {d25[1]}, [r2]! ++patch_h_down_c_4x4_10_continue: ++2: ++ vmov q8, q9 ++ subs r12, r4 ++ vmul.u16 q0, q13, q3 ++ it cc ++ addcc r12, #32 ++ vmla.u16 q0, q12, q2 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vmov q10, q11 ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vrshr.u16 q11, q0, #5 ++ bhi 2b ++ bne 1b ++ ++ bcs 3f ++ vmov q13, q12 ++ vext.16 q12, q12, q12, #2 ++ vld1.32 {d25[1]}, [r2]! ++3: ++ ++store_tran_c_4x4_10: ++T add r6, r0, r3 ++ vzip.32 q8, q10 ++A add r6, r0, r3 ++T lsl r3, #1 ++ vzip.32 q9, q11 ++A add r5, r0, r3, lsl #1 ++T add r5, r0, r3 ++ vst2.32 {d16,d18}, [r0]! ++A lsl r3, #1 ++ vst2.32 {d17,d19}, [r6], r3 ++ asr r3, #1 ++ vst2.32 {d20,d22}, [r5] ++ mov r5, #4 ++ vst2.32 {d21,d23}, [r6] ++ bx lr ++ ++patch_h_up_c_4x4_10: ++ vld1.16 {q1}, [r2] ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ mov r5, #4 ++1: ++ adds r8, r7 ++ vmov q12, q1 ++ it mi ++ ldrmi r6, [r2, #-4]! ++ vext.16 q1, q1, q1, #6 ++ itt pl ++ asrpl r6, r8, #8 ++ ldrpl r6, [r1, r6, lsl #2] ++ vmov s4, r6 ++patch_h_up_c_4x4_10_continue: ++2: ++ vmov q8, q9 ++ subs r12, r4 ++ vmul.u16 q0, q12, q3 ++ it cc ++ addcc r12, #32 ++ vmla.u16 q0, q1, q2 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vmov q10, q11 ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vrshr.u16 q11, q0, #5 ++ bhi 2b ++ bne 1b ++ ++ bcs store_tran_c_4x4_10 ++ adds r8, r7 ++ vmov q12, q1 ++ it mi ++ ldrmi r6, [r2, #-4]! 
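++ @ While the r8 inv-angle accumulator is negative the next pel pair
++ @ comes from Left (r2, pre-decremented); once it turns non-negative
++ @ it comes from Up, indexed by r8 >> 8.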
++ vext.16 q1, q1, q1, #6 ++ itt pl ++ asrpl r6, r8, #8 ++ ldrpl r6, [r1, r6, lsl #2] ++ vmov s4, r6 ++ b store_tran_c_4x4_10 ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r8, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++ bl patch_h_up_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q9}, [r1] ++ sub r1, r2, #4 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.16 q2, r6 ++ vext.16 q8, q9, q9, #6 ++ sub r8, r7, #128 ++ vld1.32 {d16[0]}, [r1] ++ vdup.16 q3, r12 ++ mov r5, #3 ++1: ++ vmul.u16 q0, q9, q3 ++ subs r12, r4 ++ vmla.u16 q0, q8, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #2 ++ vext.16 q10, q8, q8, #6 ++ rsb r6, r12, #32 ++ vmov q11, q8 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.32 {d20[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q11, q3 ++ subs r12, r4 ++ vmla.u16 q0, q10, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #2 ++ vext.16 q8, q10, q10, #6 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.32 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q11, q3 ++ vmla.u16 q0, q10, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q9, q3 ++ vmla.u16 q0, q8, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vext.16 q8, q9, q9, #2 ++ vld1.32 {d17[1]}, [r1]! 
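++ @ Chroma: each pel is a U/V pair, so window shifts move two 16-bit
++ @ lanes at a time (vext #2) and the stride was scaled by 4 above.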
++ mov r5, #3 ++1: ++ vmul.u16 q0, q8, q2 ++ subs r12, r4 ++ vmla.u16 q0, q9, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q10, q8, q8, #2 ++ rsb r6, r12, #32 ++ vld1.32 {d21[1]}, [r1] ++ sub r5, #1 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #4 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q10, q2 ++ subs r12, r4 ++ vmla.u16 q0, q11, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q8, q10, q10, #2 ++ rsb r6, r12, #32 ++ vld1.32 {d17[1]}, [r1] ++ sub r5, #1 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #4 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q10, q2 ++ vmla.u16 q0, q11, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q8, q2 ++ vmla.u16 q0, q9, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r8, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10_continue ++ ++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left ++ sub r0, #32 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10_continue ++ ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ pop {r2, r8} ++ ++ sub r0, #32 ++ mov r6, r4 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ ++ pop {r4-r8, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q0-q1}, [r1] ++ sub r9, r2, #4 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ mov r5, #8 ++1: ++ vld1.32 {d17[1]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r9, r8, #8 ++ vext.16 q1, q0, q1, #6 ++ add r9, r2, r9, lsl #2 ++ vext.16 q0, q8, q0, #6 ++2: ++ vmul.u16 q11, q2, q10 ++ subs r12, r4 ++ vmla.u16 q11, q0, q9 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q3, q10 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q1, q9 ++ sub r5, #1 ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q0-q1}, [r1]! 
++ rsb r12, r6, #32 ++ vld1.32 {d16[0]}, [r5] ++ mov r5, #8 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++1: ++ vmov q2, q0 ++ add r1, #4 ++ vmov q3, q1 ++ vext.16 q0, q0, q1, #2 ++ vext.16 q1, q1, q8, #2 ++2: ++ vmul.u16 q11, q0, q9 ++ subs r12, r4 ++ vmla.u16 q11, q2, q10 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q1, q9 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q3, q10 ++ sub r5, #1 ++ vld1.32 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r10, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ vpush {d8} ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ add sp, #8 ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10_continue ++ bl patch_h_down_c_4x4_10_continue ++ bl patch_h_down_c_4x4_10_continue ++ ++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*4 ++ mov r6, r4 ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ add sp, #8 ++ mov r10, #4 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ bl patch_h_up_c_4x4_10_continue ++ bl patch_h_up_c_4x4_10_continue ++ pop {r2, r8} ++ ++ sub r0, #64 ++ mov r6, r4 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ subs r10, #1 ++ bne 2b ++ ++ pop {r4-r10, pc} ++ ++@ Left of vertical - works down left ++18: ++ add r5, r1, #32 ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ sub r9, r2, #4 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vmov d0, d9 ++ vmov s2, r12 ++ add r10, r0, #32 ++ mov r5, #16 ++1: ++ vld1.32 {d1[1]}, [r9] ++ add r8, r7 ++ vmov q11, q4 ++ vmov q10, q3 ++ asr r9, r8, #8 ++ vmov q9, q2 ++ add r9, r2, r9, lsl #2 ++ vmov q8, q1 ++ vext.16 q4, q3, q4, #6 ++ vext.16 q3, q2, q3, #6 ++ vext.16 q2, q1, q2, #6 ++ vext.16 q1, q0, q1, #6 ++2: ++ vmul.u16 q12, q8, d1[1] ++ adds r12, r4 ++ vmla.u16 q12, q1, d1[0] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q9, d1[1] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q2, d1[0] ++ sub r5, #1 ++ vmul.u16 q14, q10, d1[1] ++ teq r5, #0 ++ vmla.u16 q14, q3, d1[0] ++ vmul.u16 q15, q11, d1[1] ++ vmla.u16 q15, q4, d1[0] ++ vmov s2, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d0 ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ add r1, r1, #64 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ vmov d1, d9 ++ vmov s1, r12 ++ add r10, r0, #32 ++ mov r5, #16 ++1: ++ vld1.32 {d0[0]}, [r1]! 
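++ @ d0[0] pre-loads the next U/V pair; q1-q4 then slide one chroma pel
++ @ (two lanes) along the top row for this outer iteration.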
++ vmov q8, q1 ++ vmov q9, q2 ++ vmov q10, q3 ++ vmov q11, q4 ++ vext.16 q1, q1, q2, #2 ++ vext.16 q2, q2, q3, #2 ++ vext.16 q3, q3, q4, #2 ++ vext.16 q4, q4, q0, #2 ++2: ++ vmul.u16 q12, q1, d0[2] ++ adds r12, r4 ++ vmla.u16 q12, q8, d0[3] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q2, d0[2] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q9, d0[3] ++ sub r5, #1 ++ vmul.u16 q14, q3, d0[2] ++ teq r5, #0 ++ vmla.u16 q14, q10, d0[3] ++ vmul.u16 q15, q4, d0[2] ++ vmla.u16 q15, q11, d0[3] ++ vmov s1, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d1 ++ pop {r4-r10, pc} ++ ++endfunc +diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +new file mode 100644 +index 0000000000..df8c1c25b9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +@@ -0,0 +1,705 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ ldr r2, [r2] ++ vld1.32 {d0[0]}, [r1] ++ mov r1, #2 ++ vmov s1, r2 ++ vmov s2, r2 ++ vmov.i16 q2, #3 ++ add r2, r0, r3 ++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] ++ lsl r3, #1 ++ vmovl.u8 q0, d0 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
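++ @ Scalar equivalent of the arithmetic below:
++ @   dc = (sum(top[0..3]) + sum(left[0..3]) + 4) >> 3
++ @   src[0][0] = (top[0] + left[0] + 2*dc + 2) >> 2
++ @   src[0][x] = (top[x] + 3*dc + 2) >> 2, x = 1..3
++ @   src[y][0] = (left[y] + 3*dc + 2) >> 2, y = 1..3
++ @   src[y][x] = dc otherwise
++ @ The {2, 3, 3, 3...} multiplier vector in d4 supplies the 2*dc and
++ @ 3*dc terms in a single vmla.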
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.32 {d0[0]}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d1, d0, #5*8 ++ vshr.u64 d2, d0, #6*8 ++ vshr.u64 d3, d0, #7*8 ++ vbif d1, d6, d7 ++ vbif d2, d6, d7 ++ vst1.32 {d1[0]}, [r2], r3 ++ vbif d3, d6, d7 ++ vst1.32 {d2[0]}, [r0] ++ vst1.32 {d3[0]}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ vld1.8 {d1}, [r2] ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T lsl r3, #1 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshrn.u16 d0, q1, #3 ++ ++ @ Store ++ vst1.8 {d0}, [r0], r3 ++ vst1.8 {d0}, [r2], r3 ++ vst1.8 {d0}, [r0] ++ vst1.8 {d0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ mov r1, #2 ++ vld1.8 {d16}, [r2] ++ vmov.i16 q2, #3 ++ vmov.i64 d7, #0xffff ++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] ++ vmovl.u8 q0, d0 ++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmov.i64 d7, #0xff
++ vmovl.u8 q1, d16
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4
++ vmla.i16 q1, q2, d6[0]
++ vmla.i16 q0, q2, d6[0]
++ vdup.8 d6, d6[0]
++ vrshrn.i16 d2, q1, #2
++ vrshrn.i16 d0, q0, #2
++
++ @ Store top line
++ vst1.8 {d0}, [r0], r3
++
++ @ Store the rest
++ vshr.u64 d2, #8
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ mov r1, #6
++1:
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ subs r1, #2
++ vbit d6, d2, d7
++ vshr.u64 d2, #8
++ vst1.8 {d6}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0}, [r1]
++ mov r1, #8
++ vld1.8 {q1}, [r2]
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1
++A add r2, r0, r3, lsl #1
++A lsl r3, #2
++T add r2, r0, r3
++T lsl r3, #1
++ vaddl.u8 q1, d2, d3
++ vadd.i16 q1, q0
++ vadd.i16 d3, d2 @ d3 has 2 val pairs
++ vpadd.i32 d2, d3, d3 @ This adds U & V separately
++ vpadd.i32 d3, d3, d3
++ vrshrn.u16 d0, q1, #4
++ vrshrn.u16 d1, q1, #4
++
++ @ Store
++1:
++ vst1.8 {q0}, [r0], r3
++ subs r1, #4
++ vst1.8 {q0}, [r2], r3
++ vst1.8 {q0}, [r0], r3
++ vst1.8 {q0}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q8}, [r1]
++ mov r1, #2
++ vld1.8 {q9}, [r2]
++ vaddl.u8 q10, d16, d17
++ vaddl.u8 q11, d16, d18
++ vaddl.u8 q0, d18, d19
++ vmov.i16 q1, #3
++ vadd.i16 q10, q0
++ vmovl.u8 q0, d18
++ vadd.i16 d20, d21
++ vmov.i16 d2[0], r1 @ 2, 3, 3, 3...
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vmovl.u8 q2, d16
++ vmovl.u8 q9, d19
++ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same)
++ vmov.i64 d7, #0xffff
++ vmovl.u8 q8, d17
++ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7]
++ vmov.i64 d7, #0xff
++ vpadd.i16 d20, d20 @ 1 (all the same)
++ vrshr.u16 d21, d20, #5
++ vrshr.u16 d20, d20, #5
++ vmla.i16 q0, q10, d2[1]
++ vmla.i16 q9, q10, d2[1]
++ vmla.i16 q2, q10, q1
++ vmla.i16 q8, q10, d2[1]
++ vdup.8 q1, d20[0]
++ vrshrn.i16 d0, q0, #2
++ vrshrn.i16 d1, q9, #2
++ vrshrn.i16 d4, q2, #2
++ vrshrn.i16 d5, q8, #2
++ vext.8 q0, q0, q0, #1
++
++ @ Store top line
++ vst1.8 {q2}, [r0], r3
++
++ @ Store the rest
++ mov r1, #15
++1:
++ vbit d2, d0, d7
++ vext.8 q0, q0, q0, #1
++ subs r1, #1
++ vst1.8 {q1}, [r0], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0-q1}, [r1]
++ mov r1, #16
++ vld1.8 {q2-q3}, [r2]
++T lsl r3, #1
++ vaddl.u8 q0, d0, d1
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vaddl.u8 q1, d2, d3
++A lsl r3, #2
++T lsl r3, #1
++ vaddl.u8 q2, d4, d5
++ vaddl.u8 q3, d6, d7
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 2 val pairs
++ vpadd.i32 d4, d0, d0 @ This adds U & V separately
++ vpadd.i32 d5, d0, d0
++ vrshrn.u16 d0, q2, #5
++ vrshrn.u16 d1, q2, #5
++ vrshrn.u16 d2, q2, #5
++ vrshrn.u16 d3, q2, #5
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_8
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_32_neon_8, export=1
++
++ @ Average the els of top & left
++ vld1.8 {q0-q1}, [r1]
++ mov r1, #32
++ vld1.8 {q2-q3}, [r2]
++ add r2, r0, r3
++ vaddl.u8 q0, d0, d1
++ lsl r3, #1
++ vaddl.u8 q1, d2, d3
++ vaddl.u8 q2, d4, d5
++ vaddl.u8 q3, d6, d7
++ vadd.i16 q0, q1
++ vadd.i16 q2, q3
++ vadd.i16 q0, q2
++ vadd.i16 d0, d1 @ d0 has 4 vals
++ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
++ vpadd.i16 d4, d0, d0 @ 1 (all the same)
++ vpadd.i16 d5, d0, d0
++ vrshrn.u16 d0, q2, #6
++ vrshrn.u16 d1, q2, #6
++ vrshrn.u16 d2, q2, #6
++ vrshrn.u16 d3, q2, #6
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ -----------------------------------------------------------------------------
++@
++@ 10 Bit versions
++@
++@ There is no actual bit depth dependency in this code except that our
++@ intermediate results would overflow the 16 bits they are stored in.
++@ All these functions are good to 10 bits - with the worst case being
++@ in dc_32 where we use all 16 bits.
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {d0}, [r1]
++ mov r1, #2
++ vld1.16 {d1}, [r2]
++T lsl r3, #1
++ vmov.i16 q2, #3
++A add r2, r0, r3, lsl #1
++T add r2, r0, r3
++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0]
++A lsl r3, #2
++T lsl r3, #1
++ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
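++ @ 10-bit version of dc_4 above: the dc and edge-smoothing arithmetic
++ @ is unchanged, just kept in 16-bit lanes throughout.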
++ vmov.i64 d7, #0xffff ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vrshr.u16 q0, #2 ++ ++ @ Store top line ++ vst1.16 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d3, d1, #1*16 ++ vshr.u64 d4, d1, #2*16 ++ vshr.u64 d5, d1, #3*16 ++ vbif d3, d6, d7 ++ vbif d4, d6, d7 ++ vst1.16 {d3}, [r2], r3 ++ vbif d5, d6, d7 ++ vst1.16 {d4}, [r0] ++ vst1.16 {d5}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0}, [r1] ++ vld1.8 {q1}, [r2] ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q0, q1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshr.u16 q0, q1, #3 ++ ++ vst1.16 {q0}, [r0], r3 ++ vst1.16 {q0}, [r2], r3 ++ vst1.16 {q0}, [r0] ++ vst1.16 {q0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q0}, [r1] ++ mov r1, #2 ++ vld1.16 {q8}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
++
++ @ top line gets some smoothing
++ @ (top[i] + 3*dc + 2) >> 2
++ @ as does left
++ @ top_line[0] is extra special
++ @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
++ vpadd.i16 d6, d6 @ 1 (all the same)
++ vrshr.u16 d6, #4
++ vmla.i16 q8, q2, d6[0]
++ vmla.i16 q0, q2, d6[0]
++ vdup.16 q2, d6[0]
++ vdup.16 q9, d6[0]
++ vrshr.u16 q8, q8, #2
++ vrshr.u16 q0, q0, #2
++ vext.16 q1, q8, q8, #1
++
++ @ Store top line
++ vst1.16 {q0}, [r0], r3
++
++ @ Store the rest
++ vbit d18, d2, d7
++ vst1.16 {q9}, [r2], r3
++ mov r1, #6
++1:
++ vext.16 q8, q8, q8, #2
++ subs r1, #2
++ vext.16 q1, q1, q1, #2
++ vbit d4, d16, d7
++ vst1.16 {q2}, [r0], r3
++ vbit d18, d2, d7
++ vst1.16 {q9}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3] (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q0-q1}, [r1]
++ mov r1, #8
++ vld1.16 {q2-q3}, [r2]
++T lsl r3, #2
++ vadd.i16 q1, q0
++A add r2, r0, r3, lsl #2
++A lsl r3, #3
++T add r2, r0, r3
++T lsl r3, #1
++ vadd.i16 q2, q3
++ vadd.i16 q1, q2
++ vadd.i16 d3, d2 @ d3 has 2 val pairs
++ vpadd.i32 d2, d3, d3 @ This adds U & V separately
++ vpadd.i32 d3, d3, d3
++ vrshr.u16 q0, q1, #4
++ vrshr.u16 q1, q1, #4
++
++ @ Store
++1:
++ vst1.8 {q0-q1}, [r0], r3
++ subs r1, #2
++ vst1.8 {q0-q1}, [r2], r3
++ bne 1b
++
++ bx lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_10
++@ uint8_t *_src, [r0]
++@ const uint8_t *_top, [r1]
++@ const uint8_t *_left, [r2]
++@ ptrdiff_t stride) [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_10, export=1
++
++ @ Average the els of top & left
++ vld1.16 {q8-q9}, [r1]
++ mov r1, #2
++ vld1.16 {q10-q11}, [r2]
++ lsl r3, #1 @ stride given in pels
++ vadd.i16 q0, q8, q9
++ vadd.i16 q1, q10, q11
++ vmov.i16 q3, #3
++ vadd.i16 q1, q0
++ vadd.i16 d0, d16, d20
++ vmov.i64 d31, #0xffff
++ vadd.i16 d3, d2
++ vmov.16 d6[0], r1 @ 2, 3, 3, 3...
++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ topline[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] ++ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d3, d3 @ 1 (all the same) ++ vrshr.u16 d2, d3, #5 ++ vrshr.u16 d3, d3, #5 ++ vmov q0, q1 ++ vmla.i16 q10, q1, d6[1] ++ vmla.i16 q11, q1, d6[1] ++ vmla.i16 q8, q1, q3 ++ vmla.i16 q9, q1, d6[1] ++ vrshr.u16 q2, q10, #2 ++ vrshr.u16 q3, q11, #2 ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vext.16 q2, q2, q2, #1 ++ mov r1, #7<<29 ++ ++ @ Store top line ++ vst1.16 {q8-q9}, [r0], r3 ++ ++ @ Store the rest ++1: ++ vbit d0, d4, d31 ++ vext.16 q2, q2, q2, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 ++ bne 1b ++1: ++ vbit d0, d6, d31 ++ vext.16 q3, q3, q3, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #16 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #2 ++ vadd.i16 q10, q11 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshr.u16 q0, q2, #5 ++ vrshr.u16 q1, q2, #5 ++ ++ @ Store ++1: ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels) ++ ++function ff_hevc_rpi_pred_dc_32_neon_10, export=1 ++ ++ @ Average the els of top & left ++ @ With 10 bits we are (just) safe from overflow in i16 ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #32 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #1 ++ vadd.i16 q10, q11 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshr.u16 q0, q2, #6 ++ vrshr.u16 q1, q2, #6 ++ ++ @ Store ++1: ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +new file mode 100644 +index 0000000000..f6969d3591 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +@@ -0,0 +1,881 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. 
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ All functions have the call
++@
++@ int ff_hevc_rpi_intra_filter_N_neon_PW(
++@ pixel * const left, [r0]
++@ pixel * const top, [r1]
++@ const unsigned int req, [r2]
++@ const unsigned int avail, [r3]
++@ const pixel * const src_l, [sp, #0]
++@ const pixel * const src_u, [sp, #4]
++@ const pixel * const src_ur, [sp, #8]
++@ const unsigned int stride, [sp, #12] (pels)
++@ const unsigned int top_right_size, [sp, #16]
++@ const unsigned int down_left_size) [sp, #20]
++@
++@ Assumptions:
++@ (these wouldn't apply to all frame layouts but do apply to sand, so beware
++@ if reusing this code)
++@
++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
++@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore
++@ N==8,PW=8 (chroma always PW>8) but have to cope with larger
++@
++@ We always have at least 64 pixels of H frame width rounding - this lets us
++@ load UR without having to worry about exactly how many pixels are actually
++@ within the frame. As partial loads will only occur very occasionally this
++@ should be a win in nearly all cases.
++@
++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
++@ so we do no maths on the contents
++@
++@ No filtering in 32bit fns as they are chroma only
++
++
++.equ AVAIL_UR, 1
++.equ AVAIL_U, 2
++.equ AVAIL_UL, 4
++.equ AVAIL_L, 8
++.equ AVAIL_DL, 16
++
++.equ FILTER_LIGHT, 0x40
++.equ FILTER_STRONG, 0x80
++
++.equ AVAIL_S_UR_N_U_C, 32 - 1
++.equ AVAIL_S_U_N_UL_C, 32 - 2
++.equ AVAIL_S_UL_N_L_C, 32 - 3
++.equ AVAIL_S_L_N_DL_C, 32 - 4
++
++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr
++
++@ On entry
++@ r2 req
++@ r3 avail
++@ [sp, #sp_offset...] args
++@
++@ On Exit:
++@
++@ Extend values:
++@ d_l scalar contains value for L & DL
++@ if DL avail then this is DL[0] so we don't need to load that
++@ d_ul scalar containing value for UL
++@ d_u scalar containing value for U
++@ d_ur scalar containing value for UR
++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
++@ This means that L-light-filter works even if nreq DL (we never filter ++@ req-DL without req-L, but we do filter req-L without req-DL) ++@ If UR avail then d_ur == a_ur so U-filter good too ++@ ++@ Data load pointers (only load if req & avail): ++@ r4 DL + stride ++@ r10 L ++@ r6 U ++@ r5 UR ++@ ++@ Others: ++@ r2 req ++@ r7 req & avail ++@ r3 L + stride ++@ r8 DL + stride * 2 ++@ r9 stride * 2 ++@ cs Load U ++@ mi Load UR ++@ ++@ Clobbered: ++@ r12 ++ ++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur ++ ++.equ src_l\@, \sp_offset + 0 ++.equ src_u\@, \sp_offset + 4 ++.equ src_ur\@, \sp_offset + 8 ++.equ stride\@, \sp_offset + 12 ++.equ pw\@, (1 << \pw_s) @ pel width in bytes ++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes ++ ++@ r9 stride ++@ r7 = ab_ul, r6 = a_u, r5 = a_ur ++@ r4 = b_dl, r10 = b_l, r8 = b_u ++ ++ ldr r5, [sp, #src_ur\@] ++ lsl r12, r3, #AVAIL_S_U_DL_CPSR ++ ldr r10, [sp, #src_l\@] ++ ldr r9, [sp, #stride\@] ++ ldr r6, [sp, #src_u\@] ++ ++ @ This is quite a slow instruction but it replaces ++ @ a decent number of tests that yield a max of 2 flags/op ++ @ It is annoying we can't branch on Q! ++ @ If L navail (ne) then DL must be navail (pl) ++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur ++ ++ mov r4, r5 ++ sub r7, r10, r9 ++ it vs ++ movvs r4, r6 ++ add r8, r6, #b_size\@ - pw\@ ++ it cs ++ movcs r4, r7 ++ ite ne ++ movne r10, r4 ++ addeq r4, r7, r9, lsl #\log2_s ++ it cc ++ movcc r7, r10 ++ it mi ++ addmi r4, r10, r9, lsl #\log2_s ++ vld1.\d_type {\d_ul}, [r7] ++ itt vc ++ movvc r8, r7 ++ movvc r6, r7 ++ vld1.\d_type {\d_l }, [r4], r9 ++ tst r3, #AVAIL_UR ++ vld1.\d_type {\d_u }, [r6] ++ it eq ++ moveq r5, r8 ++ and r7, r2, r3 ++ add r8, r4, r9 ++ vld1.\d_type {\d_ur}, [r5] ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ add r3, r10, r9 ++ lsl r9, #1 ++.endm ++ ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] ++ ++ it cs ++ vldrcs s2, [r6] ++ ite pl ++ vmovpl s3, s4 ++ vldrmi s3, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10] ++ vld1.8 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d0[5]}, [r4], r9 ++ vld1.8 {d0[6]}, [r8] ++ vld1.8 {d0[7]}, [r4] ++1: ++ vstr d1, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] ++ vstr d0, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_16, export=1 ++ push {r4-r10, 
lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] ++ ++ it cs ++ vldrcs d2, [r6] ++ it mi ++ vldrmi d3, [r5] ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10] ++ vld1.16 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d1[1]}, [r4], r9 ++ vld1.16 {d1[2]}, [r8] ++ vld1.16 {d1[3]}, [r4] ++1: ++ vst1.16 {q1}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] ++ vst1.16 {q0}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++ ++function ff_hevc_rpi_intra_filter_8_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] ++ ++ it cs ++ vldrcs d4, [r6] ++ it mi ++ vldrmi d5, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10], r9 ++ vld1.8 {d0[3]}, [r3], r9 ++ vld1.8 {d0[4]}, [r10], r9 ++ vld1.8 {d0[5]}, [r3], r9 ++ vld1.8 {d0[6]}, [r10] ++ vld1.8 {d0[7]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d1[1]}, [r4], r9 ++ vld1.8 {d1[2]}, [r8], r9 ++ vld1.8 {d1[3]}, [r4], r9 ++ vld1.8 {d1[4]}, [r8], r9 ++ vld1.8 {d1[5]}, [r4], r9 ++ vld1.8 {d1[6]}, [r8] ++ vld1.8 {d1[7]}, [r4] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.8 q8, q15, q2, #15 ++ vext.8 q12, q15, q0, #15 ++ vaddl.u8 q9, d17, d5 ++ vaddl.u8 q8, d16, d4 ++ vaddl.u8 q13, d25, d1 ++ vaddl.u8 q12, d24, d0 ++ vmov.u8 r3, d5[7] @ Save final pel ++ vmov.u8 r2, d1[7] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshrn.u16 d4, q2, #2 ++ vrshrn.u16 d5, q3, #2 ++ vrshrn.u16 d0, q0, #2 ++ vrshrn.u16 d1, q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u8 d5[7], r3 @ Restore final pel ++ vmov.u8 d1[7], r2 @ Restore final pel ++ vdup.u8 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.8 {q2 }, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] @ Up-left ++ vst1.8 {q0 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #4 ++ vldm r5, {d6, 
d7} ++ bgt 1f ++ vdup.16 d7, d6[3] ++1: ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vdup.16 q1, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10] ++ vld1.16 {d1[3]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.16 {d2[1]}, [r4], r9 ++ cmp r12, #p_size ++ vld1.16 {d2[2]}, [r8], r9 ++ vld1.16 {d2[3]}, [r4], r9 ++ blt 2f ++ vld1.16 {d3[0]}, [r8], r9 ++ vld1.16 {d3[1]}, [r4], r9 ++ vld1.16 {d3[2]}, [r8] ++ vld1.16 {d3[3]}, [r4] ++ b 1f ++2: ++ vdup.16 d3, d2[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.16 q9, q2, q3, #7 ++ vext.16 q8, q15, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ vadd.u16 q9, q3 ++ vadd.u16 q8, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r3, d7[3] @ Save final pel ++ vmov.u16 r2, d3[3] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r3 @ Restore final pel ++ vmov.u16 d3[3], r2 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.16 {q2, q3}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vst1.16 {q0, q1}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_16_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.16 q9, d16[0] ++ vdup.16 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {d16-d19} ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #12 ++ @ Given chroma frame layout, if UR exists then it is always legit to ++ @ load all of it even if most of it is outside the frame. 
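++ @ The branch ladder after the load replicates the last valid pel
++ @ across any missing 4-pel chunks (top_right_size 4, 8 or 12).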
++ vldm r5, {d20-d23} ++ bgt 1f ++ bge 4f ++ cmp r12, #8 ++ bge 3f ++ vdup.16 d21, d20[3] ++3: vdup.16 d22, d21[3] ++4: vdup.16 d23, d22[3] ++ ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ ldr r12, [sp, #dl_size] ++ vdup.16 q1, d0[0] ++ vdup.16 q2, d0[0] ++ vdup.16 q3, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10], r9 ++ vld1.16 {d1[3]}, [r3], r9 ++ vld1.16 {d2[0]}, [r10], r9 ++ vld1.16 {d2[1]}, [r3], r9 ++ vld1.16 {d2[2]}, [r10], r9 ++ vld1.16 {d2[3]}, [r3], r9 ++ vld1.16 {d3[0]}, [r10], r9 ++ vld1.16 {d3[1]}, [r3], r9 ++ vld1.16 {d3[2]}, [r10] ++ vld1.16 {d3[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d4[1]}, [r4], r9 ++ cmp r12, #4 ++ vld1.16 {d4[2]}, [r8], r9 ++ vld1.16 {d4[3]}, [r4], r9 ++ ble 2f ++ vld1.16 {d5[0]}, [r8], r9 ++ vld1.16 {d5[1]}, [r4], r9 ++ cmp r12, #12 ++ vld1.16 {d5[2]}, [r8], r9 ++ vld1.16 {d5[3]}, [r4], r9 ++ blt 3f ++ vld1.16 {d6[0]}, [r8], r9 ++ vld1.16 {d6[1]}, [r4], r9 ++ vld1.16 {d6[2]}, [r8], r9 ++ vld1.16 {d6[3]}, [r4], r9 ++ ble 4f ++ vld1.16 {d7[0]}, [r8], r9 ++ vld1.16 {d7[1]}, [r4], r9 ++ vld1.16 {d7[2]}, [r8] ++ vld1.16 {d7[3]}, [r4] ++ b 1f ++2: vdup.16 d5, d4[3] ++3: vdup.16 d6, d5[3] ++4: vdup.16 d7, d6[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ vpush {q5} ++ @ Luma light filter ++ @ Left ++ vext.16 q5, q2, q3, #7 ++ vext.16 q14, q1, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ ++ vadd.u16 q5, q3 ++ vadd.u16 q14, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r2, d7[3] @ Save final pel ++ ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q14, #1 ++ vext.16 q2, q14, q5, #1 ++ vext.16 q3, q5, q5, #1 ++ ++ vmov d30, d24 @ d30[0] = l[0] + ul ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ vadd.u16 q2, q14 ++ vadd.u16 q3, q5 ++ ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ ++ @ Up ++ vext.16 q5, q10, q11, #7 ++ vext.16 q14, q9, q10, #7 ++ vext.16 q13, q8, q9, #7 ++ vext.16 q12, q15, q8, #7 ++ ++ vadd.u16 q5, q11 ++ vadd.u16 q14, q10 ++ vadd.u16 q13, q9 ++ vadd.u16 q12, q8 ++ vmov.u16 r3, d23[3] @ Save final pel ++ ++ vext.16 q8, q12, q13, #1 ++ vext.16 q9, q13, q14, #1 ++ vext.16 q10, q14, q5, #1 ++ vext.16 q11, q5, q5, #1 ++ ++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q8, q12 ++ vadd.u16 q9, q13 ++ vadd.u16 q10, q14 ++ vadd.u16 q11, q5 ++ ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vrshr.u16 q10, #2 ++ vrshr.u16 q11, #2 ++ ++ @ Misc ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r2 @ Restore final pel ++ vmov.u16 d23[3], r3 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ vpop {q5} ++ ++10: ++ vstm r1, {d16-d23} @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vstm r0, { d0-d7 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" ++ 
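++ @ load_pointers leaves the flags set: cs => load U from r6, mi =>
++ @ load UR from r5 (see the macro header above for the full mapping).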
++ it cs ++ vldmcs r6, {d4, d5} ++ it mi ++ vldmmi r5, {d6, d7} ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10] ++ vld1.32 {d1[1]}, [r3] ++1: ++ bcc 1f ++ vld1.32 {d2[1]}, [r4], r9 ++ vld1.32 {d3[0]}, [r8] ++ vld1.32 {d3[1]}, [r4] ++1: ++ vst1.32 {q2, q3 }, [r1] @ Up ++ vst1.32 {d31[1]}, [r12] ++ vst1.32 {q0, q1 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.32 q9, d16[0] ++ vdup.32 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {q8, q9 } ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #p_size ++ vldm r5, {q10, q11} ++ bge 1f ++ vdup.32 q11, d21[1] ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ vdup.32 q2, d0[0] ++ vdup.32 q3, d0[0] ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10], r9 ++ vld1.32 {d1[1]}, [r3], r9 ++ vld1.32 {d2[0]}, [r10], r9 ++ vld1.32 {d2[1]}, [r3], r9 ++ vld1.32 {d3[0]}, [r10] ++ vld1.32 {d3[1]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.32 {d4[1]}, [r4], r9 ++ cmp r12, #p_size ++ vld1.32 {d5[0]}, [r8], r9 ++ vld1.32 {d5[1]}, [r4], r9 ++ blt 2f ++ vld1.32 {d6[0]}, [r8], r9 ++ vld1.32 {d6[1]}, [r4], r9 ++ vld1.32 {d7[0]}, [r8] ++ vld1.32 {d7[1]}, [r4] ++ b 1f ++2: ++ vdup.32 q3, d5[1] ++1: ++ add r12, r0, #-pw ++ vstm r1, { q8-q11} @ Up ++ vst1.32 {d31[1]}, [r12] ++ vstm r0, { q0-q3 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_16_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] ++ ++ @ Once we get this big we have run out of neon regs to store ++ @ everything at once so do in pieces ++ ++ @ Up (have) ++ it cs ++ vldmcs r6, { q0-q3 } ++ ldr r12, [sp, #ur_size] ++ it mi ++ vldmmi r5, { q8-q11} ++ it cs ++ vstmcs r1, { q0-q3 } ++ bpl 1f ++ cmp r12, #12 ++ add lr, r1, #(pw << log2_s) ++ bgt 2f ++ cmp r12, #8 ++ bge 3f ++ vdup.16 q9, d17[1] ++4: vdup.16 d10, d19[1] ++3: vdup.16 q11, d21[1] ++2: vstm lr, { q8-q11} ++1: ++ ++ @ Left (have) ++ add lr, r0, #-pw ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ 
vst1.32 {d30[1]}, [lr] @ UL ++ bpl 1f ++ vld1.32 { d0[0]}, [r10], r9 ++ vld1.32 { d0[1]}, [r3], r9 ++ vld1.32 { d1[0]}, [r10], r9 ++ vld1.32 { d1[1]}, [r3], r9 ++ vld1.32 { d2[0]}, [r10], r9 ++ vld1.32 { d2[1]}, [r3], r9 ++ vld1.32 { d3[0]}, [r10], r9 ++ vld1.32 { d3[1]}, [r3], r9 ++ vld1.32 { d4[0]}, [r10], r9 ++ vld1.32 { d4[1]}, [r3], r9 ++ vld1.32 { d5[0]}, [r10], r9 ++ vld1.32 { d5[1]}, [r3], r9 ++ vld1.32 { d6[0]}, [r10], r9 ++ vld1.32 { d6[1]}, [r3], r9 ++ vld1.32 { d7[0]}, [r10] ++ vld1.32 { d7[1]}, [r3] ++ vstm r0, { q0-q3 } ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vdup.32 d16, d30[0] @ d16[0] = d30[0] ++ add lr, r0, #(pw << log2_s) ++ vld1.32 {d16[1]}, [r4], r9 ++ cmp r12, #4 ++ vld1.32 {d17[0]}, [r8], r9 ++ vld1.32 {d17[1]}, [r4], r9 ++ ble 2f ++ vld1.32 {d18[0]}, [r8], r9 ++ vld1.32 {d18[1]}, [r4], r9 ++ cmp r12, #12 ++ vld1.32 {d19[0]}, [r8], r9 ++ vld1.32 {d19[1]}, [r4], r9 ++ blt 3f ++ vld1.32 {d20[0]}, [r8], r9 ++ vld1.32 {d20[1]}, [r4], r9 ++ vld1.32 {d21[0]}, [r8], r9 ++ vld1.32 {d21[1]}, [r4], r9 ++ ble 4f ++ vld1.32 {d22[0]}, [r8], r9 ++ vld1.32 {d22[1]}, [r4], r9 ++ vld1.32 {d23[0]}, [r8] ++ vld1.32 {d23[1]}, [r4] ++ b 5f ++2: vdup.32 q9, d17[1] ++3: vdup.32 q10, d19[1] ++4: vdup.32 q11, d21[1] ++5: vstm lr, { q8-q11} ++1: ++ eors r7, r2 ++ beq 99f ++ ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ vdup.32 q0, d31[0] ++ vdup.32 q1, d31[0] ++ vdup.32 q2, d31[0] ++ vdup.32 q3, d31[0] ++ add lr, r1, #(pw << log2_s) ++ vdup.32 q8, d31[1] ++ vdup.32 q9, d31[1] ++ vdup.32 q10, d31[1] ++ vdup.32 q11, d31[1] ++ it cs ++ vstmcs r1, { q0-q3 } ++ it mi ++ vstmmi lr, { q8-q11} ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q0, d30[0] ++ vdup.32 q1, d30[0] ++ vdup.32 q2, d30[0] ++ vdup.32 q3, d30[0] ++ add lr, r0, #(pw << log2_s) ++ it mi ++ vstmmi r0, { q0-q3 } ++ it cs ++ vstmcs lr, { q0-q3 } ++ ++99: ++ pop {r4-r10, pc} ++endfunc ++ ++ ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S +new file mode 100644 +index 0000000000..56819ae439 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S +@@ -0,0 +1,920 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++/* ++ * Horizontal & Vertical special cases of angular intra pred ++ * ++ * Split out because: ++ * Vertical, at least, is relatively common ++ * Much simpler code than the general angular case ++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else ++ * ++ * *** Currently luma filtering is mandatory where it occurs, but there are ++ * cases where it should be turned off (rdpcm & an extension sps flag). ++ * These don't occur in the standard conformance suite for Main Profile ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.32 {d0[0]}, [r2 :32] @ Left ++ add r2, r0, r3 ++ vld1.8 {d1[]}, [r1] ++ lsl r3, #1 ++ vdup.8 d4, ip ++ vmov.i8 d2, #128 ++ vhsub.u8 d4, d0, d4 ++ veor d1, d2 ++ vld1.32 {d0[0]}, [r1 :32] @ Top ++ vqadd.s8 d1, d4 ++ vmov.i64 d3, #0xff ++ vmov d4, d0 ++ veor d5, d1, d2 ++ veor d1, d1, d2 ++ vbit d0, d1, d3 ++ vshr.u64 d5, #8 ++ vst1.32 {d0[0]}, [r0], r3 ++ vshr.u64 d1, #16 ++ vbit d4, d5, d3 ++ vshr.u64 d5, #16 ++ vst1.32 {d4[0]}, [r2], r3 ++ vbit d0, d1, d3 ++ vst1.32 {d0[0]}, [r0] ++ vbit d4, d5, d3 ++ vst1.32 {d4[0]}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {d0}, [r2 :64] @ Left ++ vmov.i8 d1, #128 ++ vld1.8 {d2[]}, [r1] ++ vld1.8 {d3}, [r1 :64] @ Top ++ vdup.8 d4, ip ++ vhsub.u8 d4, d0, d4 ++ veor d2, d1 ++ vmov.i64 d0, #0xff ++ mov r1, #8 ++ vqadd.s8 d2, d4, d2 ++ veor d1, d2, d1 ++1: ++ vbit d3, d1, d0 ++ vshr.u64 d1, #8 ++ vst1.8 {d3}, [r0 :64], r3 ++ subs r1, #2 ++ vbit d3, d1, d0 ++ vshr.u64 d1, #8 ++ vst1.8 {d3}, [r0 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {q0}, [r2 :128] @ Left ++ vdup.8 q1, ip ++ vld1.8 {d4[],d5[]}, [r1] ++ vhsub.u8 q0, q1 ++ vmov.i8 q1, #128 ++ veor q2, q1 ++ vmov.i64 d16, #0xff ++ vqadd.s8 q0, q2 ++ vld1.8 {q3}, [r1 :128] @ Top ++ mov r1, #16 ++ veor q0, q1 ++ vmov q1, q3 ++ vext.8 q2, q0, q0, #1 ++1: ++ vbit d2, d0, d16 ++ vbit d6, d4, d16 ++ vext.8 q0, q0, q0, #2 ++ subs r1, #2 ++ vst1.8 {q1}, [r0 :128], r3 ++ vext.8 q2, q2, q2, #2 ++ vst1.8 {q3}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vert_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_8, 
export=1
++    vld1.8   {q0, q1 }, [r1 :128]      @ Up
++    add      r2, r0, r3
++    lsl      r3, #1
++    mov      r1, #16
++1:
++    vst1.8   {q0, q1 }, [r0 :128], r3
++    subs     r1, #1
++    vst1.8   {q0, q1 }, [r2 :128], r3
++    bne      1b
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
++    vld1.16  {d0 }, [r1 :64]           @ Up
++    add      r2, r0, r3, lsl #1
++    lsl      r3, #2
++
++    vst1.16  {d0 }, [r0 :64], r3
++    vst1.16  {d0 }, [r2 :64], r3
++    vst1.16  {d0 }, [r0 :64]
++    vst1.16  {d0 }, [r2 :64]
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
++    vld1.16  {q0 }, [r1 :128]          @ Up
++    add      r2, r0, r3, lsl #1
++    lsl      r3, #2
++    mov      r1, #4
++1:
++    vst1.16  {q0 }, [r0 :128], r3
++    subs     r1, #2
++    vst1.16  {q0 }, [r2 :128], r3
++    vst1.16  {q0 }, [r0 :128], r3
++    vst1.16  {q0 }, [r2 :128], r3
++    bne      1b
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
++    vld1.16  {q0, q1 }, [r1 :128]      @ Up
++    add      r2, r0, r3, lsl #1
++    lsl      r3, #2
++    mov      r1, #8
++1:
++    vst1.16  {q0, q1 }, [r0 :128], r3
++    subs     r1, #1
++    vst1.16  {q0, q1 }, [r2 :128], r3
++    bne      1b
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++@ ? Might be faster as simple arm
++
++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
++    ldrb     ip, [r2, #-1]             @ Top-left
++    vld1.32  {d0[0]}, [r1 :32]         @ Top
++    add      r1, r2, #3
++    vld1.8   {d1[]}, [r2]!
++    vdup.8   d2, ip
++    vmov.i8  d3, #128
++    vhsub.u8 d0, d2
++    veor     d1, d3
++    vld1.8   {d2[]}, [r2]!
++    add      ip, r0, r3
++    vqadd.s8 d0, d0, d1
++    lsl      r3, #1
++    vld1.8   {d1[]}, [r2]
++    vld1.8   {d4[]}, [r1]
++    veor     d0, d3
++    vst1.32  {d0[0]}, [r0 :32], r3
++    vst1.32  {d2[0]}, [ip :32], r3
++    vst1.32  {d1[0]}, [r0 :32]
++    vst1.32  {d4[0]}, [ip :32]
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
++    ldrb     ip, [r2, #-1]             @ Top-left
++    vld1.8   {d0}, [r1 :64]            @ Top
++    vmov.i8  d1, #128
++    vld1.8   {d2[]}, [r2]!
++    mov      r1, #8-2
++    vdup.8   d3, ip
++    vhsub.u8 d0, d3
++    veor     d2, d1
++    vqadd.s8 d0, d2
++    vld1.8   {d2[]}, [r2]!
++    veor     d0, d1
++    vst1.8   {d0}, [r0], r3
++1:
++    vld1.8   {d0[]}, [r2]!
++    subs     r1, #2
++    vst1.8   {d2}, [r0 :64], r3
++    vld1.8   {d2[]}, [r2]!
++    vst1.8   {d0}, [r0 :64], r3
++    bne      1b
++
++    vst1.8   {d2}, [r0 :64]
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
++    ldrb     ip, [r2, #-1]             @ Top-left
++    vld1.8   {q0}, [r1 :64]            @ Top
++    mov      r1, #16-2
++    vld1.8   {d4[],d5[]}, [r2]!
++    vdup.8   q3, ip
++    vhsub.u8 q0, q3
++    vmov.i8  q1, #128
++    veor     q2, q1
++    vqadd.s8 q0, q2
++    vld1.8   {d4[],d5[]}, [r2]!
++    veor     q0, q1
++    vst1.8   {q0}, [r0], r3
++1:
++    vld1.8   {d0[],d1[]}, [r2]!
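++    @ Two rows per iteration: the replicated left pel alternates between
++    @ q0 and q2, with the next loads interleaved between the row stores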
++ subs r1, #2 ++ vst1.8 {q2}, [r0 :64], r3 ++ vld1.8 {d4[],d5[]}, [r2]! ++ vst1.8 {q0}, [r0 :64], r3 ++ bne 1b ++ ++ vst1.8 {q2}, [r0 :64] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 ++ vld1.8 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ mov r1, #32-2 ++ vld1.8 {d2[],d3[]}, [r2]! ++ vst1.8 {q0}, [r0 :128], r3 ++ vst1.8 {q0}, [ip :128], r3 ++1: ++ vld1.8 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.8 {q1}, [r0 :128], r3 ++ vst1.8 {q1}, [ip :128], r3 ++ vld1.8 {d2[],d3[]}, [r2]! ++ vst1.8 {q0}, [r0 :128], r3 ++ vst1.8 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.8 {q1}, [r0 :128] ++ vst1.8 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 ++ add r1, r2, #2 ++ vld1.16 {d0[]}, [r2] ++ add r2, #4 ++ vld1.16 {d1[]}, [r1] ++ add r1, #4 ++ vld1.16 {d2[]}, [r2] ++A add r2, r0, r3, lsl #1 ++T lsl r3, #1 ++T add r2, r0, r3 ++ vld1.16 {d3[]}, [r1] ++A lsl r3, #2 ++T lsl r3, #1 ++ vst1.16 {d0}, [r0 :64], r3 ++ vst1.16 {d1}, [r2 :64], r3 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ lsl r3, #1 ++ vld1.16 {d2[],d3[]}, [r2]! ++ mov r1, #8-2 ++ vst1.16 {q0}, [r0 :64], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :64], r3 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :64], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :64] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ lsl r3, #1 ++ add ip, r0, #16 ++ mov r1, #16-2 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], r3 ++ vst1.16 {q1}, [ip :128], r3 ++ vld1.16 {d2[],d3[]}, [r2]! 
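++    @ 8-bit chroma: each vld1.16 replicates one Cb,Cr pel pair across a
++    @ whole q register; q0 and q1 alternate between output rows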
++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :128] ++ vst1.16 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 Bit ++@ Has clipping constants so 10-bit only but could easily be macroed up to ++@ 14-bit before we run out of bits ++ ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {d0}, [r2 :64] @ Left ++ vmov.i16 d2, #0 ++ vld1.16 {d1[]}, [r1] ++T lsl r3, #1 ++ vdup.16 d4, ip ++ vmov.i16 d3, #0x3ff ++ vld1.16 {d5}, [r1 :64] @ Top ++ vhsub.u16 d4, d0, d4 ++ vmov.i64 d0, #0xffff ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.i16 d1, d1, d4 ++ vmov d6, d5 ++ vmax.s16 d1, d1, d2 ++ vmin.s16 d2, d1, d3 ++ vmin.s16 d1, d1, d3 ++ vbit d5, d1, d0 ++A lsl r3, #2 ++T lsl r3, #1 ++ vshr.u64 d2, #16 ++ vshr.u64 d1, #32 ++ vbit d6, d2, d0 ++ vst1.16 {d5}, [r0], r3 ++ vshr.u64 d2, #32 ++ vst1.16 {d6}, [r2], r3 ++ vbit d5, d1, d0 ++ vst1.16 {d5}, [r0] ++ vbit d6, d2, d0 ++ vst1.16 {d6}, [r2] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0}, [r2 :128] @ Left ++ lsl r3, #1 ++ vdup.16 q1, ip ++ vld1.16 {d4[],d5[]}, [r1] ++ vhsub.u16 q0, q0, q1 ++ vmov.i16 q1, #0 ++ vadd.i16 q0, q2 ++ vmov.i16 q2, #0x3ff ++ vld1.16 {q3}, [r1 :128] @ Top ++ mov r1, #8 ++ vmax.s16 q0, q1 ++ vmov q1, q3 ++ vmin.s16 q0, q2 ++ vmov.i64 d16, #0xffff ++ vext.16 q2, q0, q0, #1 ++1: ++ vbit d2, d0, d16 ++ vbit d6, d4, d16 ++ vext.16 q0, q0, q0, #2 ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], r3 ++ vext.16 q2, q2, q2, #2 ++ vst1.16 {q3}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0-q1}, [r2 :128] @ Left ++T lsl r3, #1 ++ vdup.16 q2, ip ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vld1.16 {d6[],d7[]}, [r1] ++A lsl r3, #2 ++T lsl r3, #1 ++ vhsub.u16 q0, q2 ++ vhsub.u16 q1, q2 ++ vadd.i16 q0, q3 ++ vadd.i16 q1, q3 ++ vmov.i16 q2, #0 ++ vld1.16 {q8-q9}, [r1 :128] @ Top ++ mov r1, #0 ++ vmov.i16 q3, #0x3ff ++ vmax.s16 q0, q2 ++ vmax.s16 q1, q2 ++ vmin.s16 q0, q3 ++ vmin.s16 q1, q3 ++ vmov q10, q8 ++ vmov q11, q9 ++ vext.16 q2, q0, q1, #1 ++ vext.16 q3, q1, q1, #1 ++ vmov.i64 d24, #0xffff ++1: ++ vbit d16, d0, d24 ++ vbit d20, d4, d24 ++ vext.16 q0, q0, q0, #2 ++ subs r1, #1<<30 ++ vst1.16 {q8-q9}, [r0 :128], r3 ++ vext.16 q2, q2, q2, #2 ++ vst1.16 {q10-q11}, [r2 :128], r3 ++ bne 1b ++1: ++ vbit d16, d2, d24 ++ vbit d20, d6, d24 ++ vext.16 q1, q1, q1, #2 ++ subs r1, #1<<30 ++ vst1.16 {q8-q9}, [r0 :128], r3 ++ vext.16 q3, q3, q3, #2 ++ vst1.16 {q10-q11}, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ lsl r3, #1 ++ mov r1, #32 ++ add r2, r0, #32 ++1: ++ 
vst1.16  {q0-q1}, [r0 :128], r3
++    subs     r1, #1
++    vst1.16  {q2-q3}, [r2 :128], r3
++    bne      1b
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
++    vld1.16  {q0 }, [r1 :128]          @ Up
++    add      r2, r0, r3, lsl #2
++    lsl      r3, #3
++
++    vst1.16  {q0 }, [r0 :128], r3
++    vst1.16  {q0 }, [r2 :128], r3
++    vst1.16  {q0 }, [r0 :128]
++    vst1.16  {q0 }, [r2 :128]
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
++    vld1.16  {q0, q1 }, [r1 :128]      @ Up
++    add      r2, r0, r3, lsl #2
++    lsl      r3, #3
++    mov      r1, #4
++1:
++    vst1.16  {q0, q1 }, [r0 :128], r3
++    subs     r1, #1
++    vst1.16  {q0, q1 }, [r2 :128], r3
++    bne      1b
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
++    vldm     r1, { q0-q3 }             @ Up
++    lsl      r3, #2
++    mov      r1, #16
++    add      r2, r0, #32
++1:
++    vst1.16  {q0-q1}, [r0 :128], r3
++    subs     r1, #1
++    vst1.16  {q2-q3}, [r2 :128], r3
++    bne      1b
++
++    bx       lr
++endfunc
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
++    ldrh     ip, [r2, #-2]             @ Top-left
++    vld1.16  {d0}, [r1 :64]            @ Top
++    vmov.i16 d1, #0
++    vld1.16  {d2[]}, [r2]!
++T   lsl      r3, #1
++    vdup.16  d3, ip
++    vmov.i16 d4, #0x3ff
++    vhsub.u16 d0, d3
++A   add      ip, r0, r3, lsl #1
++T   add      ip, r0, r3
++    vld1.16  {d3[]}, [r2]!
++A   lsl      r3, #2
++T   lsl      r3, #1
++    vadd.i16 d0, d2
++    vld1.16  {d2[]}, [r2]!
++    vmax.s16 d0, d1
++    vld1.16  {d1[]}, [r2]
++    vmin.s16 d0, d4
++    vst1.16  {d0}, [r0 :64], r3
++    vst1.16  {d3}, [ip :64], r3
++    vst1.16  {d2}, [r0 :64]
++    vst1.16  {d1}, [ip :64]
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
++    ldrh     ip, [r2, #-2]             @ Top-left
++    vld1.16  {q0}, [r1 :128]           @ Top
++    lsl      r3, #1
++    vdup.16  q1, ip
++    mov      r1, #8-2
++    vhsub.u16 q0, q1
++    vld1.16  {d2[],d3[]}, [r2]!
++    vmov.i16 q2, #0
++    vadd.i16 q0, q1
++    vmov.i16 q1, #0x3ff
++    vmax.s16 q0, q2
++    vld1.16  {d4[],d5[]}, [r2]!
++    vmin.s16 q0, q1
++    vst1.16  {q0}, [r0 :128], r3
++1:
++    vld1.16  {d0[],d1[]}, [r2]!
++    subs     r1, #2
++    vst1.16  {q2}, [r0 :128], r3
++    vld1.16  {d4[],d5[]}, [r2]!
++    vst1.16  {q0}, [r0 :128], r3
++    bne      1b
++
++    vst1.16  {q2}, [r0 :128]
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
++    ldrh     ip, [r2, #-2]             @ Top-left
++    vld1.16  {q0-q1}, [r1 :128]        @ Top
++    lsl      r3, #1
++    vdup.16  q2, ip
++    add      ip, r0, r3
++    vhsub.u16 q0, q2
++    add      ip, #16
++    vhsub.u16 q1, q2
++    mov      r1, #16-2
++    vld1.16  {d4[],d5[]}, [r2]!
++    vmov.i16 q3, #0
++    vadd.u16 q0, q2
++    vadd.i16 q1, q2
++    vmov.i16 q2, #0x3ff
++    vmax.s16 q0, q3
++    vmax.s16 q1, q3
++    vld1.16  {d6[],d7[]}, [r2]!
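++    @ Clamp the edge-filtered top row to the 10-bit range: vmax against 0
++    @ above, vmin against 0x3ff below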
++ vmin.s16 q0, q2 ++ vmin.s16 q1, q2 ++ vst1.16 {q0-q1}, [r0 :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q3}, [r0 :128], r3 ++ vst1.16 {q3}, [ip :128], r3 ++ vld1.16 {d6[],d7[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q3}, [r0 :128] ++ vst1.16 {q3}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ push {lr} ++ mov lr, #32 ++ vld1.16 {d2[],d3[]}, [r2]! ++ lsl r3, #1 ++ vst1.16 {q0}, [r0 :128], lr ++ sub r3, #32 ++ vst1.16 {q0}, [ip :128], lr ++ mov r1, #32-2 ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], lr ++ vst1.16 {q1}, [ip :128], lr ++ vst1.16 {q1}, [r0 :128], r3 ++ vst1.16 {q1}, [ip :128], r3 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], lr ++ vst1.16 {q0}, [ip :128], lr ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :128], lr ++ vst1.16 {q1}, [ip :128], lr ++ vst1.16 {q1}, [r0 :128] ++ vst1.16 {q1}, [ip :128] ++ pop {pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 ++ add r1, r2, #4 ++ vld1.32 {d0[],d1[]}, [r2] ++ add r2, #8 ++ vld1.32 {d2[],d3[]}, [r1] ++ add r1, #8 ++ vld1.32 {d4[],d5[]}, [r2] ++A add r2, r0, r3, lsl #2 ++T lsl r3, #2 ++T add r2, r0, r3 ++ vld1.32 {d6[],d7[]}, [r1] ++A lsl r3, #3 ++T lsl r3, #1 ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q1}, [r2 :128], r3 ++ vst1.32 {q2}, [r0 :128] ++ vst1.32 {q3}, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 ++ vld1.32 {d0[],d1[]}, [r2]! ++ lsl r3, #2 ++ add ip, r0, #16 ++ mov r1, #8-2 ++ vld1.32 {d2[],d3[]}, [r2]! ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.32 {q1}, [r0 :128], r3 ++ vst1.32 {q1}, [ip :128], r3 ++ vld1.32 {d2[],d3[]}, [r2]! ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.32 {q1}, [r0 :128] ++ vst1.32 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 ++ vld1.32 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ push {lr} ++ mov lr, #32 ++ vld1.32 {d2[],d3[]}, [r2]! ++ lsl r3, #2 ++ vst1.32 {q0}, [r0 :128], lr ++ sub r3, #32 ++ vst1.32 {q0}, [ip :128], lr ++ mov r1, #16-2 ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.32 {q1}, [r0 :128], lr ++ vst1.32 {q1}, [ip :128], lr ++ vst1.32 {q1}, [r0 :128], r3 ++ vst1.32 {q1}, [ip :128], r3 ++ vld1.32 {d2[],d3[]}, [r2]! 
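++    @ 10-bit chroma: each left pel is a 32-bit Cb,Cr pair replicated
++    @ across a q register; q0 and q1 alternate between output rows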
++ vst1.32 {q0}, [r0 :128], lr ++ vst1.32 {q0}, [ip :128], lr ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.32 {q1}, [r0 :128], lr ++ vst1.32 {q1}, [ip :128], lr ++ vst1.32 {q1}, [r0 :128] ++ vst1.32 {q1}, [ip :128] ++ pop {pc} ++endfunc ++ ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +new file mode 100644 +index 0000000000..af8c4c03f0 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +@@ -0,0 +1,1043 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ Planar intra pred (8.4.4.2.4)
++@
++@ predSamples[ x ][ y ] =
++@     ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
++@       ( x + 1 ) * p[ nTbS ][ -1 ] +
++@       ( nTbS - 1 - y ) * p[ x ][ -1 ] +
++@       ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
++
++@ All 10-bit functions would work with 9
++
++
++@ ff_hevc_rpi_pred_planar_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_4_neon_8, export=1
++
++    vld1.8   {d0}, [r1]        @ Top
++    adr      ip, nb_3_0_1_4
++    vld1.8   {d1}, [r2]        @ Left
++    vmov.i64 d2, #0xffffffff
++    vldr     d3, [ip, #8]      @ {1,2,3,4,1,2,3,4}
++    add      r1, r0, r3
++    vdup.32  d4, d0[0]         @ {t0,t1,t2,t3,t0,t1,t2,t3}
++    vdup.8   d0, d0[4]         @ {t4,t4,t4,t4,t4,t4,t4,t4}
++    vdup.8   d5, d1[4]         @ {l4,l4,l4,l4,l4,l4,l4,l4}
++    vdup.8   d6, d1[0]         @ {l0,l0,l0,l0,l0,l0,l0,l0}
++    vshll.u8 q8, d4, #2
++    lsl      r3, #1
++    vsubl.u8 q2, d5, d4
++    vmlal.u8 q8, d0, d3
++    vld1.8   {d0}, [ip]        @ {3,2,1,0,3,2,1,0}
++    vdup.8   d7, d1[1]         @ {l1,l1,l1,l1,l1,l1,l1,l1}
++    vshl.s16 q9, q2, #1
++    vbif     d6, d7, d2        @ {l0,l0,l0,l0,l1,l1,l1,l1}
++    vadd.i16 d16, d4
++    vdup.8   d7, d1[2]         @ {l2,l2,l2,l2,l2,l2,l2,l2}
++    vadd.i16 d17, d18
++    vdup.8   d1, d1[3]         @ {l3,l3,l3,l3,l3,l3,l3,l3}
++    vadd.i16 q2, q8, q9
++    vmlal.u8 q8, d0, d6
++    vbif     d7, d1, d2        @ {l2,l2,l2,l2,l3,l3,l3,l3}
++    vmlal.u8 q2, d0, d7
++    vrshrn.i16 d0, q8, #3
++    vst1.32  d0[0], [r0 :32], r3
++    vst1.32  d0[1], [r1 :32], r3
++    vrshrn.i16 d0, q2, #3
++    vst1.32  d0[0], [r0 :32]
++    vst1.32  d0[1], [r1 :32]
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_4_neon_10, export=1
++    @ Load from bytes & expand later - at the very least this uses less
++    @ memory than having a short table
++    vld1.16  {q0}, [r1 :64]    @ Top
++    adr      ip, nbh_3_0_1_4
++    vldr     d2, [r2, #8]      @ Left (lower)
++    vldr     d3, [ip, #8]      @ {1,2,3,4}
++T   lsl      r3, #1
++    vshl.s16 d4, d0, #2
++    vdup.16  d1, d1[0]         @ {t4,t4,t4,t4}
++    vldr     d5, [r2]          @ Left (upper)
++    vdup.16  d2, d2[0]         @ {l4,l4,l4,l4}
++    vldr     d6, [ip]          @ {3,2,1,0}
++    vmla.i16 d4, d3, d1        @ Acc set up
++    vsub.i16 d0, d2, d0        @ Add set up
++    vmov     d7, d6
++    vdup.16  d2, d5[0]
++    vdup.16  d3, d5[1]
++    vdup.16  d16, d5[2]
++    vadd.i16 d18, d0, d4
++    vshl.s16 d0, #1            @ x2
++    vadd.i16 d19, d0, d4
++    vdup.16  d17, d5[3]
++    vadd.i16 d4, d0, d18
++A   add      r1, r0, r3, lsl #1
++T   add      r1, r0, r3
++    vadd.i16 d5, d0, d19
++A   lsl      r3, #2
++T   lsl      r3, #1
++    vmla.i16 q9, q1, q3
++    vmla.i16 q2, q8, q3
++    vrshr.u16 q0, q9, #3
++    vst1.16  {d0}, [r0], r3
++    vrshr.u16 d2, d4, #3
++    vst1.16  {d1}, [r1], r3
++    vrshr.u16 d3, d5, #3
++    vst1.16  {d2}, [r0]
++    vst1.16  {d3}, [r1]
++
++    bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_8_neon_8, export=1
++
++    vld1.8   {q0}, [r1]        @ Top
++    adr      ip, nb_7_0_1_8
++    vldr     d2, [r2, #8]      @ Left (lower)
++    mov      r1, #8
++    vldr     d3, [ip, #8]      @ {1,2,3,4,5,6,7,8}
++    vshll.u8 q2, d0, #3
++    vdup.8   d1, d1[0]         @ {t8,t8,t8,t8,t8,t8,t8,t8}
++    vdup.8   d2, d2[0]         @ {l8,l8,l8,l8,l8,l8,l8,l8}
++    vldr     d6, [r2]          @ Left (upper)
++    vmlal.u8 q2, d3, d1
++    vsubl.u8 q0, d2, d0
++    vldr     d7, [ip]          @ {7,6,5,4,3,2,1,0}
++
++@ u8  7..0    [1]  d7
++@ u8  left[y] [1]  d6
++@ u16 acc     [2]  q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add     [2]  q0 = p[-1][nTbs] - p[x][-1]
++
++    vdup.8   d2, d6[0]
++    vadd.i16 q2, q0
++    vdup.8   d3, d6[1]
++    vadd.i16 q8, q2, q0
++1:
++    vmlal.u8 q2, d7, d2
++    subs     r1, #2
++    vadd.i16 q9, q8, q0
++    vmlal.u8 q8, d7, d3
++    vdup.8   d2, d6[2]
++    vdup.8   d3, d6[3]
++    vrshrn.i16 d20, q2, #4
++    vshr.u64 d6, #16
++    vmov     q2, q9
++    vst1.8   {d20}, [r0], r3
++    vrshrn.i16 d20, q8, #4
++    vadd.i16 q8, q2, q0
++    vst1.8   {d20}, [r0], r3
++    bne      1b
++
++    bx       lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_8_neon_10, export=1
++
++    adr      ip, nb_7_0_1_8
++    vld1.16  {q0}, [r1 :128]!  @ Top (left)
++    lsl      r3, #1
++    vld1.16  {q1}, [ip :128]   @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8}
++    add      ip, r2, #16
++    vld1.16  {d4[],d5[]}, [r1] @ Top (right)
++    mov      r1, #8-2
++    vshl.s16 q3, q0, #3
++    vmovl.u8 q8, d3            @ {1,2,3,4,5,6,7,8}
++    vld1.16  {d18[],d19[]}, [ip] @ Left (lower)
++    vmla.i16 q3, q8, q2        @ Acc set up
++    vsub.i16 q0, q9, q0        @ Add set up
++    vmovl.u8 q1, d2            @ {7,6,5,4,3,2,1,0}
++    vadd.i16 q2, q3, q0
++
++@ u16 7..0    [1]  q1
++@ u32 left[y] [1]  [r2]
++@ u16 acc     [1]  q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add     [1]  q0 = p[-1][nTbs] - p[x][-1]
++
++    vld1.16  {d6[],d7[]}, [r2]!
++    vadd.i16 q8, q2, q0
++    vld1.16  {d18[],d19[]}, [r2]!
++    vmla.i16 q2, q1, q3
++    vadd.i16 q3, q8, q0
++    vmla.i16 q8, q1, q9
++1:
++    vrshr.u16 q9, q2, #4
++    subs     r1, #2
++    vmov     q2, q3
++    vrshr.u16 q10, q8, #4
++    vld1.16  {d6[],d7[]}, [r2]!
++    vst1.16  {q9}, [r0 :128], r3
++    vadd.i16 q8, q2, q0
++    vld1.16  {d18[],d19[]}, [r2]!
++    vmla.i16 q2, q1, q3
++    vadd.i16 q3, q8, q0
++    vmla.i16 q8, q1, q9
++    vst1.16  {q10}, [r0 :128], r3
++    bne      1b
++
++    vrshr.u16 q9, q2, #4
++    add      r3, r0
++    vrshr.u16 q10, q8, #4
++    vst1.16  {q9}, [r0 :128]
++    vst1.16  {q10}, [r3 :128]
++
++    bx       lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ Data - has to be in two lumps to ensure we can always reach using adr
++
++    .balign 64
++
++nb_31_0_1_32:
++    .byte   31, 30, 29, 28, 27, 26, 25, 24
++    .byte   23, 22, 21, 20, 19, 18, 17, 16
++nb_15_0_1_16:
++    .byte   15, 14, 13, 12, 11, 10,  9,  8
++    .byte    7,  6,  5,  4,  3,  2,  1,  0
++    .byte    1,  2,  3,  4,  5,  6,  7,  8
++    .byte    9, 10, 11, 12, 13, 14, 15, 16
++    .byte   17, 18, 19, 20, 21, 22, 23, 24
++    .byte   25, 26, 27, 28, 29, 30, 31, 32
++
++    @ should be back on a 64-byte boundary here
++
++    @ These could be extracted from the above array, but separated
++    @ out for better (16 byte) alignment
++nb_3_0_1_4:
++    .byte    3,  2,  1,  0,  3,  2,  1,  0
++    .byte    1,  2,  3,  4,  1,  2,  3,  4
++nb_7_0_1_8:
++    .byte    7,  6,  5,  4,  3,  2,  1,  0
++    .byte    1,  2,  3,  4,  5,  6,  7,  8
++nbh_3_0_1_4:
++    .short   3,  2,  1,  0,  1,  2,  3,  4
++
++@------------------------------------------------------------------------------
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_16_neon_8, export=1
++
++    adr      ip, nb_15_0_1_16 + 16
++    vld1.8   {q0}, [r1 :128]!
@ Top (left) ++ add r2, #16 ++ vld1.8 {q1}, [ip: 128] @ {1,2,3...16} ++ vld1.8 {d4[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vshll.u8 q3, d0, #4 ++ mov r1, #16 ++ vshll.u8 q8, d1, #4 ++ vld1.8 {d5[]}, [r2] @ Left (lower) ++ sub r2, #16 ++ vmlal.u8 q3, d2, d4 ++ vmlal.u8 q8, d3, d4 @ Acc set up ++ vsubl.u8 q1, d5, d0 ++ vsubl.u8 q0, d5, d1 @ Add set up ++ vld1.8 {q2}, [ip :128] @ {15,14,13...0} ++ ++@ u8 15..0 [1] q2 ++@ u8 left[y] [1] [r2] ++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] ++ ++ vadd.i16 q3, q1 ++ vadd.i16 q8, q0 ++1: ++ vadd.i16 q10, q3, q1 ++ subs r1, #2 ++ vld1.8 {d18[]}, [r2]! ++ vadd.i16 q11, q8, q0 ++ vld1.8 {d19[]}, [r2]! ++ vmlal.u8 q3, d4, d18 ++ vmlal.u8 q8, d5, d18 ++ vadd.i16 q12, q10, q1 ++ vmlal.u8 q10, d4, d19 ++ vadd.i16 q13, q11, q0 ++ vmlal.u8 q11, d5, d19 ++ vrshrn.u16 d18, q3, #5 ++ vrshrn.u16 d19, q8, #5 ++ vmov q3, q12 ++ vst1.8 {q9}, [r0 :128], r3 ++ vrshrn.u16 d18, q10, #5 ++ vrshrn.u16 d19, q11, #5 ++ vmov q8, q13 ++ vst1.8 {q9}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr ip, nb_15_0_1_16 + 16 ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ add r2, #32 ++ vld1.8 {q2}, [ip :128] @ {1,2,3...16} ++ lsl r3, #1 ++ vld1.16 {d6[],d7[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vmovl.u8 q8, d4 ++ mov r1, #16 ++ vshl.i16 q9, q0, #4 ++ vmovl.u8 q2, d5 ++ vshl.i16 q10, q1, #4 ++ vld1.16 {d22[],d23[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vld1.8 {q12}, [ip] @ {15,14,13...0} ++ vmla.i16 q9, q8, q3 ++ vmla.i16 q10, q2, q3 @ Acc set up ++ vsub.i16 q0, q11, q0 ++ vsub.i16 q1, q11, q1 @ Add set up ++ vadd.i16 q2, q9, q0 ++ vadd.i16 q3, q10, q1 ++ vmovl.u8 q8, d24 ++ vmovl.u8 q9, d25 ++ ++@ u16 15..0 [2] q8,q9 ++@ u32 left[y] [2] [r2] ++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] ++ ++1: ++ vadd.i16 q10, q2, q0 ++ subs r1, #2 ++ vld1.16 {d24[],d25[]}, [r2]! ++ vadd.i16 q11, q3, q1 ++ vld1.16 {d28[],d29[]}, [r2]! ++ vmla.i16 q2, q8, q12 ++ vmla.i16 q3, q9, q12 ++ vadd.i16 q12, q10, q0 ++ vmla.i16 q10, q8, q14 ++ vadd.i16 q13, q11, q1 ++ vmla.i16 q11, q9, q14 ++ vrshr.u16 q14, q2, #5 ++ vrshr.u16 q15, q3, #5 ++ vmov q2, q12 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ vrshr.u16 q14, q10, #5 ++ vrshr.u16 q15, q11, #5 ++ vmov q3, q13 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_8, export=1 ++ ++ vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) ++ adr ip, nb_31_0_1_32 + 32 ++ vpush {d8-d12} ++ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32} ++ add r2, #32 ++ vld1.8 {d8[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vshll.u8 q8, d0, #5 ++ mov r1, #32 ++ vld1.8 {d9[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vshll.u8 q9, d1, #5 ++ vshll.u8 q10, d2, #5 ++ vshll.u8 q11, d3, #5 ++ vmlal.u8 q8, d4, d8 ++ vsubl.u8 q12, d9, d0 ++ vmlal.u8 q9, d5, d8 ++ vsubl.u8 q13, d9, d1 ++ vmlal.u8 q10, d6, d8 ++ vsubl.u8 q14, d9, d2 ++ vmlal.u8 q11, d7, d8 @ Acc set up ++ vsubl.u8 q15, d9, d3 @ Add set up ++ vadd.i16 q8, q12 ++ vadd.i16 q9, q13 ++ vadd.i16 q10, q14 ++ vadd.i16 q11, q15 ++ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0} ++ ++@ u8 31..0 [2] q4,q5 ++@ u8 left[y] [2] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.8 {d12[]}, [r2]! ++ vadd.i16 q0, q8, q12 ++ b 2f ++1: ++ vld1.8 {d12[]}, [r2]! ++ vrshrn.u16 d3, q1, #6 ++ vrshrn.u16 d2, q0, #6 ++ vadd.i16 q0, q8, q12 ++ vrshrn.u16 d4, q2, #6 ++ vrshrn.u16 d5, q3, #6 ++ vst1.8 {q1-q2}, [r0 :128], r3 ++2: vadd.i16 q1, q9, q13 ++ subs r1, #2 ++ vadd.i16 q2, q10, q14 ++ vadd.i16 q3, q11, q15 ++ vmlal.u8 q8, d8, d12 ++ vmlal.u8 q9, d9, d12 ++ vmlal.u8 q10, d10, d12 ++ vmlal.u8 q11, d11, d12 ++ vld1.8 {d12[]}, [r2]! ++ vrshrn.u16 d19, q9, #6 ++ vrshrn.u16 d18, q8, #6 ++ vadd.i16 q8, q0, q12 ++ vrshrn.u16 d20, q10, #6 ++ vrshrn.u16 d21, q11, #6 ++ vst1.8 {q9-q10}, [r0 :128], r3 ++ vadd.i16 q9, q1, q13 ++ vadd.i16 q10, q2, q14 ++ vadd.i16 q11, q3, q15 ++ vmlal.u8 q0, d8, d12 ++ vmlal.u8 q1, d9, d12 ++ vmlal.u8 q2, d10, d12 ++ vmlal.u8 q3, d11, d12 ++ ++ bne 1b ++ ++ vpop {d8-d12} ++ ++ vrshrn.u16 d3, q1, #6 ++ vrshrn.u16 d2, q0, #6 ++ vrshrn.u16 d4, q2, #6 ++ vrshrn.u16 d5, q3, #6 ++ vst1.8 {q1-q2}, [r0 :128] ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ adr ip, nb_31_0_1_32 + 32 ++ vpush {q4-q7} ++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) ++ add r2, #64 ++ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32} ++T lsl r3, #1 ++ vld1.16 {d8[],d9[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vmovl.u8 q12, d28 ++ mov r1, #32 ++ vmovl.u8 q13, d29 ++ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0} ++ vmovl.u8 q14, d30 ++ vmovl.u8 q15, d31 ++ vld1.16 {d10[],d11[]}, [r2] @ Left (lower) ++ sub r2, #64 ++ vshl.i16 q8, q0, #5 ++ vshl.i16 q9, q1, #5 ++ vshl.i16 q10, q2, #5 ++ vshl.i16 q11, q3, #5 ++ vmla.i16 q8, q12, q4 ++ vsub.i16 q0, q5, q0 ++ vmla.i16 q9, q13, q4 ++ vsub.i16 q1, q5, q1 ++ vmla.i16 q10, q14, q4 ++ vmov.u16 ip, d0[0] ++ vsub.i16 q2, q5, q2 ++ vmla.i16 q11, q15, q4 @ Acc set up ++ vsub.i16 q3, q5, q3 @ Add set up ++ vadd.i16 q8, q0 ++ vadd.i16 q9, q1 ++ vadd.i16 q10, q2 ++ vadd.i16 q11, q3 ++ vmovl.u8 q4, d12 ++ vmovl.u8 q5, d13 ++ vmovl.u8 q6, d14 ++ vmovl.u8 q7, d15 ++ ++@ u16 31..0 [4] q4-q7 ++@ u16 left[y] [4] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] ++ ++ vadd.i16 q12, q8, q0 ++A sub r0, r0, r3, lsl #1 ++T sub r0, r3 ++1: ++ vld1.16 {d0[0]}, [r2]! 
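++    @ d0[0] is reused to hold left[y]; the add value it displaces was
++    @ saved in ip during set up and is put back after the multiplies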
++A add r0, r0, r3, lsl #1 ++T add r0, r3 ++ vadd.i16 q13, q9, q1 ++ subs r1, #2 ++ vadd.i16 q14, q10, q2 ++ vadd.i16 q15, q11, q3 ++ vmla.i16 q8, q4, d0[0] ++ vmla.i16 q9, q5, d0[0] ++ vmla.i16 q10, q6, d0[0] ++ vmla.i16 q11, q7, d0[0] ++ vmov.16 d0[0], ip ++ vrshr.u16 q8, #6 ++ vrshr.u16 q9, #6 ++ vrshr.u16 q10, #6 ++ vrshr.u16 q11, #6 ++ vstm r0, {q8-q11} ++ vadd.i16 q8, q12, q0 ++A add r0, r0, r3, lsl #1 ++T add r0, r3 ++ vld1.16 {d0[0]}, [r2]! ++ vadd.i16 q9, q13, q1 ++ vadd.i16 q10, q14, q2 ++ vadd.i16 q11, q15, q3 ++ vmla.i16 q12, q4, d0[0] ++ vmla.i16 q13, q5, d0[0] ++ vmla.i16 q14, q6, d0[0] ++ vmla.i16 q15, q7, d0[0] ++ vmov.16 d0[0], ip ++ vrshr.u16 q12, #6 ++ vrshr.u16 q13, #6 ++ vrshr.u16 q14, #6 ++ vrshr.u16 q15, #6 ++ vstm r0, {q12-q15} ++ vadd.i16 q12, q8, q0 ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 ++ ++ vld1.8 {q0}, [r1] @ Top ++ adr ip, nbx2_3_0_1_4 ++ vldr d2, [r2, #8] @ Left (lower) ++ mov r1, #4 ++ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4} ++ lsl r3, #1 ++ vshll.u8 q2, d0, #2 ++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4} ++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4} ++ vldr d6, [r2] @ Left (upper) ++ vmlal.u8 q2, d3, d1 ++ vsubl.u8 q0, d2, d0 ++ vldr d7, [ip] @ {3,3,2,2,1,1,0,0} ++ ++@ u8 3..0 [1] d7 ++@ u8 left[y] [1] d6 ++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vdup.16 d2, d6[0] ++ vadd.i16 q2, q0 ++ vdup.16 d3, d6[1] ++ vadd.i16 q8, q2, q0 ++1: ++ vmlal.u8 q2, d7, d2 ++ subs r1, #2 ++ vadd.i16 q9, q8, q0 ++ vmlal.u8 q8, d7, d3 ++ vdup.16 d2, d6[2] ++ vdup.16 d3, d6[3] ++ vrshrn.i16 d20, q2, #3 ++ vmov q2, q9 ++ vst1.8 {d20}, [r0], r3 ++ vrshrn.i16 d20, q8, #3 ++ vadd.i16 q8, q2, q0 ++ vst1.8 {d20}, [r0], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 ++ ++ adr ip, nbx2_3_0_1_4 ++ vld1.16 {q0}, [r1 :128]! @ Top (left) ++ lsl r3, #2 ++ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4} ++ add ip, r2, #16 ++ vld1.32 {d4[],d5[]}, [r1] @ Top (right) ++ vshl.s16 q3, q0, #2 ++ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4} ++ vld1.32 {d18[],d19[]}, [ip] @ Left (lower) ++ vmla.i16 q3, q8, q2 @ Acc set up ++ vsub.i16 q0, q9, q0 @ Add set up ++ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0} ++ vadd.i16 q2, q3, q0 ++ ++@ u16 3..0 [1] q1 ++@ u32 left[y] [1] [r2] ++@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.32 {d6[],d7[]}, [r2]! ++ vadd.i16 q8, q2, q0 ++ vld1.32 {d18[],d19[]}, [r2]! ++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++ ++ vrshr.u16 q9, q2, #3 ++ vmov q2, q3 ++ vrshr.u16 q10, q8, #3 ++ vld1.32 {d6[],d7[]}, [r2]! ++ vst1.16 {q9}, [r0 :128], r3 ++ vadd.i16 q8, q2, q0 ++ vld1.32 {d18[],d19[]}, [r2]! 
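++    @ q3 and q9 each hold one left Cb,Cr pel pair (32 bits at 10-bit
++    @ depth) replicated across the whole register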
++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++ vst1.16 {q10}, [r0 :128], r3 ++ ++ vrshr.u16 q9, q2, #3 ++ add r3, r0 ++ vrshr.u16 q10, q8, #3 ++ vst1.16 {q9}, [r0 :128] ++ vst1.16 {q10}, [r3 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 ++ ++ adr ip, nbx2_7_0_1_8 + 16 ++ vld1.8 {q0}, [r1 :128]! @ Top (left) ++ add r2, #16 ++ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8} ++ lsl r3, #1 ++ vld1.16 {d4[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vshll.u8 q3, d0, #3 ++ mov r1, #8 ++ vshll.u8 q8, d1, #3 ++ vld1.16 {d5[]}, [r2] @ Left (lower) ++ sub r2, #16 ++ vmlal.u8 q3, d2, d4 ++ vmlal.u8 q8, d3, d4 @ Acc set up ++ vsubl.u8 q1, d5, d0 ++ vsubl.u8 q0, d5, d1 @ Add set up ++ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0} ++ ++@ u8 7..0 [1] q2 ++@ u8 left[y] [1] [r2] ++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] ++ ++ vadd.i16 q3, q1 ++ vadd.i16 q8, q0 ++1: ++ vadd.i16 q10, q3, q1 ++ subs r1, #2 ++ vld1.16 {d18[]}, [r2]! ++ vadd.i16 q11, q8, q0 ++ vld1.16 {d19[]}, [r2]! ++ vmlal.u8 q3, d4, d18 ++ vmlal.u8 q8, d5, d18 ++ vadd.i16 q12, q10, q1 ++ vmlal.u8 q10, d4, d19 ++ vadd.i16 q13, q11, q0 ++ vmlal.u8 q11, d5, d19 ++ vrshrn.u16 d18, q3, #4 ++ vrshrn.u16 d19, q8, #4 ++ vmov q3, q12 ++ vst1.8 {q9}, [r0 :128], r3 ++ vrshrn.u16 d18, q10, #4 ++ vrshrn.u16 d19, q11, #4 ++ vmov q8, q13 ++ vst1.8 {q9}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ Data - has to be in two lumps to ensure we can always reach using adr ++ ++ .balign 64 ++ ++nbx2_15_0_1_16: ++ .byte 15, 15, 14, 14, 13, 13, 12, 12 ++ .byte 11, 11, 10, 10, 9, 9, 8, 8 ++nbx2_7_0_1_8: ++ .byte 7, 7, 6, 6, 5, 5, 4, 4 ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ .byte 5, 5, 6, 6, 7, 7, 8, 8 ++ .byte 9, 9, 10, 10, 11, 11, 12, 12 ++ .byte 13, 13, 14, 14, 15, 15, 16, 16 ++ ++ @ should be back on a 64-byte boundary here ++ ++nbx2_3_0_1_4: ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ ++@------------------------------------------------------------------------------ ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr ip, nbx2_7_0_1_8 + 16 ++ vld1.16 {q0-q1}, [r1 :128]! 
@ Top (left) ++ add r2, #32 ++ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8} ++ lsl r3, #2 ++ vld1.32 {d6[],d7[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vmovl.u8 q8, d4 ++ mov r1, #8 ++ vshl.i16 q9, q0, #3 ++ vmovl.u8 q2, d5 ++ vshl.i16 q10, q1, #3 ++ vld1.32 {d22[],d23[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0} ++ vmla.i16 q9, q8, q3 ++ vmla.i16 q10, q2, q3 @ Acc set up ++ vsub.i16 q0, q11, q0 ++ vsub.i16 q1, q11, q1 @ Add set up ++ vadd.i16 q2, q9, q0 ++ vadd.i16 q3, q10, q1 ++ vmovl.u8 q8, d24 ++ vmovl.u8 q9, d25 ++ ++@ u16 7..0 [2] q8,q9 ++@ u32 left[y] [2] [r2] ++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] ++ ++1: ++ vadd.i16 q10, q2, q0 ++ subs r1, #2 ++ vld1.32 {d24[],d25[]}, [r2]! ++ vadd.i16 q11, q3, q1 ++ vld1.32 {d28[],d29[]}, [r2]! ++ vmla.i16 q2, q8, q12 ++ vmla.i16 q3, q9, q12 ++ vadd.i16 q12, q10, q0 ++ vmla.i16 q10, q8, q14 ++ vadd.i16 q13, q11, q1 ++ vmla.i16 q11, q9, q14 ++ vrshr.u16 q14, q2, #4 ++ vrshr.u16 q15, q3, #4 ++ vmov q2, q12 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ vrshr.u16 q14, q10, #4 ++ vrshr.u16 q15, q11, #4 ++ vmov q3, q13 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 ++ ++ vld1.8 {q0-q1}, [r1 :128]! @ Top (left) ++ adr ip, nbx2_15_0_1_16 + 32 ++ vpush {d8-d12} ++ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16} ++ add r2, #32 ++ vld1.16 {d8[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vshll.u8 q8, d0, #4 ++ mov r1, #16 ++ vld1.16 {d9[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vshll.u8 q9, d1, #4 ++ lsl r3, #1 ++ vshll.u8 q10, d2, #4 ++ vshll.u8 q11, d3, #4 ++ vmlal.u8 q8, d4, d8 ++ vsubl.u8 q12, d9, d0 ++ vmlal.u8 q9, d5, d8 ++ vsubl.u8 q13, d9, d1 ++ vmlal.u8 q10, d6, d8 ++ vsubl.u8 q14, d9, d2 ++ vmlal.u8 q11, d7, d8 @ Acc set up ++ vsubl.u8 q15, d9, d3 @ Add set up ++ vadd.i16 q8, q12 ++ vadd.i16 q9, q13 ++ vadd.i16 q10, q14 ++ vadd.i16 q11, q15 ++ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0} ++ ++@ u8 15..0 [2] q4,q5 ++@ u8 left[y] [2] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.16 {d12[]}, [r2]! ++ vadd.i16 q0, q8, q12 ++ b 2f ++1: ++ vld1.16 {d12[]}, [r2]! ++ vrshrn.u16 d3, q1, #5 ++ vrshrn.u16 d2, q0, #5 ++ vadd.i16 q0, q8, q12 ++ vrshrn.u16 d4, q2, #5 ++ vrshrn.u16 d5, q3, #5 ++ vst1.8 {q1-q2}, [r0 :128], r3 ++2: vadd.i16 q1, q9, q13 ++ subs r1, #2 ++ vadd.i16 q2, q10, q14 ++ vadd.i16 q3, q11, q15 ++ vmlal.u8 q8, d8, d12 ++ vmlal.u8 q9, d9, d12 ++ vmlal.u8 q10, d10, d12 ++ vmlal.u8 q11, d11, d12 ++ vld1.16 {d12[]}, [r2]! 
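++    @ Round and narrow: (acc + 16) >> 5, where 5 = log2(nTbS = 16) + 1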
++ vrshrn.u16 d19, q9, #5 ++ vrshrn.u16 d18, q8, #5 ++ vadd.i16 q8, q0, q12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vst1.8 {q9-q10}, [r0 :128], r3 ++ vadd.i16 q9, q1, q13 ++ vadd.i16 q10, q2, q14 ++ vadd.i16 q11, q3, q15 ++ vmlal.u8 q0, d8, d12 ++ vmlal.u8 q1, d9, d12 ++ vmlal.u8 q2, d10, d12 ++ vmlal.u8 q3, d11, d12 ++ ++ bne 1b ++ ++ vpop {d8-d12} ++ ++ vrshrn.u16 d3, q1, #5 ++ vrshrn.u16 d2, q0, #5 ++ vrshrn.u16 d4, q2, #5 ++ vrshrn.u16 d5, q3, #5 ++ vst1.8 {q1-q2}, [r0 :128] ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ adr ip, nbx2_15_0_1_16 + 32 ++ vpush {q4-q7} ++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) ++ add r2, #64 ++ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16} ++T lsl r3, #2 ++ vld1.32 {d8[],d9[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vmovl.u8 q12, d28 ++ mov r1, #16 ++ vmovl.u8 q13, d29 ++ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0} ++ vmovl.u8 q14, d30 ++ vmovl.u8 q15, d31 ++ vld1.32 {d10[],d11[]}, [r2] @ Left (lower) ++ sub r2, #64 ++ vshl.i16 q8, q0, #4 ++ vshl.i16 q9, q1, #4 ++ vshl.i16 q10, q2, #4 ++ vshl.i16 q11, q3, #4 ++ vmla.i16 q8, q12, q4 ++ vsub.i16 q0, q5, q0 ++ vmla.i16 q9, q13, q4 ++ vpush {q0} ++ vsub.i16 q1, q5, q1 ++ vmla.i16 q10, q14, q4 ++ vsub.i16 q2, q5, q2 ++ vmla.i16 q11, q15, q4 @ Acc set up ++ vsub.i16 q3, q5, q3 @ Add set up ++ vadd.i16 q8, q0 ++ vadd.i16 q9, q1 ++ vadd.i16 q10, q2 ++ vadd.i16 q11, q3 ++ vmovl.u8 q4, d12 ++ vmovl.u8 q5, d13 ++ vmovl.u8 q6, d14 ++ vmovl.u8 q7, d15 ++ ++@ u16 31..0 [4] q4-q7 ++@ u16 left[y] [4] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] ++ ++ vadd.i16 q12, q8, q0 ++A sub r0, r0, r3, lsl #2 ++T sub r0, r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++A add r0, r0, r3, lsl #2 ++T add r0, r3 ++ vadd.i16 q13, q9, q1 ++ subs r1, #2 ++ vadd.i16 q14, q10, q2 ++ vadd.i16 q15, q11, q3 ++ vmla.i16 q8, q4, q0 ++ vmla.i16 q9, q5, q0 ++ vmla.i16 q10, q6, q0 ++ vmla.i16 q11, q7, q0 ++ vld1.16 {q0}, [sp] ++ vrshr.u16 q8, #5 ++ vrshr.u16 q9, #5 ++ vrshr.u16 q10, #5 ++ vrshr.u16 q11, #5 ++ vstm r0, {q8-q11} ++ vadd.i16 q8, q12, q0 ++A add r0, r0, r3, lsl #2 ++T add r0, r3 ++ vld1.32 {d0[],d1[]}, [r2]! ++ vadd.i16 q9, q13, q1 ++ vadd.i16 q10, q14, q2 ++ vadd.i16 q11, q15, q3 ++ vmla.i16 q12, q4, q0 ++ vmla.i16 q13, q5, q0 ++ vmla.i16 q14, q6, q0 ++ vmla.i16 q15, q7, q0 ++ vld1.16 {q0}, [sp] ++ vrshr.u16 q12, #5 ++ vrshr.u16 q13, #5 ++ vrshr.u16 q14, #5 ++ vrshr.u16 q15, #5 ++ vstm r0, {q12-q15} ++ vadd.i16 q12, q8, q0 ++ bne 1b ++ ++ vpop {q3-q7} ++ bx lr ++ ++endfunc +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index c91b2fd169..d6e019bbe1 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h +@@ -2236,8 +2236,7 @@ typedef struct AVCodecContext { + #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 + #endif + +- /** +- * Audio only. The amount of padding (in samples) appended by the encoder to ++ /* Audio only. The amount of padding (in samples) appended by the encoder to + * the end of the audio. I.e. this number of decoded samples must be + * discarded by the caller from the end of the stream to get the original + * audio without any trailing padding. 
+@@ -2567,6 +2566,17 @@ typedef struct AVHWAccel {
+      * that avctx->hwaccel_priv_data is invalid.
+      */
+     int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++
++    /**
++     * Called if parsing fails
++     *
++     * An error has occurred; end_frame will not be called.
++     * start_frame & decode_slice may or may not have been called.
++     * Optional
++     *
++     * @param avctx the codec context
++     */
++    void (*abort_frame)(AVCodecContext *avctx);
+ } AVHWAccel;
+ 
+ /**
+diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
+index 1bf1c620d6..ccfa991f60 100644
+--- a/libavcodec/cabac.h
++++ b/libavcodec/cabac.h
+@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
+ typedef struct CABACContext{
+     int low;
+     int range;
+-    int outstanding_count;
++    union
++    {
++        int outstanding_count;
++        struct {
++            uint16_t bits;
++            uint16_t range;
++        } by22;
++    };
+     const uint8_t *bytestream_start;
+     const uint8_t *bytestream;
+     const uint8_t *bytestream_end;
+diff --git a/libavcodec/codec.h b/libavcodec/codec.h
+index 1fda619ee7..b4650f9ec9 100644
+--- a/libavcodec/codec.h
++++ b/libavcodec/codec.h
+@@ -349,6 +349,17 @@ const AVCodec *av_codec_iterate(void **opaque);
+  */
+ AVCodec *avcodec_find_decoder(enum AVCodecID id);
+ 
++/**
++ * Find a registered decoder with a matching codec ID and pix_fmt.
++ * A decoder with pix_fmts set to NULL will match any fmt.
++ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmts set to NULL.
++ *
++ * @param id AVCodecID of the requested decoder
++ * @param fmt AVPixelFormat that must be supported by the decoder
++ * @return A decoder if one was found, NULL otherwise.
++ */
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
++
+ /**
+  * Find a registered decoder with the specified name.
+  *
+diff --git a/libavcodec/h264-ctrls.h b/libavcodec/h264-ctrls.h
+new file mode 100644
+index 0000000000..080fd1293c
+--- /dev/null
++++ b/libavcodec/h264-ctrls.h
+@@ -0,0 +1,218 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the H.264 state controls for use with stateless H.264
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _H264_CTRLS_H_
++#define _H264_CTRLS_H_
++
++#include <linux/types.h>
++
++/*
++ * Maximum DPB size, as specified by section 'A.3.1 Level limits
++ * common to the Baseline, Main, and Extended profiles'.
++ */
++#define V4L2_H264_NUM_DPB_ENTRIES 16
++
++/* Our pixel format isn't stable at the moment */
++#define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */
++
++/*
++ * This is put insanely high to avoid conflicting with controls that
++ * would be added during the phase where those controls are not
++ * stable. It should be fixed eventually.
++ */ ++#define V4L2_CID_MPEG_VIDEO_H264_SPS (V4L2_CID_MPEG_BASE+1000) ++#define V4L2_CID_MPEG_VIDEO_H264_PPS (V4L2_CID_MPEG_BASE+1001) ++#define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX (V4L2_CID_MPEG_BASE+1002) ++#define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS (V4L2_CID_MPEG_BASE+1003) ++#define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS (V4L2_CID_MPEG_BASE+1004) ++#define V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE (V4L2_CID_MPEG_BASE+1005) ++#define V4L2_CID_MPEG_VIDEO_H264_START_CODE (V4L2_CID_MPEG_BASE+1006) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_H264_SPS 0x0110 ++#define V4L2_CTRL_TYPE_H264_PPS 0x0111 ++#define V4L2_CTRL_TYPE_H264_SCALING_MATRIX 0x0112 ++#define V4L2_CTRL_TYPE_H264_SLICE_PARAMS 0x0113 ++#define V4L2_CTRL_TYPE_H264_DECODE_PARAMS 0x0114 ++ ++enum v4l2_mpeg_video_h264_decode_mode { ++ V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_h264_start_code { ++ V4L2_MPEG_VIDEO_H264_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG 0x01 ++#define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG 0x02 ++#define V4L2_H264_SPS_CONSTRAINT_SET2_FLAG 0x04 ++#define V4L2_H264_SPS_CONSTRAINT_SET3_FLAG 0x08 ++#define V4L2_H264_SPS_CONSTRAINT_SET4_FLAG 0x10 ++#define V4L2_H264_SPS_CONSTRAINT_SET5_FLAG 0x20 ++ ++#define V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE 0x01 ++#define V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS 0x02 ++#define V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO 0x04 ++#define V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED 0x08 ++#define V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY 0x10 ++#define V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD 0x20 ++#define V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE 0x40 ++ ++struct v4l2_ctrl_h264_sps { ++ __u8 profile_idc; ++ __u8 constraint_set_flags; ++ __u8 level_idc; ++ __u8 seq_parameter_set_id; ++ __u8 chroma_format_idc; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_frame_num_minus4; ++ __u8 pic_order_cnt_type; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 max_num_ref_frames; ++ __u8 num_ref_frames_in_pic_order_cnt_cycle; ++ __s32 offset_for_ref_frame[255]; ++ __s32 offset_for_non_ref_pic; ++ __s32 offset_for_top_to_bottom_field; ++ __u16 pic_width_in_mbs_minus1; ++ __u16 pic_height_in_map_units_minus1; ++ __u32 flags; ++}; ++ ++#define V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE 0x0001 ++#define V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT 0x0002 ++#define V4L2_H264_PPS_FLAG_WEIGHTED_PRED 0x0004 ++#define V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT 0x0008 ++#define V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED 0x0010 ++#define V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT 0x0020 ++#define V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE 0x0040 ++#define V4L2_H264_PPS_FLAG_PIC_SCALING_MATRIX_PRESENT 0x0080 ++ ++struct v4l2_ctrl_h264_pps { ++ __u8 pic_parameter_set_id; ++ __u8 seq_parameter_set_id; ++ __u8 num_slice_groups_minus1; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __u8 weighted_bipred_idc; ++ __s8 pic_init_qp_minus26; ++ __s8 pic_init_qs_minus26; ++ __s8 chroma_qp_index_offset; ++ __s8 second_chroma_qp_index_offset; ++ __u16 flags; ++}; ++ ++struct v4l2_ctrl_h264_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++}; ++ ++struct v4l2_h264_weight_factors { ++ __s16 luma_weight[32]; ++ __s16 luma_offset[32]; ++ __s16 chroma_weight[32][2]; ++ __s16 chroma_offset[32][2]; 
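++	/* For the chroma arrays the second index selects the plane:
++	 * 0 = Cb, 1 = Cr */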
++}; ++ ++struct v4l2_h264_pred_weight_table { ++ __u16 luma_log2_weight_denom; ++ __u16 chroma_log2_weight_denom; ++ struct v4l2_h264_weight_factors weight_factors[2]; ++}; ++ ++#define V4L2_H264_SLICE_TYPE_P 0 ++#define V4L2_H264_SLICE_TYPE_B 1 ++#define V4L2_H264_SLICE_TYPE_I 2 ++#define V4L2_H264_SLICE_TYPE_SP 3 ++#define V4L2_H264_SLICE_TYPE_SI 4 ++ ++#define V4L2_H264_SLICE_FLAG_FIELD_PIC 0x01 ++#define V4L2_H264_SLICE_FLAG_BOTTOM_FIELD 0x02 ++#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x04 ++#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x08 ++ ++struct v4l2_ctrl_h264_slice_params { ++ /* Size in bytes, including header */ ++ __u32 size; ++ ++ /* Offset in bytes to the start of slice in the OUTPUT buffer. */ ++ __u32 start_byte_offset; ++ ++ /* Offset in bits to slice_data() from the beginning of this slice. */ ++ __u32 header_bit_size; ++ ++ __u16 first_mb_in_slice; ++ __u8 slice_type; ++ __u8 pic_parameter_set_id; ++ __u8 colour_plane_id; ++ __u8 redundant_pic_cnt; ++ __u16 frame_num; ++ __u16 idr_pic_id; ++ __u16 pic_order_cnt_lsb; ++ __s32 delta_pic_order_cnt_bottom; ++ __s32 delta_pic_order_cnt0; ++ __s32 delta_pic_order_cnt1; ++ ++ struct v4l2_h264_pred_weight_table pred_weight_table; ++ /* Size in bits of dec_ref_pic_marking() syntax element. */ ++ __u32 dec_ref_pic_marking_bit_size; ++ /* Size in bits of pic order count syntax. */ ++ __u32 pic_order_cnt_bit_size; ++ ++ __u8 cabac_init_idc; ++ __s8 slice_qp_delta; ++ __s8 slice_qs_delta; ++ __u8 disable_deblocking_filter_idc; ++ __s8 slice_alpha_c0_offset_div2; ++ __s8 slice_beta_offset_div2; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u32 slice_group_change_cycle; ++ ++ /* ++ * Entries on each list are indices into ++ * v4l2_ctrl_h264_decode_params.dpb[]. 
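++	 *
++	 * For illustration only: a driver would typically resolve an entry
++	 * roughly as
++	 *   e = &decode_params->dpb[slice_params->ref_pic_list0[i]];
++	 * and then match e->reference_ts against the timestamp of a
++	 * previously queued CAPTURE buffer.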
++ */ ++ __u8 ref_pic_list0[32]; ++ __u8 ref_pic_list1[32]; ++ ++ __u32 flags; ++}; ++ ++#define V4L2_H264_DPB_ENTRY_FLAG_VALID 0x01 ++#define V4L2_H264_DPB_ENTRY_FLAG_ACTIVE 0x02 ++#define V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM 0x04 ++#define V4L2_H264_DPB_ENTRY_FLAG_FIELD 0x08 ++#define V4L2_H264_DPB_ENTRY_FLAG_BOTTOM_FIELD 0x10 ++ ++struct v4l2_h264_dpb_entry { ++ __u64 reference_ts; ++ __u16 frame_num; ++ __u16 pic_num; ++ /* Note that field is indicated by v4l2_buffer.field */ ++ __s32 top_field_order_cnt; ++ __s32 bottom_field_order_cnt; ++ __u32 flags; /* V4L2_H264_DPB_ENTRY_FLAG_* */ ++}; ++ ++#define V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC 0x01 ++ ++struct v4l2_ctrl_h264_decode_params { ++ struct v4l2_h264_dpb_entry dpb[V4L2_H264_NUM_DPB_ENTRIES]; ++ __u16 num_slices; ++ __u16 nal_ref_idc; ++ __s32 top_field_order_cnt; ++ __s32 bottom_field_order_cnt; ++ __u32 flags; /* V4L2_H264_DECODE_PARAM_FLAG_* */ ++}; ++ ++#endif +diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c +index db8363e4cc..c3896cfd90 100644 +--- a/libavcodec/h264_slice.c ++++ b/libavcodec/h264_slice.c +@@ -759,6 +759,7 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) + #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \ + (CONFIG_H264_D3D11VA_HWACCEL * 2) + \ + CONFIG_H264_NVDEC_HWACCEL + \ ++ CONFIG_H264_V4L2REQUEST_HWACCEL + \ + CONFIG_H264_VAAPI_HWACCEL + \ + CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \ + CONFIG_H264_VDPAU_HWACCEL) +@@ -843,6 +844,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) + #endif + #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; ++#endif ++#if CONFIG_H264_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + if (h->avctx->codec->pix_fmts) + choices = h->avctx->codec->pix_fmts; +@@ -1736,7 +1740,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + unsigned int slice_type, tmp, i; + int field_pic_flag, bottom_field_flag; + int first_slice = sl == h->slice_ctx && !h->current_slice; +- int picture_structure; ++ int picture_structure, pos; + + if (first_slice) + av_assert0(!h->setup_finished); +@@ -1818,8 +1822,9 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + } + + if (nal->type == H264_NAL_IDR_SLICE) +- get_ue_golomb_long(&sl->gb); /* idr_pic_id */ ++ sl->idr_pic_id = get_ue_golomb_long(&sl->gb); + ++ pos = sl->gb.index; + if (sps->poc_type == 0) { + sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb); + +@@ -1833,6 +1838,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + if (pps->pic_order_present == 1 && picture_structure == PICT_FRAME) + sl->delta_poc[1] = get_se_golomb(&sl->gb); + } ++ sl->pic_order_cnt_bit_size = sl->gb.index - pos; + + sl->redundant_pic_count = 0; + if (pps->redundant_pic_cnt_present) +@@ -1872,9 +1878,11 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + + sl->explicit_ref_marking = 0; + if (nal->ref_idc) { ++ int bit_pos = sl->gb.index; + ret = ff_h264_decode_ref_pic_marking(sl, &sl->gb, nal, h->avctx); + if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE)) + return AVERROR_INVALIDDATA; ++ sl->ref_pic_marking_size_in_bits = sl->gb.index - bit_pos; + } + + if (sl->slice_type_nos != AV_PICTURE_TYPE_I && pps->cabac) { +diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c +index 5eedeb3c27..a504c89565 100644 +--- a/libavcodec/h264dec.c ++++ b/libavcodec/h264dec.c +@@ -1102,6 +1102,9 @@ AVCodec ff_h264_decoder = { + #endif + 
#if CONFIG_H264_VIDEOTOOLBOX_HWACCEL
+                        HWACCEL_VIDEOTOOLBOX(h264),
++#endif
++#if CONFIG_H264_V4L2REQUEST_HWACCEL
++                       HWACCEL_V4L2REQUEST(h264),
+ #endif
+                        NULL
+                    },
+diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h
+index a419615124..b3dcd6e7da 100644
+--- a/libavcodec/h264dec.h
++++ b/libavcodec/h264dec.h
+@@ -190,6 +190,8 @@ typedef struct H264SliceContext {
+     int slice_type_nos;         ///< S free slice type (SI/SP are remapped to I/P)
+     int slice_type_fixed;
+ 
++    int idr_pic_id;
++
+     int qscale;
+     int chroma_qp[2];   // QPc
+     int qp_thresh;      ///< QP threshold to skip loopfilter
+@@ -328,11 +330,13 @@ typedef struct H264SliceContext {
+     MMCO mmco[MAX_MMCO_COUNT];
+     int nb_mmco;
+     int explicit_ref_marking;
++    int ref_pic_marking_size_in_bits;
+ 
+     int frame_num;
+     int poc_lsb;
+     int delta_poc_bottom;
+     int delta_poc[2];
++    int pic_order_cnt_bit_size;
+     int curr_pic_num;
+     int max_pic_num;
+ } H264SliceContext;
+diff --git a/libavcodec/hevc-ctrls.h b/libavcodec/hevc-ctrls.h
+new file mode 100644
+index 0000000000..13698d3f33
+--- /dev/null
++++ b/libavcodec/hevc-ctrls.h
+@@ -0,0 +1,230 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/types.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++	/* ISO/IEC 23008-2, ITU-T Rec.
H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ ++ __u8 padding; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 ++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 rps; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 num_active_dpb_entries; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 num_rps_poc_st_curr_before; ++ __u8 num_rps_poc_st_curr_after; ++ __u8 num_rps_poc_lt_curr; ++ ++ __u8 padding; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index 0772608a30..c30fb2a83f 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -372,14 +372,20 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ + CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ + CONFIG_HEVC_NVDEC_HWACCEL + \ ++ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ + CONFIG_HEVC_VAAPI_HWACCEL + \ + CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ ++ CONFIG_HEVC_RPI4_8_HWACCEL + \ ++ CONFIG_HEVC_RPI4_10_HWACCEL + \ + CONFIG_HEVC_VDPAU_HWACCEL) + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; + + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: ++#if CONFIG_HEVC_RPI4_8_HWACCEL ++ *fmt++ = AV_PIX_FMT_RPI4_8; ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -398,9 +404,15 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV420P10: ++#if CONFIG_HEVC_RPI4_10_HWACCEL ++ *fmt++ = AV_PIX_FMT_RPI4_10; ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -416,6 +428,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_NVDEC_HWACCEL + *fmt++ = AV_PIX_FMT_CUDA; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV444P: +@@ -3225,7 +3240,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output, + s->ref = NULL; + ret = decode_nal_units(s, avpkt->data, avpkt->size); + if (ret < 0) ++ { ++ // Ensure that hwaccel knows this frame is over ++ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) { ++ s->avctx->hwaccel->abort_frame(s->avctx); ++ } ++ + return ret; ++ } + + if (avctx->hwaccel) { + if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { +@@ -3588,6 +3610,15 @@ AVCodec ff_hevc_decoder = { + #endif + #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + HWACCEL_VIDEOTOOLBOX(hevc), ++#endif ++#if CONFIG_HEVC_RPI4_8_HWACCEL ++ HWACCEL_RPI4_8(hevc), ++#endif ++#if CONFIG_HEVC_RPI4_10_HWACCEL ++ HWACCEL_RPI4_10(hevc), ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(hevc), + #endif + NULL + }, +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index 6109c89bd6..30927fda99 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -27,6 +27,7 @@ extern const AVHWAccel ff_h264_d3d11va_hwaccel; + extern const AVHWAccel ff_h264_d3d11va2_hwaccel; + extern const AVHWAccel ff_h264_dxva2_hwaccel; + extern const AVHWAccel ff_h264_nvdec_hwaccel; ++extern const AVHWAccel ff_h264_v4l2request_hwaccel; + extern const AVHWAccel ff_h264_vaapi_hwaccel; + extern const AVHWAccel ff_h264_vdpau_hwaccel; + extern const AVHWAccel 
ff_h264_videotoolbox_hwaccel;
+@@ -34,6 +35,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel;
+ extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
+ extern const AVHWAccel ff_hevc_dxva2_hwaccel;
+ extern const AVHWAccel ff_hevc_nvdec_hwaccel;
++extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
+ extern const AVHWAccel ff_hevc_vaapi_hwaccel;
+ extern const AVHWAccel ff_hevc_vdpau_hwaccel;
+ extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
+@@ -47,6 +49,7 @@ extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel;
+ extern const AVHWAccel ff_mpeg2_d3d11va2_hwaccel;
+ extern const AVHWAccel ff_mpeg2_nvdec_hwaccel;
+ extern const AVHWAccel ff_mpeg2_dxva2_hwaccel;
++extern const AVHWAccel ff_mpeg2_v4l2request_hwaccel;
+ extern const AVHWAccel ff_mpeg2_vaapi_hwaccel;
+ extern const AVHWAccel ff_mpeg2_vdpau_hwaccel;
+ extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel;
+@@ -62,11 +65,13 @@ extern const AVHWAccel ff_vc1_nvdec_hwaccel;
+ extern const AVHWAccel ff_vc1_vaapi_hwaccel;
+ extern const AVHWAccel ff_vc1_vdpau_hwaccel;
+ extern const AVHWAccel ff_vp8_nvdec_hwaccel;
++extern const AVHWAccel ff_vp8_v4l2request_hwaccel;
+ extern const AVHWAccel ff_vp8_vaapi_hwaccel;
+ extern const AVHWAccel ff_vp9_d3d11va_hwaccel;
+ extern const AVHWAccel ff_vp9_d3d11va2_hwaccel;
+ extern const AVHWAccel ff_vp9_dxva2_hwaccel;
+ extern const AVHWAccel ff_vp9_nvdec_hwaccel;
++extern const AVHWAccel ff_vp9_v4l2request_hwaccel;
+ extern const AVHWAccel ff_vp9_vaapi_hwaccel;
+ extern const AVHWAccel ff_vp9_vdpau_hwaccel;
+ extern const AVHWAccel ff_wmv3_d3d11va_hwaccel;
+@@ -75,5 +80,7 @@ extern const AVHWAccel ff_wmv3_dxva2_hwaccel;
+ extern const AVHWAccel ff_wmv3_nvdec_hwaccel;
+ extern const AVHWAccel ff_wmv3_vaapi_hwaccel;
+ extern const AVHWAccel ff_wmv3_vdpau_hwaccel;
++extern const AVHWAccel ff_hevc_rpi4_8_hwaccel;
++extern const AVHWAccel ff_hevc_rpi4_10_hwaccel;
+ 
+ #endif /* AVCODEC_HWACCELS_H */
+diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h
+index f421dc909f..66b001e333 100644
+--- a/libavcodec/hwconfig.h
++++ b/libavcodec/hwconfig.h
+@@ -24,6 +24,7 @@
+ 
+ 
+ #define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
++#define HWACCEL_CAP_MT_SAFE (1 << 1)
+ 
+ 
+ typedef struct AVCodecHWConfigInternal {
+@@ -80,6 +81,12 @@ typedef struct AVCodecHWConfigInternal {
+     HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel)
+ #define HWACCEL_XVMC(codec) \
+     HW_CONFIG_HWACCEL(0, 0, 1, XVMC, NONE, ff_ ## codec ## _xvmc_hwaccel)
++#define HWACCEL_RPI4_8(codec) \
++    HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel)
++#define HWACCEL_RPI4_10(codec) \
++    HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel)
++#define HWACCEL_V4L2REQUEST(codec) \
++    HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel)
+ 
+ #define HW_CONFIG_ENCODER(device, frames, ad_hoc, format, device_type_) \
+     &(const AVCodecHWConfigInternal) { \
+diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
+index 547bece576..bfd1083c16 100644
+--- a/libavcodec/mmaldec.c
++++ b/libavcodec/mmaldec.c
+@@ -24,6 +24,9 @@
+  * MMAL Video Decoder
+  */
+ 
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
+ #include <bcm_host.h>
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/mmal_parameters_video.h>
+@@ -31,6 +34,7 @@
+ #include <interface/mmal/util/mmal_util_params.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+ #include <interface/mmal/vc/mmal_vc_api.h>
++#pragma GCC diagnostic pop
+ #include <stdatomic.h>
+ 
+ #include "avcodec.h"
+diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c
+index 99e56532a5..15aaf97a34 100644
+--- a/libavcodec/mpeg12dec.c
++++
b/libavcodec/mpeg12dec.c +@@ -1154,6 +1154,9 @@ static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = { + #endif + #if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL + AV_PIX_FMT_VIDEOTOOLBOX, ++#endif ++#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL ++ AV_PIX_FMT_DRM_PRIME, + #endif + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE +@@ -2952,6 +2955,9 @@ AVCodec ff_mpeg2video_decoder = { + #endif + #if CONFIG_MPEG2_XVMC_HWACCEL + HWACCEL_XVMC(mpeg2), ++#endif ++#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(mpeg2), + #endif + NULL + }, +diff --git a/libavcodec/mpeg2-ctrls.h b/libavcodec/mpeg2-ctrls.h +new file mode 100644 +index 0000000000..6601455b3d +--- /dev/null ++++ b/libavcodec/mpeg2-ctrls.h +@@ -0,0 +1,82 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the MPEG2 state controls for use with stateless MPEG-2 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _MPEG2_CTRLS_H_ ++#define _MPEG2_CTRLS_H_ ++ ++#define V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (V4L2_CID_MPEG_BASE+250) ++#define V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION (V4L2_CID_MPEG_BASE+251) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS 0x0103 ++#define V4L2_CTRL_TYPE_MPEG2_QUANTIZATION 0x0104 ++ ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_I 1 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_P 2 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_B 3 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_D 4 ++ ++struct v4l2_mpeg2_sequence { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */ ++ __u16 horizontal_size; ++ __u16 vertical_size; ++ __u32 vbv_buffer_size; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ ++ __u16 profile_and_level_indication; ++ __u8 progressive_sequence; ++ __u8 chroma_format; ++}; ++ ++struct v4l2_mpeg2_picture { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ ++ __u8 picture_coding_type; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ ++ __u8 f_code[2][2]; ++ __u8 intra_dc_precision; ++ __u8 picture_structure; ++ __u8 top_field_first; ++ __u8 frame_pred_frame_dct; ++ __u8 concealment_motion_vectors; ++ __u8 q_scale_type; ++ __u8 intra_vlc_format; ++ __u8 alternate_scan; ++ __u8 repeat_first_field; ++ __u16 progressive_frame; ++}; ++ ++struct v4l2_ctrl_mpeg2_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ __u64 backward_ref_ts; ++ __u64 forward_ref_ts; ++ ++ struct v4l2_mpeg2_sequence sequence; ++ struct v4l2_mpeg2_picture picture; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ ++ __u32 quantiser_scale_code; ++}; ++ ++struct v4l2_ctrl_mpeg2_quantization { ++ /* ISO/IEC 13818-2, ITU-T Rec. 
H.262: Quant matrix extension */ ++ __u8 load_intra_quantiser_matrix; ++ __u8 load_non_intra_quantiser_matrix; ++ __u8 load_chroma_intra_quantiser_matrix; ++ __u8 load_chroma_non_intra_quantiser_matrix; ++ ++ __u8 intra_quantiser_matrix[64]; ++ __u8 non_intra_quantiser_matrix[64]; ++ __u8 chroma_intra_quantiser_matrix[64]; ++ __u8 chroma_non_intra_quantiser_matrix[64]; ++}; ++ ++#endif +diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c +index 601f170447..f890f99931 100644 +--- a/libavcodec/pthread_frame.c ++++ b/libavcodec/pthread_frame.c +@@ -191,7 +191,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg) + + /* if the previous thread uses hwaccel then we take the lock to ensure + * the threads don't run concurrently */ +- if (avctx->hwaccel) { ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } +@@ -614,7 +615,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { + + if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; + +- if (avctx->hwaccel && !p->hwaccel_serializing) { ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && ++ !p->hwaccel_serializing) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index b6fb91c1c6..7b2770e780 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -289,10 +289,20 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { + { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, + { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + ++ /* RPI (Might as well define for everything) */ ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, ++ + /* special */ + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ + ++ /* RPI (Might as well define for everything) */ ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, ++ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, ++ + { AV_PIX_FMT_NONE, 0 }, + }; + +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index d181b74570..3fe2ab445f 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -24,6 +24,7 @@ + * Raw Video Encoder + */ + ++#include "config.h" + #include "avcodec.h" + #include "raw.h" + #include "internal.h" +@@ -31,6 +32,10 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" ++#if CONFIG_SAND ++#include "libavutil/rpi_sand_fns.h" ++#endif + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -49,12 +54,95 @@ FF_ENABLE_DEPRECATION_WARNINGS + return 0; + } + ++#if CONFIG_SAND ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3 / 2; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst 
+= width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); ++ return 0; ++} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++ ++static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height * 2; ++ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); ++ return 0; ++} ++#endif ++ ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame, int *got_packet) + { +- int ret = av_image_get_buffer_size(frame->format, +- frame->width, frame->height, 1); ++ int ret; ++ ++#if CONFIG_SAND ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : ++ av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : ++ av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1; ++ *got_packet = (ret == 0); ++ return ret; ++ } ++#endif + ++ ret = av_image_get_buffer_size(frame->format, ++ frame->width, frame->height, 1); + if (ret < 0) + return ret; + +diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c +new file mode 100644 +index 0000000000..58c094c5f8 +--- /dev/null ++++ b/libavcodec/rpi_hevc_cabac.c +@@ -0,0 +1,2257 @@ ++/* ++ * HEVC CABAC decoding ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#define UNCHECKED_BITSTREAM_READER 1
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++
++#include "cabac_functions.h"
++#include "rpi_hevc_data.h"
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "rpi_hevc_cabac_fns.h"
++
++#include "libavutil/rpi_sand_fns.h"
++
++// BY22 is probably faster than simple bypass if the processor has
++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
++// x86 has fast int divide
++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
++// Use native divide if we have a fast one - otherwise use mpy 1/x
++// x86 has a fast integer divide - arm doesn't - unsure about other
++// architectures
++#define USE_BY22_DIV ARCH_X86
++
++// Special case blocks with a single significant coeff
++// Decreases the complexity of the code for a common case but increases the
++// code size.
++#define USE_N_END_1 1
++
++#if !USE_BY22_DIV
++// * 1/x @ 32 bits gets us 22 bits of accuracy
++#define CABAC_BY22_PEEK_BITS 22
++#else
++// A real 32-bit divide gets us another bit
++// If we have a 64 bit int & a unit time divider then we should get a lot
++// of bits (55) but that is untested and it is unclear if it would give
++// us a large advantage
++#define CABAC_BY22_PEEK_BITS 23
++#endif
++
++#define CABAC_MAX_BIN 31
++
++
++#if USE_BY22 && !USE_BY22_DIV
++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
++
++static const uint32_t cabac_by22_inv_range[256] = {
++                  0, I(257), I(258), I(259),
++    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
++    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
++    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
++    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
++    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
++    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
++    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
++    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
++    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
++    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
++    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
++    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
++    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
++    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
++    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
++    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
++    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
++    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
++    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
++    I(450), I(451), I(452),
I(453), I(454), I(455), I(456), I(457), I(458), I(459), ++ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), ++ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), ++ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), ++ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), ++ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), ++ I(510), I(511) ++}; ++#undef I ++#endif // USE_BY22 ++ ++#if ARCH_ARM ++#include "arm/rpi_hevc_cabac.h" ++#endif ++ ++/** ++ * number of bin by SyntaxElement. ++ */ ++static const int8_t num_bins_in_se[] = { ++ 1, // sao_merge_flag ++ 1, // sao_type_idx ++ 0, // sao_eo_class ++ 0, // sao_band_position ++ 0, // sao_offset_abs ++ 0, // sao_offset_sign ++ 0, // end_of_slice_flag ++ 3, // split_coding_unit_flag ++ 1, // cu_transquant_bypass_flag ++ 3, // skip_flag ++ 3, // cu_qp_delta ++ 1, // pred_mode ++ 4, // part_mode ++ 0, // pcm_flag ++ 1, // prev_intra_luma_pred_mode ++ 0, // mpm_idx ++ 0, // rem_intra_luma_pred_mode ++ 2, // intra_chroma_pred_mode ++ 1, // merge_flag ++ 1, // merge_idx ++ 5, // inter_pred_idc ++ 2, // ref_idx_l0 ++ 2, // ref_idx_l1 ++ 2, // abs_mvd_greater0_flag ++ 2, // abs_mvd_greater1_flag ++ 0, // abs_mvd_minus2 ++ 0, // mvd_sign_flag ++ 1, // mvp_lx_flag ++ 1, // no_residual_data_flag ++ 3, // split_transform_flag ++ 2, // cbf_luma ++ 4, // cbf_cb, cbf_cr ++ 2, // transform_skip_flag[][] ++ 2, // explicit_rdpcm_flag[][] ++ 2, // explicit_rdpcm_dir_flag[][] ++ 18, // last_significant_coeff_x_prefix ++ 18, // last_significant_coeff_y_prefix ++ 0, // last_significant_coeff_x_suffix ++ 0, // last_significant_coeff_y_suffix ++ 4, // significant_coeff_group_flag ++ 44, // significant_coeff_flag ++ 24, // coeff_abs_level_greater1_flag ++ 6, // coeff_abs_level_greater2_flag ++ 0, // coeff_abs_level_remaining ++ 0, // coeff_sign_flag ++ 8, // log2_res_scale_abs ++ 2, // res_scale_sign_flag ++ 1, // cu_chroma_qp_offset_flag ++ 1, // cu_chroma_qp_offset_idx ++}; ++ ++/** ++ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement. 
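++ * Each entry is the running sum of num_bins_in_se[] over the preceding
++ * elements; e.g. elem_offset[SAO_TYPE_IDX] == 1 because sao_merge_flag
++ * uses a single context, and elements with no contexts of their own
++ * (such as sao_eo_class) simply repeat the previous offset.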
++ */ ++static const int elem_offset[sizeof(num_bins_in_se)] = { ++ 0, // sao_merge_flag ++ 1, // sao_type_idx ++ 2, // sao_eo_class ++ 2, // sao_band_position ++ 2, // sao_offset_abs ++ 2, // sao_offset_sign ++ 2, // end_of_slice_flag ++ 2, // split_coding_unit_flag ++ 5, // cu_transquant_bypass_flag ++ 6, // skip_flag ++ 9, // cu_qp_delta ++ 12, // pred_mode ++ 13, // part_mode ++ 17, // pcm_flag ++ 17, // prev_intra_luma_pred_mode ++ 18, // mpm_idx ++ 18, // rem_intra_luma_pred_mode ++ 18, // intra_chroma_pred_mode ++ 20, // merge_flag ++ 21, // merge_idx ++ 22, // inter_pred_idc ++ 27, // ref_idx_l0 ++ 29, // ref_idx_l1 ++ 31, // abs_mvd_greater0_flag ++ 33, // abs_mvd_greater1_flag ++ 35, // abs_mvd_minus2 ++ 35, // mvd_sign_flag ++ 35, // mvp_lx_flag ++ 36, // no_residual_data_flag ++ 37, // split_transform_flag ++ 40, // cbf_luma ++ 42, // cbf_cb, cbf_cr ++ 46, // transform_skip_flag[][] ++ 48, // explicit_rdpcm_flag[][] ++ 50, // explicit_rdpcm_dir_flag[][] ++ 52, // last_significant_coeff_x_prefix ++ 70, // last_significant_coeff_y_prefix ++ 88, // last_significant_coeff_x_suffix ++ 88, // last_significant_coeff_y_suffix ++ 88, // significant_coeff_group_flag ++ 92, // significant_coeff_flag ++ 136, // coeff_abs_level_greater1_flag ++ 160, // coeff_abs_level_greater2_flag ++ 166, // coeff_abs_level_remaining ++ 166, // coeff_sign_flag ++ 166, // log2_res_scale_abs ++ 174, // res_scale_sign_flag ++ 176, // cu_chroma_qp_offset_flag ++ 177, // cu_chroma_qp_offset_idx ++}; ++ ++#define CNU 154 ++/** ++ * Indexed by init_type ++ */ ++static const uint8_t init_values[3][HEVC_CONTEXTS] = { ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 200, ++ // split_coding_unit_flag ++ 139, 141, 157, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ CNU, CNU, CNU, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ CNU, ++ // part_mode ++ 184, CNU, CNU, CNU, ++ // prev_intra_luma_pred_mode ++ 184, ++ // intra_chroma_pred_mode ++ 63, 139, ++ // merge_flag ++ CNU, ++ // merge_idx ++ CNU, ++ // inter_pred_idc ++ CNU, CNU, CNU, CNU, CNU, ++ // ref_idx_l0 ++ CNU, CNU, ++ // ref_idx_l1 ++ CNU, CNU, ++ // abs_mvd_greater1_flag ++ CNU, CNU, ++ // abs_mvd_greater1_flag ++ CNU, CNU, ++ // mvp_lx_flag ++ CNU, ++ // no_residual_data_flag ++ CNU, ++ // split_transform_flag ++ 153, 138, 138, ++ // cbf_luma ++ 111, 141, ++ // cbf_cb, cbf_cr ++ 94, 138, 182, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, ++ // last_significant_coeff_y_prefix ++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, ++ // significant_coeff_group_flag ++ 91, 171, 134, 141, ++ // significant_coeff_flag ++ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, ++ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, ++ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, ++ 141, 111, ++ // coeff_abs_level_greater1_flag ++ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, ++ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, ++ // coeff_abs_level_greater2_flag ++ 138, 153, 136, 167, 152, 152, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++ { // sao_merge_flag ++ 153, ++ // 
sao_type_idx ++ 185, ++ // split_coding_unit_flag ++ 107, 139, 126, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ 197, 185, 201, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ 149, ++ // part_mode ++ 154, 139, 154, 154, ++ // prev_intra_luma_pred_mode ++ 154, ++ // intra_chroma_pred_mode ++ 152, 139, ++ // merge_flag ++ 110, ++ // merge_idx ++ 122, ++ // inter_pred_idc ++ 95, 79, 63, 31, 31, ++ // ref_idx_l0 ++ 153, 153, ++ // ref_idx_l1 ++ 153, 153, ++ // abs_mvd_greater1_flag ++ 140, 198, ++ // abs_mvd_greater1_flag ++ 140, 198, ++ // mvp_lx_flag ++ 168, ++ // no_residual_data_flag ++ 79, ++ // split_transform_flag ++ 124, 138, 94, ++ // cbf_luma ++ 153, 111, ++ // cbf_cb, cbf_cr ++ 149, 107, 167, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, ++ // last_significant_coeff_y_prefix ++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, ++ // significant_coeff_group_flag ++ 121, 140, 61, 154, ++ // significant_coeff_flag ++ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, ++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, ++ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, ++ 140, 140, ++ // coeff_abs_level_greater1_flag ++ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, ++ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, ++ // coeff_abs_level_greater2_flag ++ 107, 167, 91, 122, 107, 167, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 160, ++ // split_coding_unit_flag ++ 107, 139, 126, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ 197, 185, 201, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ 134, ++ // part_mode ++ 154, 139, 154, 154, ++ // prev_intra_luma_pred_mode ++ 183, ++ // intra_chroma_pred_mode ++ 152, 139, ++ // merge_flag ++ 154, ++ // merge_idx ++ 137, ++ // inter_pred_idc ++ 95, 79, 63, 31, 31, ++ // ref_idx_l0 ++ 153, 153, ++ // ref_idx_l1 ++ 153, 153, ++ // abs_mvd_greater1_flag ++ 169, 198, ++ // abs_mvd_greater1_flag ++ 169, 198, ++ // mvp_lx_flag ++ 168, ++ // no_residual_data_flag ++ 79, ++ // split_transform_flag ++ 224, 167, 122, ++ // cbf_luma ++ 153, 111, ++ // cbf_cb, cbf_cr ++ 149, 92, 167, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, ++ // last_significant_coeff_y_prefix ++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, ++ // significant_coeff_group_flag ++ 121, 140, 61, 154, ++ // significant_coeff_flag ++ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, ++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, ++ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, ++ 140, 140, ++ // coeff_abs_level_greater1_flag ++ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, ++ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, ++ // coeff_abs_level_greater2_flag ++ 107, 167, 91, 107, 107, 167, ++ // 
log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++}; ++ ++static const uint8_t scan_1x1[1] = { ++ 0, ++}; ++ ++static const uint8_t horiz_scan2x2_x[4] = { ++ 0, 1, 0, 1, ++}; ++ ++static const uint8_t horiz_scan2x2_y[4] = { ++ 0, 0, 1, 1 ++}; ++ ++static const uint8_t horiz_scan4x4_x[16] = { ++ 0, 1, 2, 3, ++ 0, 1, 2, 3, ++ 0, 1, 2, 3, ++ 0, 1, 2, 3, ++}; ++ ++static const uint8_t horiz_scan4x4_y[16] = { ++ 0, 0, 0, 0, ++ 1, 1, 1, 1, ++ 2, 2, 2, 2, ++ 3, 3, 3, 3, ++}; ++ ++static const uint8_t horiz_scan8x8_inv[8][8] = { ++ { 0, 1, 2, 3, 16, 17, 18, 19, }, ++ { 4, 5, 6, 7, 20, 21, 22, 23, }, ++ { 8, 9, 10, 11, 24, 25, 26, 27, }, ++ { 12, 13, 14, 15, 28, 29, 30, 31, }, ++ { 32, 33, 34, 35, 48, 49, 50, 51, }, ++ { 36, 37, 38, 39, 52, 53, 54, 55, }, ++ { 40, 41, 42, 43, 56, 57, 58, 59, }, ++ { 44, 45, 46, 47, 60, 61, 62, 63, }, ++}; ++ ++static const uint8_t diag_scan2x2_x[4] = { ++ 0, 0, 1, 1, ++}; ++ ++static const uint8_t diag_scan2x2_y[4] = { ++ 0, 1, 0, 1, ++}; ++ ++static const uint8_t diag_scan2x2_inv[2][2] = { ++ { 0, 2, }, ++ { 1, 3, }, ++}; ++ ++static const uint8_t diag_scan4x4_inv[4][4] = { ++ { 0, 2, 5, 9, }, ++ { 1, 4, 8, 12, }, ++ { 3, 7, 11, 14, }, ++ { 6, 10, 13, 15, }, ++}; ++ ++static const uint8_t diag_scan8x8_inv[8][8] = { ++ { 0, 2, 5, 9, 14, 20, 27, 35, }, ++ { 1, 4, 8, 13, 19, 26, 34, 42, }, ++ { 3, 7, 12, 18, 25, 33, 41, 48, }, ++ { 6, 11, 17, 24, 32, 40, 47, 53, }, ++ { 10, 16, 23, 31, 39, 46, 52, 57, }, ++ { 15, 22, 30, 38, 45, 51, 56, 60, }, ++ { 21, 29, 37, 44, 50, 55, 59, 62, }, ++ { 28, 36, 43, 49, 54, 58, 61, 63, }, ++}; ++ ++ ++typedef struct ++{ ++ uint16_t coeff; ++ uint16_t scale; ++} xy_off_t; ++ ++#define XYT_C(x,y,t) ((x) + ((y) << (t))) ++#define SCALE_TRAFO(t) ((t) > 3 ? 
3 : (t)) ++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) ++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) ++ ++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} ++ ++#define OFF_DIAG(t) {\ ++ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ ++ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ ++ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ ++ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ ++} ++ ++#define OFF_HORIZ(t) {\ ++ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ ++ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ ++ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ ++ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ ++} ++ ++#define OFF_VERT(t) {\ ++ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ ++ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ ++ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ ++ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ ++} ++ ++static const xy_off_t off_xys[3][4][16] = ++{ ++ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, ++ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, ++ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} ++}; ++ ++ ++// Helper fns ++#ifndef hevc_mem_bits32 ++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) ++{ ++ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); ++} ++#endif ++ ++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) ++#define hevc_clz32 hevc_clz32_builtin ++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) ++{ ++ // __builtin_clz says it works on ints - so adjust if int is >32 bits long ++ return __builtin_clz(x) - (sizeof(int) * 8 - 32); ++} ++#endif ++ ++// It is unlikely that we will ever need this but include for completeness ++#ifndef hevc_clz32 ++static inline unsigned int hevc_clz32(unsigned int x) ++{ ++ unsigned int n = 1; ++ if ((x & 0xffff0000) == 0) { ++ n += 16; ++ x <<= 16; ++ } ++ if ((x & 0xff000000) == 0) { ++ n += 8; ++ x <<= 8; ++ } ++ if ((x & 0xf0000000) == 0) { ++ n += 4; ++ x <<= 4; ++ } ++ if ((x & 0xc0000000) == 0) { ++ n += 2; ++ x <<= 2; ++ } ++ return n - ((x >> 31) & 1); ++} ++#endif ++ ++static inline int cabac_overflow(const CABACContext * const cc) ++{ ++ av_assert0(cc->bytestream >= cc->bytestream_start); ++ return cc->bytestream >= cc->bytestream_end + 4; ++} ++ ++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc) ++{ ++ return cabac_overflow(&lc->cc); ++} ++ ++#if !USE_BY22 ++// If no by22 then _by22 functions will revert to normal and so _peek/_flush ++// will no longer be called but the setup calls will still exist and we want ++// to null them out ++#define bypass_start(s) ++#define bypass_finish(s) ++#else ++// Use BY22 for residual bypass block ++ ++#define bypass_start(cc) get_cabac_by22_start(cc) ++#define bypass_finish(cc) get_cabac_by22_finish(cc) ++ ++// BY22 notes that bypass is simply a divide into the bitstream and so we ++// can peek out large quantities of bits at once and treat the result as if ++// it was VLC. 
In many cases this will lead to O(1) processing rather than ++// O(n) though the setup and teardown is sufficiently expensive that it is ++// only worth using if we expect to be dealing with more than a few bits ++// The definition of "a few bits" will vary from platform to platform but ++// tests on ARM show that it probably isn't worth it for a single coded ++// residual, but is for >1 - it also seems likely that if there are ++// more residuals then they are likely to be bigger and this will make the ++// O(1) nature of the code more worthwhile. ++ ++ ++// Bypass block start ++// Must be called before _by22_peek is used as it sets the CABAC environment ++// into the correct state. _by22_finish must be called to return to 'normal' ++// (i.e. non-bypass) cabac decoding ++#ifndef get_cabac_by22_start ++static inline void get_cabac_by22_start(CABACContext * const c) ++{ ++ const unsigned int bits = __builtin_ctz(c->low); ++ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); ++ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); ++#if !USE_BY22_DIV ++ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; ++#endif ++ ++ c->bytestream -= (CABAC_BITS / 8); ++ c->by22.bits = bits; ++#if !USE_BY22_DIV ++ c->by22.range = c->range; ++ c->range = inv; ++#endif ++ c->low = x; ++} ++#endif ++ ++// Bypass block finish ++// Must be called at the end of the bypass block to return to normal operation ++static inline void get_cabac_by22_finish(CABACContext * const c) ++{ ++ unsigned int used = c->by22.bits; ++ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); ++ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7); ++ ++ c->bytestream += bytes_used + (CABAC_BITS / 8); ++ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used; ++#if !USE_BY22_DIV ++ c->range = c->by22.range; ++#endif ++} ++ ++// Peek bypass bits ++// _by22_start must be called before _by22_peek is called and _by22_flush ++// must be called afterwards to flush any used bits ++// The actual number of valid bits returned is ++// min(, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS ++// will be at least 22 which should be long enough for any prefix or suffix ++// though probably not long enough for the worst case combination ++#ifndef get_cabac_by22_peek ++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) ++{ ++#if USE_BY22_DIV ++ return ((unsigned int)c->low / (unsigned int)c->range) << 9; ++#else ++ uint32_t x = c->low & ~1U; ++ const uint32_t inv = c->range; ++ ++ if (inv != 0) ++ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); ++ ++ return x << 1; ++#endif ++} ++#endif ++ ++// Flush bypass bits peeked by _by22_peek ++// Flush n bypass bits. 
n must be >= 1 to guarantee correct operation ++// val is an unmodified copy of whatever _by22_peek returned ++#ifndef get_cabac_by22_flush ++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) ++{ ++ // Subtract the bits used & reshift up to the top of the word ++#if USE_BY22_DIV ++ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); ++#else ++ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); ++#endif ++ ++ // and refill lower bits ++ // We will probably OR over some existing bits but that doesn't matter ++ c->by22.bits += n; ++ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); ++} ++#endif ++ ++#endif // USE_BY22 ++ ++ ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc) ++{ ++ memcpy(s->cabac_save->rice, lc->stat_coeff, 4); ++ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS); ++} ++ ++static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ memcpy(lc->stat_coeff, s->cabac_save->rice, 4); ++ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS); ++} ++ ++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc) ++{ ++ GetBitContext * const gb = &lc->gb; ++ skip_bits(gb, 1); ++ align_get_bits(gb); ++ return ff_init_cabac_decoder(&lc->cc, ++ gb->buffer + get_bits_count(gb) / 8, ++ (get_bits_left(gb) + 7) / 8); ++} ++ ++static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int init_type = 2 - s->sh.slice_type; ++ int i; ++ ++ if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ++ init_type ^= 3; ++ ++ for (i = 0; i < HEVC_CONTEXTS; i++) { ++ int init_value = init_values[init_type][i]; ++ int m = (init_value >> 4) * 5 - 45; ++ int n = ((init_value & 15) << 3) - 16; ++ int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127; ++ ++ pre ^= pre >> 31; ++ if (pre > 124) ++ pre = 124 + (pre & 1); ++ lc->cabac_state[i] = pre; ++ } ++ ++ for (i = 0; i < 4; i++) ++ lc->stat_coeff[i] = 0; ++} ++ ++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags) ++{ ++ if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0) ++ { ++ lc->qPy_pred = s->sh.slice_qp; ++ cabac_init_state(s, lc); ++ } ++ else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0) ++ { ++ lc->qPy_pred = s->sh.slice_qp; ++ load_states(s, lc); ++ } ++ lc->cabac_init_req = 0; ++} ++ ++#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) ++ ++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state) ++{ ++ return get_cabac_inline(c, state); ++} ++ ++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c) ++{ ++ return get_cabac_terminate(c); ++} ++ ++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc) ++{ ++ if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX])) ++ return 0; ++ ++ if (!get_cabac_bypass(&lc->cc)) ++ return SAO_BAND; ++ return SAO_EDGE; ++} ++ ++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 0; i < 4; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int i = 0; ++ int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; ++ ++ while (i < length 
&& get_cabac_bypass(&lc->cc)) ++ i++; ++ return i; ++} ++ ++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc) ++{ ++ return get_cabac_bypass(&lc->cc); ++} ++ ++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret = get_cabac_bypass(&lc->cc) << 1; ++ ret |= get_cabac_bypass(&lc->cc); ++ return ret; ++} ++ ++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc) ++{ ++ int val = 1; ++ ++ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0) ++ return 0; ++ ++ while (val < 5 && ++ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0) ++ val++; ++ ++ if (val >= 5) { ++ unsigned int k = 0; ++ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { ++ val += 1 << k; ++ k++; ++ } ++// if (k == CABAC_MAX_BIN) ++// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); ++ ++ while (k--) ++ val += get_cabac_bypass(&lc->cc) << k; ++ } ++ return get_cabac_bypass(&lc->cc) ? -val : val; ++} ++ ++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1); ++ int i = 0; ++ ++ while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) ++ i++; ++ ++ return i; ++} ++ ++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size) ++{ ++ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1 ++ return PART_2Nx2N; ++ if (log2_cb_size == s->ps.sps->log2_min_cb_size) { ++ if (lc->cu.pred_mode == MODE_INTRA) // 0 ++ return PART_NxN; ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 ++ return PART_2NxN; ++ if (log2_cb_size == 3) // 00 ++ return PART_Nx2N; ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001 ++ return PART_Nx2N; ++ return PART_NxN; // 000 ++ } ++ ++ if (!s->ps.sps->amp_enabled_flag) { ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 ++ return PART_2NxN; ++ return PART_Nx2N; ++ } ++ ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011 ++ return PART_2NxN; ++ if (get_cabac_bypass(&lc->cc)) // 0101 ++ return PART_2NxnD; ++ return PART_2NxnU; // 0100 ++ } ++ ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001 ++ return PART_Nx2N; ++ if (get_cabac_bypass(&lc->cc)) // 0001 ++ return PART_nRx2N; ++ return PART_nLx2N; // 0000 ++} ++ ++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i = 0; ++ while (i < 2 && get_cabac_bypass(&lc->cc)) ++ i++; ++ return i; ++} ++ ++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 0; i < 4; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret; ++ if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) ++ return 4; ++ ++ ret = get_cabac_bypass(&lc->cc) << 1; ++ ret |= get_cabac_bypass(&lc->cc); ++ return ret; ++} ++ ++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int i = GET_CABAC_LC(elem_offset[MERGE_IDX]); ++ ++ if (i != 0) { ++ while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc)) ++ i++; ++ } ++ return i; ++} ++ ++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH) ++{ ++ if (nPbW + nPbH == 12) ++ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); ++ if 
(GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth)) ++ return PRED_BI; ++ ++ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); ++} ++ ++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx) ++{ ++ int i = 0; ++ int max = num_ref_idx_lx - 1; ++ int max_ctx = FFMIN(max, 2); ++ ++ while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i)) ++ i++; ++ if (i == 2) { ++ while (i < max && get_cabac_bypass(&lc->cc)) ++ i++; ++ } ++ ++ return i; ++} ++ ++static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]); ++} ++ ++static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); ++} ++ ++#if !USE_BY22 ++static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret = 2; ++ int k = 1; ++ ++ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { ++ ret += 1U << k; ++ k++; ++ } ++ if (k == CABAC_MAX_BIN) { ++ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); ++ return 0; ++ } ++ ++ while (k--) ++ ret += get_cabac_bypass(&lc->cc) << k; ++ return get_cabac_bypass_sign(&lc->cc, -ret); ++} ++#endif ++ ++static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return get_cabac_bypass_sign(&lc->cc, -1); ++} ++ ++static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); ++} ++ ++static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); ++} ++ ++static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); ++} ++ ++ ++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) { ++ int i =0; ++ ++ while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) ++ i++; ++ ++ return i; ++} ++ ++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, ++ int log2_size, int *last_scx_prefix, int *last_scy_prefix) ++{ ++ int i = 0; ++ int max = (log2_size << 1) - 1; ++ int ctx_offset, ctx_shift; ++ ++ if (!c_idx_nz) { ++ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ++ ctx_shift = (log2_size + 1) >> 2; ++ } else { ++ ctx_offset = 15; ++ ctx_shift = log2_size - 2; ++ } ++ while (i < max && ++ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) ++ i++; ++ *last_scx_prefix = i; ++ ++ i = 0; ++ while (i < max && ++ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) ++ i++; ++ *last_scy_prefix = i; ++} ++ ++static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc, ++ int last_significant_coeff_prefix) ++{ ++ int i; ++ int length = (last_significant_coeff_prefix >> 1) - 1; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 1; i < length; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg) ++{ ++ int inc; ++ ++ inc = (ctx_cg != 0) + (c_idx_nz << 1); ++ ++ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); ++} ++ ++static 
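++/* [Annotation - explanatory note, not part of the original patch code.]
++ * last_significant_coeff_suffix_decode() above reads the (prefix >> 1) - 1
++ * fixed-length bypass bits that follow a last-coeff prefix > 3; the caller
++ * then reconstructs the coordinate as
++ *
++ *     last = (1 << ((prefix >> 1) - 1)) * (2 + (prefix & 1)) + suffix
++ *
++ * e.g. prefix 7 gives 2 suffix bits and last = 4 * 3 + suffix = 12..15,
++ * while prefixes 0..3 are the coordinate directly with no suffix.
++ */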
av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset) ++{ ++ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); ++} ++ ++#if !USE_BY22 ++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) ++#endif ++ ++ ++#ifndef coeff_abs_level_remaining_decode_bypass ++static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param) ++{ ++ uint32_t y; ++ unsigned int prefix; ++ unsigned int last_coeff_abs_level_remaining; ++ unsigned int n; ++ ++ y = get_cabac_by22_peek(c); ++ prefix = hevc_clz32(~y); ++ // y << prefix will always have top bit 0 ++ ++ if (prefix < 3) { ++ const unsigned int suffix = (y << prefix) >> (31 - rice_param); ++ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; ++ n = prefix + 1 + rice_param; ++ } ++ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) ++ { ++ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); ++ ++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; ++ n = prefix * 2 + rice_param - 2; ++ } ++ else { ++ unsigned int suffix; ++ ++ get_cabac_by22_flush(c, prefix, y); ++ y = get_cabac_by22_peek(c); ++ ++ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); ++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; ++ n = prefix + rice_param - 2; ++ } ++ ++ get_cabac_by22_flush(c, n, y); ++ ++ return last_coeff_abs_level_remaining; ++} ++#endif ++ ++static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param) ++{ ++ int prefix = 0; ++ int suffix = 0; ++ int last_coeff_abs_level_remaining; ++ int i; ++ ++ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) ++ prefix++; ++ if (prefix == CABAC_MAX_BIN) { ++// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); ++ return 0; ++ } ++ ++ if (prefix < 3) { ++ for (i = 0; i < rc_rice_param; i++) ++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; ++ } else { ++ int prefix_minus3 = prefix - 3; ++ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) ++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) ++ << rc_rice_param) + suffix; ++ } ++ ++ return last_coeff_abs_level_remaining; ++} ++ ++#if !USE_BY22 ++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode ++static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb) ++{ ++ unsigned int i; ++ uint32_t ret = 0; ++ ++ for (i = 0; i < nb; i++) ++ ret = (ret << 1) | get_cabac_bypass(c); ++ ++ return ret << (32 - nb); ++} ++#endif ++ ++#ifndef coeff_sign_flag_decode_bypass ++static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb) ++{ ++ uint32_t y; ++ y = get_cabac_by22_peek(c); ++ get_cabac_by22_flush(c, nb, y); ++ return y & ~(0xffffffffU >> nb); ++} ++#endif ++ ++ ++#ifndef get_cabac_greater1_bits ++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, ++ uint8_t * const state0) ++{ ++ unsigned int i; ++ unsigned int rv = 0; ++ for (i = 0; i != n; ++i) { ++ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; ++ const unsigned int b = get_cabac(c, state0 + idx); ++ rv = (rv << 1) | b; ++ } ++ return rv; ++} ++#endif ++ ++ ++// N.B. levels returned are the values assuming coeff_abs_level_remaining ++// is uncoded, so 1 must be added if it is coded. 
sum_abs also reflects
++// this version of events.
++static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
++    int * const pprev_subset_coded, int * const psum,
++    const unsigned int idx0_gt1, const unsigned int idx_gt2)
++{
++    CABACContext * const c = &lc->cc;
++    uint8_t * const state0 = lc->cabac_state + idx0_gt1;
++    uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
++    unsigned int rv;
++    unsigned int i;
++    const unsigned int n = FFMIN(n_end, 8);
++
++    // Really this is i != n but the simple unconditional loop is cheaper
++    // and faster
++    for (i = 0; i != 8; ++i)
++        levels[i] = 1;
++
++    rv = get_cabac_greater1_bits(c, n, state0);
++
++    *pprev_subset_coded = 0;
++    *psum = n;
++
++    rv <<= (32 - n);
++    if (rv != 0)
++    {
++        *pprev_subset_coded = 1;
++        *psum = n + 1;
++        i = hevc_clz32(rv);
++        levels[i] = 2;
++        if (get_cabac(c, state_gt2) == 0)
++        {
++            // Unset first coded bit
++            rv &= ~(0x80000000U >> i);
++        }
++    }
++
++    if (n_end > 8) {
++        const unsigned int g8 = n_end - 8;
++        rv |= ((1 << g8) - 1) << (24 - g8);
++        for (i = 0; i != g8; ++i) {
++            levels[i + 8] = 0;
++        }
++    }
++
++    return rv;
++}
++
++// extended_precision_processing_flag must be false given we are
++// putting the result into a 16-bit array
++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
++// scale_m is uint8_t
++//
++// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
++// or it can be 2 (if we have transquant_bypass)
++// shift is set to one less than we really want but would normally be
++// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
++// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
++// to achieve it
++
++#ifndef trans_scale_sat
++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{
++    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
++}
++#endif
++
++
++#ifndef update_rice
++static inline void update_rice(uint8_t * const stat_coeff,
++    const unsigned int last_coeff_abs_level_remaining,
++    const unsigned int c_rice_param)
++{
++    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
++    if (x >= 6)
++        (*stat_coeff)++;
++    else if (x == 0 && *stat_coeff > 0)
++        (*stat_coeff)--;
++}
++#endif
++
++
++// n must be > 0 on entry
++#ifndef get_cabac_sig_coeff_flag_idxs
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * const ctx_map,
++    uint8_t * p)
++{
++    do {
++        if (get_cabac(c, state0 + ctx_map[n]))
++            *p++ = n;
++    } while (--n != 0);
++    return p;
++}
++#endif
++
++
++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map, // const ptr here but not in asm
++    uint8_t * const flag_idx)
++{
++    int rv;
++
++    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
++
++    return rv;
++}
++
++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++    x0, x1, x2, x3,\
++    x4, x5, x6, x7,\
++    x8, x9, x10, x11,\
++    x12, x13, x14, x15}
++
++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
++    x0, x4, x8, x12,\
++    x1, x5, x9,
x13,\ ++ x2, x6, x10, x14,\ ++ x3, x7, x11, x15} ++ ++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ ++ x0, x4, x1, x8,\ ++ x5, x2, x12, x9,\ ++ x6, x3, x13, x10,\ ++ x7, x14, x11, x15} ++ ++ ++static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz, ++ uint8_t * const significant_coeff_group_flag, ++ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, ++ int * const pPrev_sig) ++{ ++ while (--i >= 0) { ++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; ++ const unsigned int x_cg = scan_x_cg[i]; ++ ++ // For the flag decode we only care about Z/NZ but ++ // we use the full Right * 2 + Down when calculating ++ // significant coeff flags so we obtain it here. ++ // ++ // The group flag array is one longer than it needs to ++ // be so we don't need to check for y_cg limits ++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); ++ ++ if (i == 0 || ++ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig)) ++ { ++ gf_y[0] |= (1 << x_cg); ++ *pPrev_sig = prev_sig; ++ break; ++ } ++ } ++ ++ return i; ++} ++ ++static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame_stride1(s->frame, c_idx); ++ const unsigned int x = x0 >> ctx_hshift(s, c_idx); ++ const unsigned int y = y0 >> ctx_vshift(s, c_idx); ++ const int is_sliced = 1; // av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int i = jb->intra.n; ++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ jb->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ cmd->ta.dc = 0; ++ } ++} ++ ++ ++static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame_stride1(s->frame, c_idx); ++ const unsigned int x = x0 >> ctx_hshift(s, c_idx); ++ const unsigned int y = y0 >> ctx_vshift(s, c_idx); ++ const int is_sliced = 1; ++ uint8_t * const dst = !is_sliced ? 
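++/* [Annotation - explanatory note, not part of the original patch code.]
++ * rpi_add_residual() above and rpi_add_dc() here both exploit the sand
++ * frame format's interleaved chroma: when a V (c_idx == 2) command lands
++ * on the same dst as the immediately preceding U (c_idx == 1) command,
++ * the two are merged into a single *_C command rather than queued
++ * separately, and for DC the two 16-bit values are packed into one 32-bit
++ * dc field (U in the low half, V in the high - the "dc.dc |= coeff << 16").
++ */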
++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); ++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); ++ ++ const unsigned int i = jb->intra.n; ++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); ++ ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ jb->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; ++ } ++} ++ ++ ++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, ++ const int log2_trafo_size, const enum ScanType scan_idx, ++ const int c_idx) ++{ ++ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; ++ ++ int last_significant_coeff_x, last_significant_coeff_y; ++ int num_coeff = 0; ++ int prev_subset_coded = 0; ++ ++ int num_last_subset; ++ int x_cg_last_sig, y_cg_last_sig; ++ ++ const uint8_t *scan_x_cg, *scan_y_cg; ++ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; ++ ++ int use_vpu; ++#if RPI_COMPRESS_COEFFS ++ int num_nonzero = 0; ++ int use_compress = 0; ++ int *coeffs32; ++#endif ++ int use_dc = 0; ++ int16_t *coeffs; ++ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero ++ int explicit_rdpcm_flag = 0; ++ int explicit_rdpcm_dir_flag; ++ ++ int i; ++ int shift,scale; ++ const uint8_t *scale_matrix = NULL; ++ uint8_t dc_scale; ++ const int c_idx_nz = (c_idx != 0); ++ const int pred_mode_intra = c_idx_nz ? 
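++/* [Annotation - explanatory note, not part of the original patch code.]
++ * The DC computation in rpi_add_dc() above folds the DC-only inverse
++ * transform and its final rounding into one step: with
++ * shift = max(14 - bit_depth, 0),
++ *
++ *     coeff = (coeffs[0] + 1 + (1 << shift)) >> (shift + 1)
++ *
++ * which appears equivalent to the usual two-stage HEVC DC rule
++ * (((c + 1) >> 1) + (1 << (shift - 1))) >> shift, letting the block be
++ * queued as a single "add DC" command with no coefficient-array pass.
++ */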
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ int prev_sig = 0; ++ int may_hide_sign; ++ ++ int16_t dummy_coeffs[16]; ++ ++ // Derive QP for dequant ++ if (!lc->cu.cu_transquant_bypass_flag) { ++ may_hide_sign = s->ps.pps->sign_data_hiding_flag; ++ ++ if (s->ps.pps->transform_skip_enabled_flag && ++ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { ++ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz); ++ if (transform_skip_flag) { ++ trans_skip_or_bypass = 1; ++ if (lc->cu.pred_mode == MODE_INTRA && ++ s->ps.sps->implicit_rdpcm_enabled_flag && ++ (pred_mode_intra == 10 || pred_mode_intra == 26)) { ++ may_hide_sign = 0; ++ } ++ } ++ } ++ ++ { ++ static const uint8_t level_scale[8] = { ++ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8 ++ }; ++ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y]; ++ ++ // Shift is set to one less than will actually occur as the scale ++ // and saturate step adds 1 and then shifts right again ++ scale = level_scale[qp6 & 7]; ++// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3); ++ shift = log2_trafo_size - (qp6 >> 3); ++ ++ if (shift < 0) { ++ scale <<= -shift; ++ shift = 0; ++ } ++ } ++ ++ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { ++ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ? ++ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; ++ const unsigned int matrix_id = ++ lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx; ++ ++ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; ++ dc_scale = scale_matrix[0]; ++ if (log2_trafo_size >= 4) ++ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; ++ } ++ else ++ { ++ static const uint8_t sixteen_scale[64] = { ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16 ++ }; ++ scale_matrix = sixteen_scale; ++ dc_scale = 16; ++ } ++ } else { ++ static const uint8_t unit_scale[64] = { ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ }; ++ scale_matrix = unit_scale; ++ shift = 0; ++ scale = 2; // We will shift right to kill this ++ dc_scale = 1; ++ ++ may_hide_sign = 0; ++ } ++ ++ ++ ++ ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && ++ trans_skip_or_bypass) { ++ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz); ++ if (explicit_rdpcm_flag) { ++ may_hide_sign = 0; ++ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz); ++ } ++ } ++ ++ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size, ++ &last_significant_coeff_x, &last_significant_coeff_y); ++ ++ if (last_significant_coeff_x > 3) { ++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x); ++ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * ++ (2 + (last_significant_coeff_x & 1)) + ++ suffix; ++ } ++ ++ if (last_significant_coeff_y > 3) { ++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y); ++ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * ++ (2 + (last_significant_coeff_y & 1)) + ++ suffix; ++ } ++ ++ if (scan_idx == SCAN_VERT) ++ FFSWAP(int, 
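++/* [Annotation - explanatory note, not part of the original patch code.]
++ * The dequant setup above implements the spec rule
++ *
++ *     level' = (level * m * levelScale[qp % 6] << (qp / 6))
++ *                  >> (bitDepth + log2TrafoSize - 5)
++ *
++ * with the qp / 6 and bit-depth terms pre-folded: qp_divmod6[][] appears
++ * to hold (qp % 6) in its low three bits (the level_scale[qp6 & 7] lookup)
++ * and the merged shift adjustment above them (qp6 >> 3), while
++ * trans_scale_sat() supplies the final "+ 1 >> 1" rounding - hence the
++ * comment that shift is set one lower than the spec value.
++ */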
last_significant_coeff_x, last_significant_coeff_y); ++ ++ x_cg_last_sig = last_significant_coeff_x >> 2; ++ y_cg_last_sig = last_significant_coeff_y >> 2; ++ ++ switch (scan_idx) { ++ case SCAN_DIAG: { ++ int last_x_c = last_significant_coeff_x & 3; ++ int last_y_c = last_significant_coeff_y & 3; ++ ++ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; ++ ++ switch (log2_trafo_size) { ++ case 2: ++ scan_x_cg = scan_1x1; ++ scan_y_cg = scan_1x1; ++ break; ++ case 3: ++ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = diag_scan2x2_x; ++ scan_y_cg = diag_scan2x2_y; ++ break; ++ case 4: ++ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x; ++ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y; ++ break; ++ case 5: ++ default: ++ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x; ++ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y; ++ break; ++ } ++ break; ++ } ++ case SCAN_HORIZ: ++ scan_x_cg = horiz_scan2x2_x; ++ scan_y_cg = horiz_scan2x2_y; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; ++ break; ++ default: //SCAN_VERT ++ scan_x_cg = horiz_scan2x2_y; ++ scan_y_cg = horiz_scan2x2_x; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; ++ break; ++ } ++ num_coeff++; ++ num_last_subset = (num_coeff - 1) >> 4; ++ ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing ++ use_vpu = 0; ++ use_dc = (num_coeff == 1) && !special && ++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); ++ ++ if (use_dc) { ++ // Just need a little empty space ++ coeffs = dummy_coeffs; ++ // No need to clear ++ } ++ else ++ { ++ use_vpu = !special && log2_trafo_size >= 4; ++#if RPI_COMPRESS_COEFFS ++ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed; ++#endif ++ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if RPI_COMPRESS_COEFFS ++ coeffs32 = (int*)coeffs; ++ if (!use_compress) ++#endif ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } ++ } ++ ++ i = num_last_subset; ++ do { ++ int implicit_non_zero_coeff = 0; ++ int n_end; ++ ++ uint8_t significant_coeff_flag_idx[16]; ++ unsigned int nb_significant_coeff_flag = 0; ++ ++ if (i == num_last_subset) { ++ // First time through ++ int last_scan_pos = num_coeff - (i << 4) - 1; ++ n_end = last_scan_pos - 1; ++ significant_coeff_flag_idx[0] = last_scan_pos; ++ nb_significant_coeff_flag = 1; ++ } else { ++ n_end = 15; ++ implicit_non_zero_coeff = (i != 0); ++ } ++ ++ if (n_end >= 0) { ++ static const uint8_t ctx_idx_maps_ts2[3][16] = { ++ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 ++ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 ++ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 ++ }; ++ // N.B. 
prev_sig = Right * 2 + Down ++ static const uint8_t ctx_idx_maps[3][4][16] = { ++ { ++ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ }, ++ { ++ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ }, ++ { ++ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ } ++ }; ++ const uint8_t *ctx_idx_map_p; ++ int scf_offset = 0; ++ ++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { ++ ctx_idx_map_p = ctx_idx_maps[0][3]; ++ scf_offset = 40 + c_idx_nz; ++ } else { ++ if (c_idx_nz != 0) ++ scf_offset = 27; ++ ++ if (log2_trafo_size == 2) { ++ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; ++ } else { ++ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; ++ if (!c_idx_nz) { ++ if (i != 0) ++ scf_offset += 3; ++ ++ if (log2_trafo_size == 3) { ++ scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; ++ } else { ++ scf_offset += 21; ++ } ++ } else { ++ if (log2_trafo_size == 3) ++ scf_offset += 9; ++ else ++ scf_offset += 12; ++ } ++ } ++ } ++ ++ if (n_end > 0) { ++ int cnt = get_sig_coeff_flag_idxs(&lc->cc, ++ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, ++ n_end, ctx_idx_map_p, ++ significant_coeff_flag_idx + nb_significant_coeff_flag); ++ ++ nb_significant_coeff_flag += cnt; ++ if (cnt != 0) { ++ implicit_non_zero_coeff = 0; ++ } ++ } ++ ++ if (implicit_non_zero_coeff == 0) { ++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { ++ scf_offset = 42 + c_idx_nz; ++ } else { ++ if (i == 0) { ++ scf_offset = c_idx_nz ? 27 : 0; ++ } else { ++ scf_offset = 2 + scf_offset; ++ } ++ } ++ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++ } else { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++ } ++#if RPI_COMPRESS_COEFFS ++ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full! ++ int16_t temp[32*32]; ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0; ++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer ++ memcpy(temp, coeffs, sizeof(int)*num_nonzero); ++ coeffs32 = (int *)temp; ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ num_nonzero--; ++ while (num_nonzero >= 0) { ++ const unsigned int res = coeffs32[num_nonzero]; ++ const unsigned int offset = res & 0xffff; ++ coeffs[ offset ] = res >> 16; ++ num_nonzero--; ++ } ++ use_compress = 0; ++ } ++#endif ++ ++ if (nb_significant_coeff_flag != 0) { ++ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | ++ ((i != 0 && !c_idx_nz) ? 
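++/* [Annotation - explanatory note, not part of the original patch code.]
++ * gt1_idx_delta, being assembled here, is the spec's
++ * coeff_abs_level_greater1 "context set": +4 for chroma, +2 for luma
++ * subsets after the first, +1 when the previous subset coded a greater1
++ * flag. Each set holds four greater1 contexts (hence the << 2 for
++ * idx0_gt1) but only a single greater2 context.
++ */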
2 : 0) |
++                                           prev_subset_coded;
++            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
++                                          (gt1_idx_delta << 2);
++            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
++                                         gt1_idx_delta;
++
++            const unsigned int x_cg = scan_x_cg[i];
++            const unsigned int y_cg = scan_y_cg[i];
++            int16_t * const blk_coeffs = coeffs +
++                ((x_cg + (y_cg << log2_trafo_size)) << 2);
++            // This calculation is 'wrong' for log2_trafo_size == 2
++            // but that doesn't matter as in this case x_cg & y_cg
++            // are always 0 so result is correct (0) anyway
++            const uint8_t * const blk_scale = scale_matrix +
++                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
++
++            // * The following code block doesn't deal with these flags:
++            //   (nor did the one it replaces)
++            //
++            //   cabac_bypass_alignment_enabled_flag
++            //     This should be easy but I can't find a test case
++            //   extended_precision_processing_flag
++            //     This can extend the required precision past 16bits
++            //     so is probably tricky - also no example found yet
++
++#if USE_N_END_1
++            if (nb_significant_coeff_flag == 1) {
++                // There is a small gain to be had from special casing the single
++                // transform coefficient case. The reduction in complexity
++                // makes up for the code duplication.
++
++                int trans_coeff_level = 1;
++                int coeff_sign_flag;
++                int coded_val = 0;
++
++                // initialize first elem of coeff_abs_level_greater1_flag
++                prev_subset_coded = 0;
++
++                if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) {
++                    trans_coeff_level = 2;
++                    prev_subset_coded = 1;
++                    coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2);
++                }
++
++                // Probably not worth the overhead of starting by22 for just one value
++                coeff_sign_flag = get_cabac_bypass(&lc->cc);
++
++                if (coded_val)
++                {
++                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
++                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0);
++                    } else {
++                        uint8_t * const stat_coeff =
++                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++                        const unsigned int c_rice_param = *stat_coeff >> 2;
++                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param);
++
++                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
++                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++                    }
++                }
++
++                {
++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
++                    const unsigned int scale_m = blk_scale[xy_off->scale];
++                    const int res = trans_scale_sat(
++                        (trans_coeff_level ^ k) - k,  // Apply sign
++                        scale,
++                        i == 0 && xy_off->coeff == 0 ?
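++/* [Annotation - explanatory note, not part of the original patch code.]
++ * "(x ^ k) - k" above is branchless sign application:
++ * k = (int32_t)(sign << 31) >> 31 is 0 for positive and -1 for negative,
++ * and (x ^ -1) - (-1) == ~x + 1 == -x. E.g. x = 5, k = -1:
++ * (5 ^ -1) - (-1) = -6 + 1 = -5.
++ */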
dc_scale : scale_m, ++ shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) ++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); ++ else ++#endif ++ blk_coeffs[xy_off->coeff] = res; ++ } ++ } ++ else ++#endif ++ { ++ int sign_hidden = may_hide_sign; ++ int levels[16]; // Should be able to get away with int16_t but that fails some tests ++ uint32_t coeff_sign_flags; ++ uint32_t coded_vals = 0; ++ // Sum(abs(level[])) ++ // In fact we only need the bottom bit and in some future ++ // version that may be all we calculate ++ unsigned int sum_abs; ++ ++ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels, ++ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); ++ ++ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) ++ sign_hidden = 0; ++ ++ // -- Start bypass block ++ ++ bypass_start(&lc->cc); ++ ++ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden); ++ ++ if (coded_vals != 0) ++ { ++ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; ++ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : ++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); ++ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2; ++ int * level = levels - 1; ++ ++ do { ++ { ++ const unsigned int z = hevc_clz32(coded_vals) + 1; ++ level += z; ++ coded_vals <<= z; ++ } ++ ++ { ++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param); ++ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; ++ ++ sum_abs += last_coeff_abs_level_remaining + 1; ++ *level = trans_coeff_level; ++ ++ if (stat_coeff != NULL) ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ stat_coeff = NULL; ++ ++ if (trans_coeff_level > (3 << c_rice_param) && ++ (c_rice_param < 4 || rice_adaptation_enabled)) ++ ++c_rice_param; ++ } ++ } while (coded_vals != 0); ++ } ++ ++ // sign_hidden = 0 or 1 so we can combine the tests ++ if ((sign_hidden & sum_abs) != 0) { ++ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; ++ } ++ ++ bypass_finish(&lc->cc); ++ ++ // -- Finish bypass block ++ ++ // Scale loop ++ { ++ int m = nb_significant_coeff_flag - 1; ++ ++ // Deal with DC component (if any) first ++ if (i == 0 && significant_coeff_flag_idx[m] == 0) ++ { ++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; ++ const int res = trans_scale_sat( ++ (levels[m] ^ k) - k, scale, dc_scale, shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) ++ { ++ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); ++ } ++ else ++#endif ++ { ++ blk_coeffs[0] = res; ++ } ++ --m; ++ } ++ ++#if !USE_N_END_1 ++ // If N_END_1 set then m was at least 1 initially ++ if (m >= 0) ++#endif ++ { ++ do { ++ const xy_off_t * const xy_off = scan_xy_off + ++ significant_coeff_flag_idx[m]; ++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; ++ const int res = trans_scale_sat( ++ (levels[m] ^ k) - k, ++ scale, ++ blk_scale[xy_off->scale], ++ shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) { ++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); ++ } else ++#endif ++ blk_coeffs[xy_off->coeff] = res; ++ } while (--m >= 0); ++ } ++ } ++ ++ } ++ } ++ } while ((i = next_subset(lc, i, c_idx_nz, ++ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 && ++ !cabac_overflow(&lc->cc)); ++ ++ if 
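++/* [Annotation - explanatory note, not part of the original patch code.]
++ * Sign data hiding, as decoded above: when SDH is enabled and the first
++ * and last significant coefficients of the subset are more than three
++ * scan positions apart, one sign bin is omitted from the bypass run
++ * (nb_significant_coeff_flag - sign_hidden) and the sign of the lowest
++ * scan position's level is instead inferred from the parity of the sum
++ * of absolute levels - "(sign_hidden & sum_abs) != 0" negates it when
++ * that sum is odd.
++ */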
(lc->cu.cu_transquant_bypass_flag) { ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++ (pred_mode_intra == 10 || pred_mode_intra == 26))) { ++ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag; ++ ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else { ++ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass ++ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && ++ log2_trafo_size == 2 && ++ lc->cu.pred_mode == MODE_INTRA; ++ if (rot) { ++ for (i = 0; i < 8; i++) ++ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); ++ } ++ ++ s->hevcdsp.dequant(coeffs, log2_trafo_size); ++ ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++ lc->cu.pred_mode == MODE_INTRA && ++ (pred_mode_intra == 10 || pred_mode_intra == 26))) { ++ int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26); ++ ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { ++ s->hevcdsp.transform_4x4_luma(coeffs); ++ } ++ else if (!use_vpu) ++ { ++ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); ++ if (max_xy == 0) ++ { ++ if (use_dc) ++ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); ++ else ++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ } ++ else { ++ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; ++ if (max_xy < 4) ++ col_limit = FFMIN(4, col_limit); ++ else if (max_xy < 8) ++ col_limit = FFMIN(8, col_limit); ++ else if (max_xy < 12) ++ col_limit = FFMIN(24, col_limit); ++ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); ++ } ++ } ++ } ++ ++#if 0 ++ // Mildly rotted - we support no mode where cross is valid ++ if (lc->tu.cross_pf) { ++ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; ++ const int ccount = 1 << (log2_trafo_size * 2); ++ ++ for (i = 0; i < ccount; i++) { ++ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); ++ } ++ } ++#endif ++ ++ if (!use_dc) { ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) { ++ coeffs32[num_nonzero] = 0; ++ } ++#endif ++ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); ++ } ++} ++ ++#if !USE_BY22 ++// Stores results to lc ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) ++{ ++ int x = abs_mvd_greater0_flag_decode(lc); ++ int y = abs_mvd_greater0_flag_decode(lc); ++ ++ if (x) ++ x += abs_mvd_greater1_flag_decode(lc); ++ if (y) ++ y += abs_mvd_greater1_flag_decode(lc); ++ ++ switch (x) { ++ case 2: x = mvd_decode(lc); break; ++ case 1: x = mvd_sign_flag_decode(lc); break; ++ case 0: x = 0; break; ++ } ++ ++ switch (y) { ++ case 2: y = mvd_decode(lc); break; ++ case 1: y = mvd_sign_flag_decode(lc); break; ++ case 0: y = 0; break; ++ } ++ return MV_XY(x,y); ++} ++#else ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) ++{ ++ int x = abs_mvd_greater0_flag_decode(lc); ++ int y = abs_mvd_greater0_flag_decode(lc); ++ ++ if ((x | y) == 0) ++ return 0; ++ ++ if (x != 0) ++ x += abs_mvd_greater1_flag_decode(lc); ++ if (y != 0) ++ y += abs_mvd_greater1_flag_decode(lc); ++ ++ if ((x | y) == 1) ++ { ++ // Not worth starting BY22 ++ if (x != 0) ++ x = mvd_sign_flag_decode(lc); ++ if (y != 0) ++ y = mvd_sign_flag_decode(lc); ++ } ++ else ++ { ++ CABACContext * const cc = &lc->cc; ++ uint32_t val; ++ uint32_t b; ++ unsigned int n = 0; ++ ++ bypass_start(cc); ++ b = val = 
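++/* [Annotation - explanatory note, not part of the original patch code.]
++ * abs_mvd_minus2 is Exp-Golomb order 1: k leading 1-bits plus a stop bit
++ * are followed by exactly k suffix bits, giving value = suffix + (1 << k)
++ * (2..3 for k = 1, 4..7 for k = 2, ...), then one bypass sign bin. The
++ * code below pulls prefix, suffix and sign out of a single by22 peek
++ * whenever 2k + 2 bits fit within CABAC_BY22_PEEK_BITS, flushing and
++ * re-peeking only when they do not.
++ */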
get_cabac_by22_peek(cc); ++ ++ if (x == 1) { ++ x = ((int32_t)b >> 31) | 1; ++ n = 1; ++ b <<= 1; ++ } ++ else if (x == 2) { ++ // EG1 so we have (leading one bits + 1) of suffix ++ // This makes prefix & suffix lengths the same ++ const unsigned int k = hevc_clz32(~b) + 1; ++ int s; ++ ++ av_assert2(k <= 15); ++ ++ b <<= k; ++ n = 2 * k + 1; // Includes suffix & sign ++ ++ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked ++ // if we are going to do this without a flush ++ if (k > CABAC_BY22_PEEK_BITS / 2 - 1) ++ { ++ // Need too many bits - flush ++ // n = k ++ get_cabac_by22_flush(cc, k, val); ++ b = val = get_cabac_by22_peek(cc); ++ n = k + 1; ++ } ++ ++ x = (b >> (32 - k)) + (1 << k); ++ b <<= k; ++ s = (int32_t)b >> 31; ++ x = (x ^ s) - s; ++ b <<= 1; ++ ++ // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits) ++ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15) ++ { ++ get_cabac_by22_flush(cc, n, val); ++ b = val = get_cabac_by22_peek(cc); ++ n = 0; ++ } ++ } ++ ++ if (y == 1) { ++ y = ((int32_t)b >> 31) | 1; ++ ++n; ++ // don't care about b anymore ++ } ++ else if (y == 2) { ++ const unsigned int k = hevc_clz32(~b) + 1; ++ int s; ++ ++ av_assert2(k <= 15); ++ ++ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked ++ // if we are going to do this without a flush ++ b <<= k; ++ n += 2 * k + 1; ++ ++ if (n > CABAC_BY22_PEEK_BITS) ++ { ++ // Need too many bits - flush ++ get_cabac_by22_flush(cc, n - (k + 1), val); ++ b = val = get_cabac_by22_peek(cc); ++ n = k + 1; ++ } ++ ++ y = (b >> (32 - k)) + (1 << k); ++ s = (int32_t)(b << k) >> 31; ++ y = (y ^ s) - s; ++ // don't care about b anymore ++ } ++ ++ get_cabac_by22_flush(cc, n, val); ++ bypass_finish(cc); ++ } ++ ++ return MV_XY(x, y); ++} ++#endif +diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h +new file mode 100644 +index 0000000000..ca191f00d9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_cabac_fns.h +@@ -0,0 +1,217 @@ ++/* ++ * HEVC CABAC decoding ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2018 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H ++#define AVCODEC_RPI_HEVC_CABAC_FNS_H ++ ++#include "config.h" ++#include "rpi_hevcdec.h" ++ ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc); ++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags); ++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size); ++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH); ++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx); ++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx); ++ ++//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, ++ const int log2_trafo_size, const enum ScanType scan_idx, ++ const int c_idx); ++ ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); ++ ++#define HEVC_BIN_SAO_MERGE_FLAG 0 ++#define HEVC_BIN_SAO_TYPE_IDX 1 ++#define HEVC_BIN_SAO_EO_CLASS 2 ++#define HEVC_BIN_SAO_BAND_POSITION 2 ++#define HEVC_BIN_SAO_OFFSET_ABS 2 ++#define HEVC_BIN_SAO_OFFSET_SIGN 2 ++#define HEVC_BIN_END_OF_SLICE_FLAG 2 ++#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2 ++#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5 ++#define HEVC_BIN_SKIP_FLAG 6 ++#define HEVC_BIN_CU_QP_DELTA 9 ++#define HEVC_BIN_PRED_MODE 12 ++#define HEVC_BIN_PART_MODE 13 ++#define HEVC_BIN_PCM_FLAG 17 ++#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17 ++#define HEVC_BIN_MPM_IDX 18 ++#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18 ++#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18 ++#define HEVC_BIN_MERGE_FLAG 20 ++#define HEVC_BIN_MERGE_IDX 21 ++#define HEVC_BIN_INTER_PRED_IDC 22 ++#define HEVC_BIN_REF_IDX_L0 27 ++#define HEVC_BIN_REF_IDX_L1 29 ++#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31 ++#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33 ++#define HEVC_BIN_ABS_MVD_MINUS2 35 ++#define HEVC_BIN_MVD_SIGN_FLAG 35 ++#define HEVC_BIN_MVP_LX_FLAG 35 ++#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36 ++#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37 ++#define HEVC_BIN_CBF_LUMA 40 ++#define 
HEVC_BIN_CBF_CB_CR 42 ++#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46 ++#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48 ++#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88 ++#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88 ++#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92 ++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136 ++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160 ++#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166 ++#define HEVC_BIN_COEFF_SIGN_FLAG 166 ++#define HEVC_BIN_LOG2_RES_SCALE_ABS 166 ++#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174 ++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176 ++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177 ++ ++ ++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state); ++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c); ++ ++static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) { ++ const uint8_t *ptr = c->bytestream; ++ ++ if (c->low & 0x1) ++ ptr--; ++#if CABAC_BITS == 16 ++ if (c->low & 0x1FF) ++ ptr--; ++#endif ++ if ((int) (c->bytestream_end - ptr) < n) ++ return NULL; ++ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0) ++ return NULL; ++ ++ return ptr; ++} ++ ++static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG); ++} ++ ++static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG); ++} ++ ++static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG); ++} ++ ++static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int ct_depth, ++ const unsigned int x0, const unsigned int y0) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG + ++ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + ++ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth)); ++} ++ ++static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, const int x_cb, const int y_cb) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + ++ (s->cabac_stash_left[y0 >> 3] & 1) + ++ (s->cabac_stash_up[x0 >> 3] & 1)); ++} ++ ++static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE); ++} ++ ++static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac_terminate(&lc->cc); ++} ++ ++static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE); ++} ++ ++static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG); ++} ++ ++static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return 
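++/* [Annotation - explanatory note, not part of the original patch code.]
++ * The split/skip context selection above relies on cabac_stash_left[] /
++ * cabac_stash_up[] holding, per 8-pel row/column, the neighbour's state
++ * packed as (ct_depth << 1) | skip_flag: ">> 1" recovers the neighbour's
++ * coding-tree depth for split_coding_unit_flag and "& 1" its skip flag
++ * for skip_flag, each neighbour contributing 0 or 1 to the context index.
++ */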
ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG);
++}
++
++static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc)
++{
++    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG);
++}
++
++static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
++{
++    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth);
++}
++
++static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
++{
++    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth);
++}
++
++static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size)
++{
++    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size);
++}
++
++static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx)
++{
++    return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx);
++}
++
++
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c
+new file mode 100644
+index 0000000000..341bb77d9d
+--- /dev/null
++++ b/libavcodec/rpi_hevc_data.c
+@@ -0,0 +1,75 @@
++/*
++ * HEVC shared tables
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "rpi_hevc_data.h"
++
++const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = {
++    0, 0, 1, 0,
++    1, 2, 0, 1,
++    2, 3, 1, 2,
++    3, 2, 3, 3,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = {
++    0, 1, 0, 2,
++    1, 0, 3, 2,
++    1, 0, 3, 2,
++    1, 3, 2, 3,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = {
++    0, 0, 1, 0,
++    1, 2, 0, 1,
++    2, 3, 0, 1,
++    2, 3, 4, 0,
++    1, 2, 3, 4,
++    5, 0, 1, 2,
++    3, 4, 5, 6,
++    0, 1, 2, 3,
++    4, 5, 6, 7,
++    1, 2, 3, 4,
++    5, 6, 7, 2,
++    3, 4, 5, 6,
++    7, 3, 4, 5,
++    6, 7, 4, 5,
++    6, 7, 5, 6,
++    7, 6, 7, 7,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = {
++    0, 1, 0, 2,
++    1, 0, 3, 2,
++    1, 0, 4, 3,
++    2, 1, 0, 5,
++    4, 3, 2, 1,
++    0, 6, 5, 4,
++    3, 2, 1, 0,
++    7, 6, 5, 4,
++    3, 2, 1, 0,
++    7, 6, 5, 4,
++    3, 2, 1, 7,
++    6, 5, 4, 3,
++    2, 7, 6, 5,
++    4, 3, 7, 6,
++    5, 4, 7, 6,
++    5, 7, 6, 7,
++};
+diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h
+new file mode 100644
+index 0000000000..0aee673d8b
+--- /dev/null
++++ b/libavcodec/rpi_hevc_data.h
+@@ -0,0 +1,31 @@
++/*
++ * HEVC shared data tables
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_DATA_H
++#define AVCODEC_RPI_HEVC_DATA_H
++
++#include <stdint.h>
++
++extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
++extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
++extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
++extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
++
++#endif /* AVCODEC_RPI_HEVC_DATA_H */
+diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c
+new file mode 100644
+index 0000000000..5125d1eb6b
+--- /dev/null
++++ b/libavcodec/rpi_hevc_filter.c
+@@ -0,0 +1,1210 @@
++/*
++ * HEVC video decoder
++ *
++ * Originally by:
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 Seppo Tomperi
++ * Copyright (C) 2013 Wassim Hamidouche
++ *
++ * Substantially rewritten:
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++//#define DISABLE_SAO ++//#define DISABLE_DEBLOCK ++//#define DISABLE_STRENGTHS ++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) ++//#define DISABLE_DEBLOCK_NONREF ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++ ++#include "rpi_hevcdec.h" ++ ++#include "bit_depth_template.c" ++ ++#include "rpi_qpu.h" ++#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#define LUMA 0 ++#define CB 1 ++#define CR 2 ++ ++// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2 ++// so -12,75 overall ++static const uint8_t tctablex[] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18 ++ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37 ++ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53 ++ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..75 ++}; ++#define tctable (tctablex + 12 + 6*8) ++ ++static const uint8_t betatablex[] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18 ++ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37 ++ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51 ++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..73 ++}; ++#define betatable (betatablex + 12 + 6*8) ++ ++static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y, ++ const int c_idx, const int tc_offset) ++{ ++ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2]; ++} ++ ++static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int xBase, const unsigned int yBase) ++{ ++ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; ++ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size; ++ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask; ++ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask; ++ const unsigned int min_cb_width = s->ps.sps->min_cb_width; ++ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; ++ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; ++ const int qPy_pred = lc->qPy_pred; ++ ++ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred : ++ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) + ++ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred : ++ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1; ++} ++ ++// * Only called from bitstream decode in foreground ++// so should be safe ++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase) ++{ ++ const int qp_y = get_qPy_pred(s, lc, xBase, yBase); ++ ++ if (lc->tu.cu_qp_delta != 0) { ++ // ?? 
I suspect that the -bd_offset here leads to us adding it elsewhere
++        int off = s->ps.sps->qp_bd_offset;
++        lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off,
++                          52 + off) - off;
++    } else
++        lc->qp_y = qp_y;
++}
++
++static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx)
++{
++    return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
++}
++
++// "DSP" these?
++static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
++{
++    switch (pixel_shift)
++    {
++        case 2:
++            *(uint32_t *)dst = *(uint32_t *)src;
++            break;
++        case 1:
++            *(uint16_t *)dst = *(uint16_t *)src;
++            break;
++        default:
++            *dst = *src;
++            break;
++    }
++}
++
++static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,
++    ptrdiff_t stride_src, int x, int y, int width, int height,
++    int c_idx, int x_ctb, int y_ctb)
++{
++    const unsigned int sh = pixel_shift(s, c_idx);
++    const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx);
++    const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx);
++
++    /* copy horizontal edges */
++    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
++        src, width << sh);
++    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
++        src + stride_src * (height - 1), width << sh);
++
++    /* copy vertical edges */
++    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
++
++    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
++}
++
++// N.B. Src & dst are swapped as this is a restore!
++// x0 & y0 are in luma coords
++// Width & height are in Y/C pels as appropriate
++// * Clear scope for optimisation here but not used enough to be worth it
++static void restore_tqb_pixels(const HEVCRpiContext * const s,
++    uint8_t *src1, const uint8_t *dst1,
++    const ptrdiff_t stride_src, const ptrdiff_t stride_dst,
++    const unsigned int x0, const unsigned int y0,
++    const unsigned int width, const int height,
++    const int c_idx)
++{
++    if (s->ps.pps->transquant_bypass_enable_flag ||
++        s->ps.sps->pcm.loop_filter_disable_flag)
++    {
++        const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width;
++        int blks_y = height >> (c_idx == 0 ? 3 : 2);
++        const unsigned int bwidth = 8 << s->ps.sps->pixel_shift;  // Y & C have the same width in sand
++        const unsigned int bheight = (c_idx == 0) ? 8 : 4;
++        const unsigned int sh = ((x0 >> 3) & 7);
++        const unsigned int mask = (1 << (width >> (c_idx == 0 ?
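++/* [Annotation - explanatory note, not part of the original patch code.]
++ * restore_tqb_pixels() walks s->is_pcm as a bitmap holding one bit per
++ * 8x8 luma block, pcm_width bytes per block row: sh picks x0's bit
++ * within its byte and the mask being built here keeps one bit per
++ * covered block, so each set bit copies back one unfiltered block
++ * (lossless and PCM blocks must not be touched by deblock/SAO).
++ */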
3 : 2))) - 1; ++ ++ do { ++ unsigned int m = (*pcm >> sh) & mask; ++ uint8_t * bd = src1; ++ const uint8_t * bs = dst1; ++ while (m != 0) { ++ if ((m & 1) != 0) { ++ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); ++ } ++ m >>= 1; ++ bs += bwidth; ++ bd += bwidth; ++ } ++ src1 += stride_src * bheight; ++ dst1 += stride_dst * bheight; ++ pcm += s->ps.sps->pcm_width; ++ } while (--blks_y > 0); ++ } ++} ++ ++#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) ++ ++static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y) ++{ ++#if SAO_FILTER_N == 5 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#elif SAO_FILTER_N == 6 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#else ++#error Confused by size of sao fn array ++#endif ++ int c_idx; ++ int edges[4]; // 0 left 1 top 2 right 3 bottom ++ int x_ctb = x >> s->ps.sps->log2_ctb_size; ++ int y_ctb = y >> s->ps.sps->log2_ctb_size; ++ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb; ++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs]; ++ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb); ++ // flags indicating unfilterable edges ++ uint8_t vert_edge[] = { 0, 0 }; ++ uint8_t horiz_edge[] = { 0, 0 }; ++ uint8_t diag_edge[] = { 0, 0, 0, 0 }; ++ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb); ++ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag && ++ !s->ps.pps->loop_filter_across_tiles_enabled_flag; ++ uint8_t restore = no_tile_filter || !lfase; ++ uint8_t left_tile_edge = 0; ++ uint8_t right_tile_edge = 0; ++ uint8_t up_tile_edge = 0; ++ uint8_t bottom_tile_edge = 0; ++ const int sliced = 1; ++ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 
3 : 1); ++ ++ edges[0] = x_ctb == 0; ++ edges[1] = y_ctb == 0; ++ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; ++ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; ++ ++#ifdef DISABLE_SAO ++ return; ++#endif ++ ++ if (restore) { ++ if (!edges[0]) { ++ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; ++ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge; ++ } ++ if (!edges[2]) { ++ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]]; ++ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge; ++ } ++ if (!edges[1]) { ++ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]; ++ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge; ++ } ++ if (!edges[3]) { ++ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]]; ++ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge; ++ } ++ if (!edges[0] && !edges[1]) { ++ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge; ++ } ++ if (!edges[1] && !edges[2]) { ++ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge; ++ } ++ if (!edges[2] && !edges[3]) { ++ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge; ++ } ++ if (!edges[0] && !edges[3]) { ++ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge; ++ } ++ } ++ ++ for (c_idx = 0; c_idx < plane_count; c_idx++) { ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const int x0 = x >> hshift; ++ const int y0 = y >> vshift; ++ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx); ++ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift; ++ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift; ++ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0); ++ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0); ++ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; ++ ptrdiff_t stride_dst; ++ uint8_t *dst; ++ ++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = !sliced ? ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : ++ !sliced ? src - (1 << sh) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : ++ !sliced ? src + (width << sh) : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); ++ ++ if (sliced && c_idx > 1) { ++ break; ++ } ++ ++// if (c_idx == 1) ++// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr); ++ ++ switch (sao->type_idx[c_idx]) { ++ case SAO_BAND: ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (s->ps.pps->transquant_bypass_enable_flag || ++ s->ps.sps->pcm.loop_filter_disable_flag) ++ { ++ // Can't use the edge buffer here as it may be in use by the foreground ++ DECLARE_ALIGNED(64, uint8_t, dstbuf) ++ [2*MAX_PB_SIZE*MAX_PB_SIZE]; ++ dst = dstbuf; ++ stride_dst = 2*MAX_PB_SIZE; ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++ { ++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ } else { ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++ { ++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ } ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++ case SAO_EDGE: ++ { ++ const int w = s->ps.sps->width >> hshift; ++ const int h = s->ps.sps->height >> vshift; ++ int top_edge = edges[1]; ++ int bottom_edge = edges[3]; ++ // Can't use the edge buffer here as it may be in use by the foreground ++ DECLARE_ALIGNED(64, uint8_t, dstbuf) ++ [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64]; ++ ++ stride_dst = RPI_HEVC_SAO_BUF_STRIDE; ++ dst = dstbuf + stride_dst + 32; ++ ++ if (!top_edge) { ++ uint8_t *dst1; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); ++ ++ dst1 = dst - stride_dst; ++ ++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); ++ } ++ ++ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); ++ ++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); ++ } ++ } ++ if (!bottom_edge) { ++ uint8_t * const dst1 = dst + height * stride_dst; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); ++ const unsigned int hoff = height * stride_src; ++ ++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 - (1 << sh), src_idx ? 
src_spb - (1 << sh) : src_l + hoff, sh); ++ } ++ ++ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); ++ ++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); ++ } ++ } ++ if (src_l != NULL) { ++ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ ff_hevc_rpi_copy_vert(dst - (1 << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++ ff_hevc_rpi_copy_vert(dst - (1 << sh), ++ src_l, ++ sh, height, stride_dst, stride_src); ++ } ++ } ++ if (src_r != NULL) { ++ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ ff_hevc_rpi_copy_vert(dst + (width << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++ ff_hevc_rpi_copy_vert(dst + (width << sh), ++ src_r, ++ sh, height, stride_dst, stride_src); ++ } ++ } ++ ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); ++ ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (sliced && c_idx != 0) ++ { ++ // Class always the same for both U & V (which is just as well :-)) ++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, ++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], ++ width, height); ++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ else ++ { ++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++ sao->eo_class[c_idx], width, height); ++ s->hevcdsp.sao_edge_restore[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++ } ++ } ++ } ++ ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && ++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) ++ { ++ const unsigned int stride1 = frame_stride1(s->frame, 1); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); ++ const unsigned int xoff = (x >> 8) * stride2 * stride1; ++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; ++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; ++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; ++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; ++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); ++ const unsigned int hy = !edges[3] ? 
ctb_size : s->ps.sps->height - y; ++ ++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); ++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); ++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); ++ } ++#endif ++} ++ ++// When bits are delivered to deblock we want them ++//#define TL 1 ++//#define TR 2 ++//#define BL 4 ++//#define BR 8 ++ ++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br ++// so we need to rearrange before passing on ++ ++static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; ++ return (pcm[0] | ++ (pcm[1] << 8) | ++ (pcm[s->ps.sps->pcm_width] << 16) | ++ (pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7); ++} ++ ++static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; ++ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); ++} ++ ++// We cast away const here as we want this to work for both get and set ++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) ++{ ++ return (uint32_t *)(bs + ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#warning Unexpected masks ++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size ++#endif ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); ++} ++ ++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) ++{ ++ return (uint8_t *)(bs + ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); ++} ++ ++ ++// Get block strength ++// Given how we call we will always get within the 32bit boundries ++static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2, ++ unsigned int xl, unsigned int xr, const unsigned int y) ++{ ++ if (xr <= xl) { ++ return 0; ++ } ++ else ++ { ++#if HAVE_ARMV6T2_INLINE ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#error This case not yet handled in bs_get32 ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size ++#endif ++ uint32_t tmp; ++ __asm__ ( ++ "lsr %[tmp], %[xl], %[xl_shift] \n\t" ++ "rsb %[xr], %[xl], %[xr] \n\t" ++ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t" ++ "add %[xr], %[xr], #7 \n\t" ++ "lsr %[bs], %[y], %[y_shift1] \n\t" ++ "bic %[xr], %[xr], #7 \n\t" ++ "ubfx %[xl], %[xl], #1, #5 \n\t" ++ "lsr %[xr], %[xr], #1 \n\t" ++ "cmp %[xr], #32 \n\t" ++ "mvn %[tmp], #0 \n\t" ++ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" ++ "lsl %[tmp], %[tmp], %[xr] \n\t" ++ "lsr %[xl], %[bs], %[xl] \n\t" ++ "it ne \n\t" ++ "bicne %[bs], %[xl], %[tmp] \n\t" ++ : // Outputs ++ [bs]"+r"(bs), ++ [stride2]"+r"(stride2), ++ [xl]"+r"(xl), ++ [xr]"+r"(xr), ++ 
[tmp]"=&r"(tmp) ++ : // Inputs ++ [y]"r"(y), ++ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT), ++ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR), ++ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ : // Clobbers ++ "cc" ++ ); ++ return (uint32_t) bs; ++#else ++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y); ++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; ++ ++ return n == 32 ? a : ++ (a >> ((xl >> 1) & 31)) & ~(~0U << n); ++#endif ++ } ++} ++ ++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); ++} ++ ++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); ++} ++ ++ ++static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1); ++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; ++ const DBParams * cb_dbp = s->deblock + ctb_n; ++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8); ++ ++ unsigned int cb_x; ++ ++ // Do in CTB-shaped blocks ++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp) ++ { ++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); ++ const unsigned int bv_l = FFMAX(cb_x, 8); ++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9; ++ const unsigned int bh_l = bv_l - 8; ++ unsigned int y; ++ ++ // Main body ++ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) ++ { ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); ++ ++ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp; ++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ ++ if (vbs != 0) ++ { ++ const uint8_t * const tcv = tctable + dbp->tc_offset; ++ const uint8_t * const betav = betatable + dbp->beta_offset; ++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); ++ unsigned int x; ++ ++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) ++ { ++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) ++ { ++ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betav[qp], ++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | ++ (((vbs & 0xc) == 0 ? 
0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), ++ pcmfa & 3, ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); ++ } ++ } ++ } ++ ++ if (y != 0) ++ { ++ uint32_t hbs; ++ ++ // H left - mostly separated out so we only need a uint32_t hbs ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) ++ { ++ const unsigned int x = bh_l; ++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const DBParams * const dbph = dbp - 1; ++ const uint8_t * const tc = tctable + dbph->tc_offset + qp; ++ ++ av_assert2(cb_x - bh_l == 8); ++ ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbph->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } ++ ++ // H ++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); ++ ++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) ++ { ++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) ++ { ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + dbp->tc_offset + qp; ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbp->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } ++ } ++ } ++ } ++ ++ } ++ } ++} ++ ++static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1; ++} ++ ++static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 8); ++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; ++ const DBParams * dbp = s->deblock + ctb_n; ++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 
0 : 8); ++ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; ++ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; ++ ++ unsigned int cb_x; ++ ++ av_assert1((bounds.x & (ctb_size - 1)) == 0); ++ av_assert1((bounds.y & (ctb_size - 1)) == 0); ++ av_assert1(bounds.h <= ctb_size); ++ ++ // Do in CTB-shaped blocks ++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) { ++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); ++ const unsigned int bv_l = FFMAX(cb_x, 16); ++ unsigned int y; ++ ++ // V above ++ if (bounds.y != 0) { ++ // Deblock V up 8 ++ // CTB above current ++ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm ++ const unsigned int y = bounds.y - 8; ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; ++ ++ if (vbs != 0) ++ { ++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); ++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; ++ unsigned int x; ++ ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) ++ { ++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) ++ { ++ const int qp0 = q2h(s, x, y); ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ pcmfa & 3); ++ } ++ } ++ } ++ } ++ ++ for (y = bounds.y; y < b_b; y += 16) ++ { ++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | ++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); ++ ++ // V ++ if (vbs != 0) ++ { ++ unsigned int x; ++ unsigned int pcmfa = ++ (y + 16 > b_b ? ++ pcm2(s, bv_l - 1, y) | 0xffff0000 : ++ pcm4(s, bv_l - 1, y)); ++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; ++ ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) ++ { ++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const int qp0 = q2h(s, x, y); ++ const int qp1 = q2h(s, x, y + 8); ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ } ++ ++ // H ++ if (y != 0) ++ { ++ uint32_t hbs; ++ const unsigned int bh_l = bv_l - 16; ++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; ++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ ++ // H left - mostly separated out so we only need a uint32_t hbs ++ // Stub is width 8 to the left of bounds, but width 16 internally ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) ++ { ++ unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ ++ // Chop off bits we don't want... 
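++                // (pcm4() packs the four flags as b0=TL, b1=TR, b16=BL,
++                //  b17=BR, so ORing in 0x10001 marks the left column as PCM
++                //  and hbs &= ~3 clears the strength for the out-of-bounds
++                //  stub.)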
++                if (bh_l < bounds.x) {
++                    pcmfa |= 0x10001;  // TL|BL pre rearrangement
++                    hbs &= ~3;  // Make BS 0
++                }
++
++                // Double check we still want this
++                if (hbs != 0 && (~pcmfa & 0x30003) != 0)
++                {
++                    const unsigned int x = bh_l;
++                    const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++                    const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++                    const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;
++
++                    s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++                                                     frame_stride1(s->frame, 1),
++                                                     ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++                                                     ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++                                                     (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++                }
++            }
++
++            // H main
++            if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
++            {
++                unsigned int x;
++                unsigned int pcmfa = pcm4(s, cb_x, y - 1);  // Might like to mask out far right writes but probably not worth it
++
++                for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
++                {
++                    if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
++                    {
++                        const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++                        const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++                        const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++                        s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++                                                         frame_stride1(s->frame, 1),
++                                                         ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++                                                         ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++                                                         (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++                    }
++                }
++            }
++        }
++    }
++}
++
++static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
++{
++    return x & ~(~0U << log2_n);
++}
++
++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++{
++    av_assert2((y & 7) == 0);
++
++    // This doesn't have the same simultaneous update issues that bsf_stash
++    // does (other threads will have a different y) so we can do it the easy way
++    if ((bsf &= mask) != 0)
++        *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31);
++}
++
++
++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++{
++    // We arrange this in a slightly odd fashion but it lines up with
++    // how we are going to use it in the actual deblock code & it is easier
++    // to do the contortions here than there
++    //
++    // Arrange (LE) {x0y0, x0y4, x8y0, x8y4}, {x16y0, x16y4, x24y0, x24y4},...
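++    //
++    // (So within each byte, bit pairs 0-1, 2-3, 4-5 & 6-7 hold x+0/y+0,
++    //  x+0/y+4, x+8/y+0 & x+8/y+4 respectively, which is what the
++    //  sh = ((x & 8) | (y & 4)) >> 1 shift below picks out.)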
++
++    av_assert2((x & 7) == 0);
++
++    if ((bsf &= mask) != 0)
++    {
++        uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
++        const unsigned int sh = ((x & 8) | (y & 4)) >> 1;
++
++        if (mask <= 0xf)
++        {
++            *p |= (bsf << sh);
++        }
++        else
++        {
++            do {
++                *p |= (bsf & 0xf) << sh;
++                p += HEVC_RPI_BS_STRIDE1_BYTES;
++            } while ((bsf >>= 4) != 0);
++        }
++    }
++}
++
++static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
++                              const unsigned int rep, const unsigned int dup,
++                              const unsigned int mvf_stride0,
++                              const unsigned int mvf_stride1,
++                              const RefPicList * const rpl_p, const RefPicList * const rpl_q,
++                              const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
++{
++    return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
++        mvf_p, mvf_q,
++        rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list,
++        sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1);
++}
++
++
++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
++                                               const HEVCRpiLocalContext * const lc,
++                                               const unsigned int x0, const unsigned int y0,
++                                               const unsigned int log2_trafo_size,
++                                               const int is_coded_block)
++{
++    const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0);
++    const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
++    const RefPicList * const rpl = s->refPicList;
++    // Rep count for bsf_mv when running with min_pu chunks
++    const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
++    const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
++    const unsigned int trafo_size = (1U << log2_trafo_size);
++    const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1;
++    const uint32_t bsf_cbf = (bsf_mask & 0x55555555);
++
++    // Do we cover a pred split line?
++    const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
++    const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
++
++    uint32_t bsf_h;
++    uint32_t bsf_v;
++
++#ifdef DISABLE_STRENGTHS
++    return;
++#endif
++
++    // We are always on a size boundary
++    av_assert2((x0 & (trafo_size - 1)) == 0);
++    av_assert2((y0 & (trafo_size - 1)) == 0);
++    // log2_trafo_size not really a transform size; we may have to deal
++    // with size 2^6 blocks
++    av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
++
++    // Retrieve and update coded (b0), intra (b1) bs flags
++    //
++    // Store on min width (rather than uint32_t) to avoid possible issues
++    // with another thread on another core running wpp using the same
++    // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
++    //
++    // In bsf BS=2 is represented by 3 as it is much easier to test & set
++    // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
++    // 3 will work the same
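++    //
++    // (For example, an 8x8 block: trafo_size = 8, so bsf_mask = 0xf and
++    //  bsf_cbf = 0x5 - i.e. 2 bits per 4-pel segment, b0 of each pair
++    //  for "coded", b1 for "intra".)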
++    {
++        // Given where we are called from is_cbf_luma & is_intra will be constant over the block
++        const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask :
++            is_coded_block ? bsf_cbf : 0;
++        uint8_t *const p = s->bsf_stash_up + (x0 >> 4);
++        uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
++
++        switch (log2_trafo_size)
++        {
++            case 2:
++            case 3:
++            {
++                const unsigned int sh_h = (x0 >> 1) & 7;
++                const unsigned int sh_v = (y0 >> 1) & 7;
++                bsf_h = *p;
++                bsf_v = *q;
++                *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h);
++                *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
++                bsf_h >>= sh_h;
++                bsf_v >>= sh_v;
++                break;
++            }
++            case 4:
++                bsf_h = *p;
++                bsf_v = *q;
++                *p = bsf0;
++                *q = bsf0;
++                break;
++            case 5:
++                bsf_h = *(uint16_t *)p;
++                bsf_v = *(uint16_t *)q;
++                *(uint16_t *)p = bsf0;
++                *(uint16_t *)q = bsf0;
++                break;
++            case 6:
++            default:
++                bsf_h = *(uint32_t *)p;
++                bsf_v = *(uint32_t *)q;
++                *(uint32_t *)p = bsf0;
++                *(uint32_t *)q = bsf0;
++                break;
++        }
++
++        bsf_h |= bsf0;
++        bsf_v |= bsf0;
++    }
++
++    // Do Horizontal
++    if ((y0 & 7) == 0)
++    {
++        // Boundary upper
++        if (y0 != 0 &&
++            (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
++             (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
++        {
++            // Look at MVs (BS=1) if we don't already have a full set of bs bits
++            if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
++            {
++                // If we aren't on the top boundary we must be in the middle
++                // and in that case we know where mvf can change
++                const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
++                const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
++                    s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
++                    rpl;
++
++                bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++                    trafo_size >> (log2_min_pu_size + log2_rep),
++                    trafo_size >> (log2_min_pu_size + log2_rep),
++                    rpl, rpl_top,
++                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
++            }
++
++            // Finally put the results into bs
++            hbs_set(s, x0, y0, bsf_mask, bsf_h);
++        }
++
++        // Max of 1 pu internal split - ignore if not on 8pel boundary
++        if (has_y_split && !off_boundary(lc->cu.y_split, 3))
++        {
++            const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
++            // If we have the x split as well then it must be in the middle
++            const unsigned int log2_rep = has_x_split ? 1 : 0;
++
++            hbs_set(s, x0, lc->cu.y_split, bsf_mask,
++                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++                    trafo_size >> (log2_min_pu_size + log2_rep),
++                    trafo_size >> (log2_min_pu_size + log2_rep),
++                    rpl, rpl,
++                    mvf, mvf - MVF_STASH_WIDTH_PU));
++        }
++    }
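++
++    // (Scale check for the bsf_mv() calls above, assuming 4-pel min PUs:
++    //  a 16x16 TU on the CU top edge gives log2_rep = 2, so bsf_mv() runs
++    //  with rep = 4, dup = 1 - one MV-pair comparison per min PU.)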
++
++    // And again for vertical - same logic as horizontal just in the other direction
++    if ((x0 & 7) == 0)
++    {
++        // Boundary left
++        if (x0 != 0 &&
++            (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
++             (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
++        {
++            if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
++            {
++                const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
++                const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
++                    s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
++                    rpl;
++
++                bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++                    (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                    (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                    rpl, rpl_left,
++                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
++            }
++
++            vbs_set(s, x0, y0, bsf_mask, bsf_v);
++        }
++
++        if (has_x_split && !off_boundary(lc->cu.x_split, 3))
++        {
++            const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
++            const unsigned int log2_rep = has_y_split ? 1 : 0;
++
++            vbs_set(s, lc->cu.x_split, y0, bsf_mask,
++                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++                    (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                    (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                    rpl, rpl,
++                    mvf, mvf - 1));
++        }
++    }
++}
++
++#undef LUMA
++#undef CB
++#undef CR
++
++static inline unsigned int ussub(const unsigned int a, const unsigned int b)
++{
++    return a < b ? 0 : a - b;
++}
++
++static inline int cache_boundary(const AVFrame * const frame, const unsigned int x)
++{
++    return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0;
++}
++
++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot)
++{
++    const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++    int x, y;
++
++    const unsigned int br = bounds.x + bounds.w;
++    const unsigned int bb = bounds.y + bounds.h;
++
++    const int x_end = (br >= s->ps.sps->width);
++    const int y_end = (bb >= s->ps.sps->height);
++
++    // Deblock may not touch the edges of the bounds as they are still needed
++    // for Intra pred
++    //
++    // Deblock is disabled with a per-slice flag
++    // Given that bounds may cover multiple slices & we deblock outside bounds
++    // anyway we can't avoid deblock using that flag - about the only thing we
++    // could do is have a "no deblock seen yet" flag but it doesn't really
++    // seem worth the effort
++
++    deblock_y_blk(s, bounds, x_end, y_end);
++    deblock_uv_blk(s, bounds, x_end, y_end);
++
++    // SAO needs
++    // (a) CTB alignment
++    // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel
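++    //
++    // (Worked example, assuming 64-pel CTBs: with bounds.x = 128,
++    //  xo = 128 - ((128 - 16) & ~63) = 64, so SAO starts at xl = 64 -
++    //  one CTB behind deblock, where the down-right pixels are final.)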
++    {
++        const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1));
++        const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1));
++        const unsigned int yt = ussub(bounds.y, yo);
++        const unsigned int yb = y_end ? bb : ussub(bb, yo);
++        const unsigned int xl = ussub(bounds.x, xo);
++        const unsigned int xr = x_end ? br : ussub(br, xo);
++
++        if (s->ps.sps->sao_enabled)
++        {
++            for (y = yt; y < yb; y += ctb_size) {
++                for (x = xl; x < xr; x += ctb_size) {
++                    sao_filter_CTB(s, x, y);
++                }
++            }
++        }
++
++        // Cache invalidate
++        y = 0;
++        if (xr != 0 && yb != 0)
++        {
++            const unsigned int llen =
++                (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame));
++            const unsigned int mask = ~(llen - 1);
++            const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask;
++            const unsigned int ir = x_end || !cache_boundary(s->frame, br) ? br : (xr - 1) & mask;
++            const unsigned int it = ussub(yt, 1);
++            const unsigned int ib = y_end ? bb : yb - 1;
++
++            if (il < ir) {
++                rpi_cache_buf_t cbuf;
++                rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
++                rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++                    il, it, ir - il, ib - it,
++                    ctx_vshift(s, 1), 1, 1);
++
++                // If we have to commit the right hand tile boundary due to
++                // cache boundary considerations then at EoTile we must commit
++                // that boundary to bottom of tile (bounds)
++                if (ib != bb && ir == br && eot) {
++                    rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++                        br - 1, ib, 1, bb - ib,
++                        ctx_vshift(s, 1), 1, 1);
++                }
++
++                rpi_cache_flush_finish(rfe);
++
++                if (x_end)
++                    y = y_end ? INT_MAX : ib;
++
++//                printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
++            }
++        }
++    }
++
++    return y;
++}
++
+diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h
+new file mode 100644
+index 0000000000..6b36f5e737
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mv.h
+@@ -0,0 +1,71 @@
++#ifndef AVCODEC_RPI_HEVC_MV_H
++#define AVCODEC_RPI_HEVC_MV_H
++
++#include "config.h"
++
++typedef int32_t MvXY;
++
++typedef struct HEVCRpiMvField {
++    MvXY xy[2];
++    int8_t ref_idx[2];
++    int8_t pred_flag;
++    int8_t dummy; // To 12 bytes
++} HEVCRpiMvField;
++
++
++#define MV_X(xy) (((xy) << 16) >> 16)
++#define MV_Y(xy) ((xy) >> 16)
++#define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16))
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_mv_arm.h"
++#endif
++
++#ifndef mvxy_add
++static inline MvXY mvxy_add(const MvXY a, const MvXY b)
++{
++    return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b));
++}
++#endif
++
++
++#ifndef mv_scale_xy
++static inline MvXY mv_scale_xy(const MvXY src, int td, int tb)
++{
++    int tx, scale_factor;
++
++    td = td == 0 ? 1 : av_clip_int8(td);
++    tb = av_clip_int8(tb);
++    tx = (0x4000 + (abs(td) >> 1)) / td;
++    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
++    return MV_XY(
++        av_clip_int16((scale_factor * MV_X(src) + 127 +
++               (scale_factor * MV_X(src) < 0)) >> 8),
++        av_clip_int16((scale_factor * MV_Y(src) + 127 +
++               (scale_factor * MV_Y(src) < 0)) >> 8));
++}
++#endif
++
++// 8.3.1 states that the bitstream may not contain poc diffs that do not
++// fit in 16 bits, so given that we don't care about the high bits we only
++// store the low 16 + LT & Inter flags
++
++#define COL_POC_INTRA   0
++#define COL_POC_INTER   (1 << 16)
++#define COL_POC_LT      (1 << 17)
++#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y)))
++#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
++#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
++
++typedef struct ColMv_s {
++    int32_t poc;
++    int32_t xy;
++} ColMv;
++
++typedef struct ColMvField_s {
++    ColMv L[2];
++} ColMvField;
++
++
++
++#endif // AVCODEC_RPI_HEVC_MV_H
+diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
+new file mode 100644
+index 0000000000..27a9f69525
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mvs.c
+@@ -0,0 +1,487 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 Anand Meher Kotra
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++ ++static av_always_inline int ++is_eq_mer(const unsigned int plevel, ++ const unsigned int xN, const unsigned int yN, ++ const unsigned int xP, const unsigned int yP) ++{ ++ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; ++} ++ ++// check if the mv's and refidx are the same between A and B ++static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) ++{ ++ return a->pred_flag == b->pred_flag && ++ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) && ++ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1])); ++ return 0; ++} ++ ++/* ++ * 8.5.3.1.7 temporal luma motion vector prediction ++ */ ++static int temporal_luma_motion_vector(const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, const int x0, const int y0, ++ const int nPbW, const int nPbH, const int refIdxLx, ++ MvXY * const mvLXCol, const int X) ++{ ++ int x, y; ++ const ColMv * cmv = NULL; ++ ++ HEVCRpiFrame * const col_ref = s->ref->collocated_ref; ++ const RefPicList * const refPicList = s->refPicList + X; ++ const int cur_lt = refPicList->isLongTerm[refIdxLx]; ++ ++ *mvLXCol = 0; ++ // Unlikely but we might have a col_ref IDR frame! ++ if (col_ref->col_mvf == NULL) ++ return 0; ++ ++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH); ++ ++ //bottom right collocated motion vector ++ x = x0 + nPbW; ++ y = y0 + nPbH; ++ ++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ y < s->ps.sps->height && ++ x < s->ps.sps->width) ++ { ++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + ++ (y >> 4) * s->col_mvf_stride; ++ ++ if (col->L[0].poc != COL_POC_INTRA && ++ (col->L[1].poc == COL_POC_INTRA || ++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) ++ { ++ cmv = col->L + 0; ++ } ++ else if (col->L[1].poc != COL_POC_INTRA) ++ { ++ cmv = col->L + 1; ++ } ++ } ++ ++ // derive center collocated motion vector ++ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt) ++ { ++ cmv = NULL; ++ x = x0 + (nPbW >> 1); ++ y = y0 + (nPbH >> 1); ++ ++ { ++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + ++ (y >> 4) * s->col_mvf_stride; ++ ++ if (col->L[0].poc != COL_POC_INTRA && ++ (col->L[1].poc == COL_POC_INTRA || ++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) ++ { ++ cmv = col->L + 0; ++ } ++ else if (col->L[1].poc != COL_POC_INTRA) ++ { ++ cmv = col->L + 1; ++ } ++ } ++ } ++ ++ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc)) ++ return 0; ++ ++ { ++ const int col_poc = col_ref->poc; ++ const int ref_poc = refPicList->list[refIdxLx]; ++ ++ *mvLXCol = (cur_lt || ++ cmv->poc == col_poc || ++ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ? 
++ cmv->xy : ++ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc); ++ } ++ ++ return cmv != NULL; ++} ++ ++static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) ++{ ++ return b != NULL && compare_mv_ref_idx(a, b); ++} ++ ++ ++ ++/* ++ * 8.5.3.1.2 Derivation process for spatial merging candidates ++ */ ++static inline const HEVCRpiMvField * ++derive_spatial_merge_candidates( ++ const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ const unsigned int part_idx, ++ const unsigned int merge_idx, ++ HEVCRpiMvField * const mvf_t) ++{ ++ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); ++ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); ++ ++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); ++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); ++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); ++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; ++ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level; ++ const unsigned int part_mode = lc->cu.part_mode; ++ ++ const HEVCRpiMvField * perm[4]; ++ unsigned int nb_merge_cand = 0; ++ ++ // singleMCLFlag => part_idx == 0 so no need to test for it ++ if ((avail & AVAIL_L) == 0 || ++ (part_idx == 1 && ++ ((parts_a1 >> part_mode) & 1) != 0 || ++ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || ++ mvf_a1->pred_flag == PF_INTRA) ++ { ++ mvf_a1 = NULL; ++ } ++ else ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_a1; ++ perm[nb_merge_cand++] = mvf_a1; ++ } ++ ++ if ((avail & AVAIL_U) == 0 || ++ (part_idx == 1 && ++ ((parts_b1 >> part_mode) & 1) != 0 || ++ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || ++ mvf_b1->pred_flag == PF_INTRA) ++ { ++ mvf_b1 = NULL; ++ } ++ else if (!mvf_eq(mvf_b1, mvf_a1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b1; ++ perm[nb_merge_cand++] = mvf_b1; ++ } ++ ++ // above right spatial merge candidate ++ // Never need mvf_b0 again so don't bother zeroing if navail ++ if ((avail & AVAIL_UR) != 0 && ++ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) && ++ mvf_b0->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_b0, mvf_b1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b0; ++ perm[nb_merge_cand++] = mvf_b0; ++ } ++ ++ // left bottom spatial merge candidate ++ // Never need mvf_a0 again so don't bother zeroing if navail ++ if ((avail & AVAIL_DL) != 0 && ++ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) && ++ mvf_a0->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_a0, mvf_a1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_a0; ++ perm[nb_merge_cand++] = mvf_a0; ++ } ++ ++ // above left spatial merge candidate ++ if (nb_merge_cand != 4 && ++ (avail & AVAIL_UL) != 0 && ++ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) ++ { ++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL ++ ++ if (mvf_b2->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_b2, mvf_a1) && ++ !mvf_eq(mvf_b2, mvf_b1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b2; ++ perm[nb_merge_cand++] = mvf_b2; ++ } ++ } ++ ++ // temporal motion vector candidate ++ if (s->sh.slice_temporal_mvp_enabled_flag) ++ { ++ static const HEVCRpiMvField mvf_z = {{0}}; ++ ++ *mvf_t = mvf_z; ++ ++ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, ++ 0, mvf_t->xy + 0, 0)) ++ 
++            mvf_t->pred_flag = PF_L0;
++
++        if (s->sh.slice_type == HEVC_SLICE_B &&
++                temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++                                            0, mvf_t->xy + 1, 1))
++            mvf_t->pred_flag |= PF_L1;
++
++        if (mvf_t->pred_flag != 0)
++        {
++            if (merge_idx == nb_merge_cand)
++                return mvf_t;
++            perm[nb_merge_cand++] = mvf_t;
++        }
++    }
++
++    // combined bi-predictive merge candidates (applies for B slices)
++    if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
++    {
++        unsigned int comb_idx = 0;
++        const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1);
++        const RefPicList * const refPicList = s->refPicList;
++
++        for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
++        {
++            static const uint8_t l0_l1_cand_idx[12][2] = {
++                { 0, 1, },
++                { 1, 0, },
++                { 0, 2, },
++                { 2, 0, },
++                { 1, 2, },
++                { 2, 1, },
++                { 0, 3, },
++                { 3, 0, },
++                { 1, 3, },
++                { 3, 1, },
++                { 2, 3, },
++                { 3, 2, },
++            };
++
++            const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
++            const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
++            const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
++            const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
++
++            if ((mvf_c0->pred_flag & PF_L0) != 0 &&
++                (mvf_c1->pred_flag & PF_L1) != 0 &&
++                (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
++                 mvf_c0->xy[0] != mvf_c1->xy[1]))
++            {
++                if (merge_idx == nb_merge_cand++)
++                {
++                    // Need to be a bit careful as we will construct mvf_t and we
++                    // may already be using that as one of our candidates
++                    // so build & copy rather than build in place
++                    const HEVCRpiMvField mvf_m = {
++                        .xy = {
++                            mvf_c0->xy[0],
++                            mvf_c1->xy[1]},
++                        .ref_idx = {
++                            mvf_c0->ref_idx[0],
++                            mvf_c1->ref_idx[1]},
++                        .pred_flag = PF_BI
++                    };
++                    *mvf_t = mvf_m;
++                    return mvf_t;
++                }
++            }
++        }
++    }
++
++    // "append" Zero motion vector candidates
++    {
++        const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
++                        FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
++        const unsigned int zero_idx = merge_idx - nb_merge_cand;
++
++        const HEVCRpiMvField mvf_m = {
++            .xy = {0, 0},
++            .ref_idx = {
++                zero_idx < nb_refs ? zero_idx : 0,
++                (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
++            .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
++        };
++
++        *mvf_t = mvf_m;
++        return mvf_t;
++    }
++}
++
++
++// 8.5.3.1.1 Derivation process of luma Mvs for merge mode
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++                                    int nPbH, int log2_cb_size, int part_idx,
++                                    int merge_idx, HEVCRpiMvField * const mv)
++{
++    const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ?
++ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8, ++ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8), ++ 0, merge_idx, mv) : ++ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), ++ part_idx, merge_idx, mv); ++ ++ if (mvf_m != mv) ++ *mv = *mvf_m; ++ ++ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) ++ mv->pred_flag = PF_L0; ++} ++ ++ ++static av_always_inline const MvXY * ++mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf) ++{ ++ if (mvf != NULL) ++ { ++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) ++ return mvf->xy + pfi0; ++ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) ++ return mvf->xy + pfi1; ++ } ++ return NULL; ++} ++ ++static av_always_inline const MvXY * ++mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, ++ const int islt0, const int poc0, const int poc_cur, ++ MvXY * const mv_t, const HEVCRpiMvField * const mvf) ++{ ++ if (mvf != NULL) ++ { ++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0) ++ { ++ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]]; ++ if (islt0 || poc1 == poc0) { ++ return mvf->xy + pfi0; ++ } ++ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0); ++ return mv_t; ++ } ++ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0) ++ { ++ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]]; ++ if (islt0 || poc1 == poc0) { ++ return mvf->xy + pfi1; ++ } ++ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0); ++ return mv_t; ++ } ++ } ++ return NULL; ++} ++ ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ HEVCRpiMvField * const mv, ++ const unsigned int mvp_lx_flag, const unsigned int LX) ++{ ++ const unsigned int pfi0 = LX; ++ const unsigned int pfi1 = LX == 0 ? 
1 : 0; ++ const RefPicList * const rpl = s->refPicList; ++ const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; ++ const int poc_cur = s->poc; ++ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]]; ++ ++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); ++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); ++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL ++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); ++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; ++ const MvXY * mva = NULL; ++ const MvXY * mvb; ++ MvXY * const mv_rv = mv->xy + LX; ++ MvXY mvt_a, mvt_b; ++ ++ *mv_rv = 0; ++ ++ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA) ++ mvf_a0 = NULL; ++ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0) ++ goto use_mva; ++ ++ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA) ++ mvf_a1 = NULL; ++ ++ if (mva == NULL && ++ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL && ++ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL) ++ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); ++ ++ if (mvp_lx_flag == 0 && mva != NULL) ++ goto use_mva; ++ ++ if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA) ++ mvf_b0 = NULL; ++ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA) ++ mvf_b1 = NULL; ++ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA) ++ mvf_b2 = NULL; ++ ++ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL && ++ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL) ++ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2); ++ ++ if (mvf_a0 == NULL && mvf_a1 == NULL) { ++ mva = mvb; ++ if (mvp_lx_flag == 0 && mva != NULL) ++ goto use_mva; ++ ++ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL && ++ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL) ++ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2); ++ } ++ ++ if (mva == NULL) { ++ mva = mvb; ++ mvb = NULL; ++ } ++ ++ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B ++ mvb = NULL; ++ ++ if (mvp_lx_flag == 0 && mva != NULL) { ++ goto use_mva; ++ } ++ else if (mvp_lx_flag != 0 && mvb != NULL) { ++ *mv_rv = *mvb; ++ } ++ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { ++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, ++ nPbH, mv->ref_idx[LX], ++ mv_rv, LX); ++ } ++ return; ++ ++use_mva: ++ *mv_rv = *mva; ++ return; ++} ++ +diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c +new file mode 100644 +index 0000000000..e58a59ce5e +--- /dev/null ++++ b/libavcodec/rpi_hevc_parse.c +@@ -0,0 +1,143 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "bytestream.h" ++#include "h2645_parse.h" ++#include "hevc.h" ++#include "rpi_hevc_parse.h" ++ ++static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps, ++ HEVCSEIContext *sei, int is_nalff, int nal_length_size, ++ int err_recognition, int apply_defdispwin, void *logctx) ++{ ++ int i; ++ int ret = 0; ++ H2645Packet pkt = { 0 }; ++ ++ ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff, ++ nal_length_size, AV_CODEC_ID_HEVC, 1, 0); ++ if (ret < 0) { ++ goto done; ++ } ++ ++ for (i = 0; i < pkt.nb_nals; i++) { ++ H2645NAL *nal = &pkt.nals[i]; ++ ++ /* ignore everything except parameter sets and VCL NALUs */ ++ switch (nal->type) { ++ case HEVC_NAL_VPS: ++ ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_SPS: ++ ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_PPS: ++ ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_SEI_PREFIX: ++ case HEVC_NAL_SEI_SUFFIX: ++ ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type); ++ if (ret < 0) ++ goto done; ++ break; ++ default: ++ av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type); ++ break; ++ } ++ } ++ ++done: ++ ff_h2645_packet_uninit(&pkt); ++ if (err_recognition & AV_EF_EXPLODE) ++ return ret; ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, ++ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, ++ int err_recognition, int apply_defdispwin, void *logctx) ++{ ++ int ret = 0; ++ GetByteContext gb; ++ ++ bytestream2_init(&gb, data, size); ++ ++ if (size > 3 && (data[0] || data[1] || data[2] > 1)) { ++ /* It seems the extradata is encoded as hvcC format. ++ * Temporarily, we support configurationVersion==0 until 14496-15 3rd ++ * is finalized. When finalized, configurationVersion will be 1 and we ++ * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */ ++ int i, j, num_arrays, nal_len_size; ++ ++ *is_nalff = 1; ++ ++ bytestream2_skip(&gb, 21); ++ nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; ++ num_arrays = bytestream2_get_byte(&gb); ++ ++ /* nal units in the hvcC always have length coded with 2 bytes, ++ * so put a fake nal_length_size = 2 while parsing them */ ++ *nal_length_size = 2; ++ ++ /* Decode nal units from hvcC. 
*/ ++ for (i = 0; i < num_arrays; i++) { ++ int type = bytestream2_get_byte(&gb) & 0x3f; ++ int cnt = bytestream2_get_be16(&gb); ++ ++ for (j = 0; j < cnt; j++) { ++ // +2 for the nal size field ++ int nalsize = bytestream2_peek_be16(&gb) + 2; ++ if (bytestream2_get_bytes_left(&gb) < nalsize) { ++ av_log(logctx, AV_LOG_ERROR, ++ "Invalid NAL unit size in extradata.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff, ++ *nal_length_size, err_recognition, apply_defdispwin, ++ logctx); ++ if (ret < 0) { ++ av_log(logctx, AV_LOG_ERROR, ++ "Decoding nal unit %d %d from hvcC failed\n", ++ type, i); ++ return ret; ++ } ++ bytestream2_skip(&gb, nalsize); ++ } ++ } ++ ++ /* Now store right nal length size, that will be used to parse ++ * all other nals */ ++ *nal_length_size = nal_len_size; ++ } else { ++ *is_nalff = 0; ++ ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size, ++ err_recognition, apply_defdispwin, logctx); ++ if (ret < 0) ++ return ret; ++ } ++ ++ return ret; ++} +diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h +new file mode 100644 +index 0000000000..4b4d032a16 +--- /dev/null ++++ b/libavcodec/rpi_hevc_parse.h +@@ -0,0 +1,36 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * H.265 parser code ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_PARSE_H ++#define AVCODEC_RPI_HEVC_PARSE_H ++ ++#include <stdint.h> ++ ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++ ++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, ++ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, ++ int err_recognition, int apply_defdispwin, void *logctx); ++ ++#endif /* AVCODEC_RPI_HEVC_PARSE_H */ +diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c +new file mode 100644 +index 0000000000..f4e31f7d1d +--- /dev/null ++++ b/libavcodec/rpi_hevc_ps.c +@@ -0,0 +1,1938 @@ ++/* ++ * HEVC Parameter Set decoding ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Mickael Raulet ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2013 Vittorio Giovara ++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details.
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/imgutils.h" ++#include "golomb.h" ++#include "rpi_hevc_data.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevcdec.h" ++ ++static const uint8_t default_scaling_list_intra[] = { ++ 16, 16, 16, 16, 17, 18, 21, 24, ++ 16, 16, 16, 16, 17, 19, 22, 25, ++ 16, 16, 17, 18, 20, 22, 25, 29, ++ 16, 16, 18, 21, 24, 27, 31, 36, ++ 17, 17, 20, 24, 30, 35, 41, 47, ++ 18, 19, 22, 27, 35, 44, 54, 65, ++ 21, 22, 25, 31, 41, 54, 70, 88, ++ 24, 25, 29, 36, 47, 65, 88, 115 ++}; ++ ++static const uint8_t default_scaling_list_inter[] = { ++ 16, 16, 16, 16, 17, 18, 20, 24, ++ 16, 16, 16, 17, 18, 20, 24, 25, ++ 16, 16, 17, 18, 20, 24, 25, 28, ++ 16, 17, 18, 20, 24, 25, 28, 33, ++ 17, 18, 20, 24, 25, 28, 33, 41, ++ 18, 20, 24, 25, 28, 33, 41, 54, ++ 20, 24, 25, 28, 33, 41, 54, 71, ++ 24, 25, 28, 33, 41, 54, 71, 91 ++}; ++ ++static const AVRational vui_sar[] = { ++ { 0, 1 }, ++ { 1, 1 }, ++ { 12, 11 }, ++ { 10, 11 }, ++ { 16, 11 }, ++ { 40, 33 }, ++ { 24, 11 }, ++ { 20, 11 }, ++ { 32, 11 }, ++ { 80, 33 }, ++ { 18, 11 }, ++ { 15, 11 }, ++ { 64, 33 }, ++ { 160, 99 }, ++ { 4, 3 }, ++ { 3, 2 }, ++ { 2, 1 }, ++}; ++ ++ ++// pps_cb_qp_offset: -12,+12 ++// slice_cb_qp_offset: -12,+12 also ++// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive." ++// cr_qp_offset_list[n]: -12,+12 ++// So worst case total offset: -24,+24 ++ ++#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6) ++#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n)) ++#define M(B,n) C(B,(-n)) ++ ++// Sizeof the QP_START_BLOCK ++#define QP_OFFSET_0 (8*6 + 12*2) ++#define QP_START(B) \ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++\ ++ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\ ++ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\ ++ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\ ++ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\ ++ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\ ++ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\ ++ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\ ++ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1) ++#define QP_END(B) \ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51) ++ ++#define T1(B)\ ++{\ ++ QP_START(B),\ ++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ ++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ ++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ ++ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\ ++ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\ ++ C(B,44), C(B,45),\ ++ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\ ++ QP_END(B)\ ++} ++#define T0(B)\ ++{\ ++ QP_START(B),\ ++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ ++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), 
C(B,18), C(B,19),\ ++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ ++ C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\ ++ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\ ++ C(B,50), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ QP_END(B)\ ++} ++ ++#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2) ++ ++static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)}; ++static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)}; ++ ++#undef T ++#undef C ++#undef QP_END ++ ++#define C(B,n) ((n)<0?0:(n)>51?51:(n)) ++// We do need a lot of -ve padding to cope with high bit depths that give -ve qps ++#define QP_DBLK_OFFSET_0 QP_OFFSET_0 ++#define QP_END(B)\ ++ 51, 51, 51, 51, 51, 51 ++ ++// These don't need all the padding we have here (12 top/bottom would be enough) ++static const uint8_t qp_c_dblk_0[] = T0(0); ++static const uint8_t qp_c_dblk_1[] = T1(0); ++ ++#undef T ++#undef M ++#undef C ++#undef QP_END ++#undef QP_START ++ ++ ++static void remove_pps(HEVCRpiParamSets * const s, const int id) ++{ ++ if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data) ++ s->pps = NULL; ++ av_buffer_unref(&s->pps_list[id]); ++} ++ ++static void remove_sps(HEVCRpiParamSets * const s, const int id) ++{ ++ int i; ++ if (s->sps_list[id]) { ++ if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data) ++ s->sps = NULL; ++ ++ /* drop all PPS that depend on this SPS */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) ++ if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id) ++ remove_pps(s, i); ++ ++ av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); ++ } ++ av_buffer_unref(&s->sps_list[id]); ++} ++ ++static void remove_vps(HEVCRpiParamSets * const s, const int id) ++{ ++ int i; ++ if (s->vps_list[id]) { ++ if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data) ++ s->vps = NULL; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++) ++ if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id) ++ remove_sps(s, i); ++ } ++ av_buffer_unref(&s->vps_list[id]); ++} ++ ++int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx, ++ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header) ++{ ++ uint8_t rps_predict = 0; ++ int delta_poc; ++ int k0 = 0; ++ int k1 = 0; ++ int k = 0; ++ int i; ++ ++ if (rps != sps->st_rps && sps->nb_st_rps) ++ rps_predict = get_bits1(gb); ++ ++ if (rps_predict) { ++ const ShortTermRPS *rps_ridx; ++ int delta_rps; ++ unsigned abs_delta_rps; ++ uint8_t use_delta_flag = 0; ++ uint8_t delta_rps_sign; ++ ++ if (is_slice_header) { ++ unsigned int delta_idx = get_ue_golomb_long(gb) + 1; ++ if (delta_idx > sps->nb_st_rps) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_idx in slice header RPS: %d > %d.\n", ++ delta_idx, sps->nb_st_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx]; ++ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs; ++ } else ++ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; ++ ++ delta_rps_sign = get_bits1(gb); ++ abs_delta_rps = get_ue_golomb_long(gb) + 1; ++ if (abs_delta_rps < 1 || abs_delta_rps > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of abs_delta_rps: %d\n", ++ abs_delta_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ 
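++ // (Illustrative gloss, not part of the original diff.) delta_rps below is ++ // the signed offset added to each POC delta inherited from the predictor ++ // RPS: delta_rps_sign == 0 gives +abs_delta_rps and delta_rps_sign == 1 ++ // gives -abs_delta_rps, since 1 - (1 << 1) == -1. E.g. with ++ // delta_rps_sign = 1 and abs_delta_rps = 3, an inherited delta_poc of -1 ++ // becomes -1 + (-3) = -4 in the derived set.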
delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps; ++ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { ++ int used = rps->used[k] = get_bits1(gb); ++ ++ if (!used) ++ use_delta_flag = get_bits1(gb); ++ ++ if (used || use_delta_flag) { ++ if (i < rps_ridx->num_delta_pocs) ++ delta_poc = delta_rps + rps_ridx->delta_poc[i]; ++ else ++ delta_poc = delta_rps; ++ rps->delta_poc[k] = delta_poc; ++ if (delta_poc < 0) ++ k0++; ++ else ++ k1++; ++ k++; ++ } ++ } ++ ++ if (k >= FF_ARRAY_ELEMS(rps->used)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid num_delta_pocs: %d\n", k); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ rps->num_delta_pocs = k; ++ rps->num_negative_pics = k0; ++ // sort in increasing order (smallest first) ++ if (rps->num_delta_pocs != 0) { ++ int used, tmp; ++ for (i = 1; i < rps->num_delta_pocs; i++) { ++ delta_poc = rps->delta_poc[i]; ++ used = rps->used[i]; ++ for (k = i - 1; k >= 0; k--) { ++ tmp = rps->delta_poc[k]; ++ if (delta_poc < tmp) { ++ rps->delta_poc[k + 1] = tmp; ++ rps->used[k + 1] = rps->used[k]; ++ rps->delta_poc[k] = delta_poc; ++ rps->used[k] = used; ++ } ++ } ++ } ++ } ++ if ((rps->num_negative_pics >> 1) != 0) { ++ int used; ++ k = rps->num_negative_pics - 1; ++ // flip the negative values to largest first ++ for (i = 0; i < rps->num_negative_pics >> 1; i++) { ++ delta_poc = rps->delta_poc[i]; ++ used = rps->used[i]; ++ rps->delta_poc[i] = rps->delta_poc[k]; ++ rps->used[i] = rps->used[k]; ++ rps->delta_poc[k] = delta_poc; ++ rps->used[k] = used; ++ k--; ++ } ++ } ++ } else { ++ unsigned int prev, nb_positive_pics; ++ rps->num_negative_pics = get_ue_golomb_long(gb); ++ nb_positive_pics = get_ue_golomb_long(gb); ++ ++ if (rps->num_negative_pics >= HEVC_MAX_REFS || ++ nb_positive_pics >= HEVC_MAX_REFS) { ++ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics; ++ if (rps->num_delta_pocs) { ++ prev = 0; ++ for (i = 0; i < rps->num_negative_pics; i++) { ++ delta_poc = get_ue_golomb_long(gb) + 1; ++ if (delta_poc < 1 || delta_poc > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_poc: %d\n", ++ delta_poc); ++ return AVERROR_INVALIDDATA; ++ } ++ prev -= delta_poc; ++ rps->delta_poc[i] = prev; ++ rps->used[i] = get_bits1(gb); ++ } ++ prev = 0; ++ for (i = 0; i < nb_positive_pics; i++) { ++ delta_poc = get_ue_golomb_long(gb) + 1; ++ if (delta_poc < 1 || delta_poc > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_poc: %d\n", ++ delta_poc); ++ return AVERROR_INVALIDDATA; ++ } ++ prev += delta_poc; ++ rps->delta_poc[rps->num_negative_pics + i] = prev; ++ rps->used[rps->num_negative_pics + i] = get_bits1(gb); ++ } ++ } ++ } ++ return 0; ++} ++ ++ ++static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx, ++ PTLCommon * const ptl) ++{ ++ int i; ++ ++ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) ++ return -1; ++ ++ ptl->profile_space = get_bits(gb, 2); ++ ptl->tier_flag = get_bits1(gb); ++ ptl->profile_idc = get_bits(gb, 5); ++ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN) ++ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10) ++ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE) ++ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT) ++ av_log(avctx, 
AV_LOG_DEBUG, "Range Extension profile bitstream\n"); ++ else ++ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc); ++ ++ for (i = 0; i < 32; i++) { ++ ptl->profile_compatibility_flag[i] = get_bits1(gb); ++ ++ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) ++ ptl->profile_idc = i; ++ } ++ ptl->progressive_source_flag = get_bits1(gb); ++ ptl->interlaced_source_flag = get_bits1(gb); ++ ptl->non_packed_constraint_flag = get_bits1(gb); ++ ptl->frame_only_constraint_flag = get_bits1(gb); ++ ++ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15] ++ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31] ++ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43] ++ ++ return 0; ++} ++ ++static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx, ++ PTL * const ptl, const int max_num_sub_layers) ++{ ++ int i; ++ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 || ++ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { ++ av_log(avctx, AV_LOG_ERROR, "PTL information too short\n"); ++ return -1; ++ } ++ ++ ptl->general_ptl.level_idc = get_bits(gb, 8); ++ ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb); ++ ptl->sub_layer_level_present_flag[i] = get_bits1(gb); ++ } ++ ++ if (max_num_sub_layers - 1> 0) ++ for (i = max_num_sub_layers - 1; i < 8; i++) ++ skip_bits(gb, 2); // reserved_zero_2bits[i] ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ if (ptl->sub_layer_profile_present_flag[i] && ++ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "PTL information for sublayer %i too short\n", i); ++ return -1; ++ } ++ if (ptl->sub_layer_level_present_flag[i]) { ++ if (get_bits_left(gb) < 8) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Not enough data for sublayer %i level_idc\n", i); ++ return -1; ++ } else ++ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8); ++ } ++ } ++ ++ return 0; ++} ++ ++static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb, ++ const int subpic_params_present) ++{ ++ int i; ++ ++ for (i = 0; i < nb_cpb; i++) { ++ get_ue_golomb_long(gb); // bit_rate_value_minus1 ++ get_ue_golomb_long(gb); // cpb_size_value_minus1 ++ ++ if (subpic_params_present) { ++ get_ue_golomb_long(gb); // cpb_size_du_value_minus1 ++ get_ue_golomb_long(gb); // bit_rate_du_value_minus1 ++ } ++ skip_bits1(gb); // cbr_flag ++ } ++} ++ ++static int decode_hrd(GetBitContext * const gb, const int common_inf_present, ++ const int max_sublayers) ++{ ++ int nal_params_present = 0, vcl_params_present = 0; ++ int subpic_params_present = 0; ++ int i; ++ ++ if (common_inf_present) { ++ nal_params_present = get_bits1(gb); ++ vcl_params_present = get_bits1(gb); ++ ++ if (nal_params_present || vcl_params_present) { ++ subpic_params_present = get_bits1(gb); ++ ++ if (subpic_params_present) { ++ skip_bits(gb, 8); // tick_divisor_minus2 ++ skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1 ++ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag ++ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1 ++ } ++ ++ skip_bits(gb, 4); // bit_rate_scale ++ skip_bits(gb, 4); // cpb_size_scale ++ ++ if (subpic_params_present) ++ skip_bits(gb, 4); // cpb_size_du_scale ++ ++ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1 ++ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1 ++ skip_bits(gb, 5); // dpb_output_delay_length_minus1 ++ } ++ } ++ ++ for (i = 0; i < 
max_sublayers; i++) { ++ int low_delay = 0; ++ unsigned int nb_cpb = 1; ++ int fixed_rate = get_bits1(gb); ++ ++ if (!fixed_rate) ++ fixed_rate = get_bits1(gb); ++ ++ if (fixed_rate) ++ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1 ++ else ++ low_delay = get_bits1(gb); ++ ++ if (!low_delay) { ++ nb_cpb = get_ue_golomb_long(gb) + 1; ++ if (nb_cpb < 1 || nb_cpb > 32) { ++ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ if (nal_params_present) ++ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); ++ if (vcl_params_present) ++ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); ++ } ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiParamSets * const ps) ++{ ++ int i,j; ++ int vps_id = 0; ++ ptrdiff_t nal_size; ++ HEVCRpiVPS *vps; ++ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps)); ++ ++ if (!vps_buf) ++ return AVERROR(ENOMEM); ++ vps = (HEVCRpiVPS*)vps_buf->data; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(vps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(vps->data)); ++ vps->data_size = sizeof(vps->data); ++ } else { ++ vps->data_size = nal_size; ++ } ++ memcpy(vps->data, gb->buffer, vps->data_size); ++ ++ vps_id = get_bits(gb, 4); ++ if (vps_id >= HEVC_MAX_VPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id); ++ goto err; ++ } ++ ++ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits ++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n"); ++ goto err; ++ } ++ ++ vps->vps_max_layers = get_bits(gb, 6) + 1; ++ vps->vps_max_sub_layers = get_bits(gb, 3) + 1; ++ vps->vps_temporal_id_nesting_flag = get_bits1(gb); ++ ++ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits ++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n"); ++ goto err; ++ } ++ ++ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) { ++ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n", ++ vps->vps_max_sub_layers); ++ goto err; ++ } ++ ++ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0) ++ goto err; ++ ++ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb); ++ ++ i = vps->vps_sub_layer_ordering_info_present_flag ? 
0 : vps->vps_max_sub_layers - 1; ++ for (; i < vps->vps_max_sub_layers; i++) { ++ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1; ++ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb); ++ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1; ++ ++ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) { ++ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n", ++ vps->vps_max_dec_pic_buffering[i] - 1); ++ goto err; ++ } ++ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { ++ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n", ++ vps->vps_num_reorder_pics[i]); ++ if (avctx->err_recognition & AV_EF_EXPLODE) ++ goto err; ++ } ++ } ++ ++ vps->vps_max_layer_id = get_bits(gb, 6); ++ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1; ++ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 || ++ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { ++ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n"); ++ goto err; ++ } ++ ++ for (i = 1; i < vps->vps_num_layer_sets; i++) ++ for (j = 0; j <= vps->vps_max_layer_id; j++) ++ skip_bits(gb, 1); // layer_id_included_flag[i][j] ++ ++ vps->vps_timing_info_present_flag = get_bits1(gb); ++ if (vps->vps_timing_info_present_flag) { ++ vps->vps_num_units_in_tick = get_bits_long(gb, 32); ++ vps->vps_time_scale = get_bits_long(gb, 32); ++ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb); ++ if (vps->vps_poc_proportional_to_timing_flag) ++ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1; ++ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb); ++ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) { ++ av_log(avctx, AV_LOG_ERROR, ++ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters); ++ goto err; ++ } ++ for (i = 0; i < vps->vps_num_hrd_parameters; i++) { ++ int common_inf_present = 1; ++ ++ get_ue_golomb_long(gb); // hrd_layer_set_idx ++ if (i) ++ common_inf_present = get_bits1(gb); ++ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); ++ } ++ } ++ get_bits1(gb); /* vps_extension_flag */ ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread VPS by %d bits\n", -get_bits_left(gb)); ++ if (ps->vps_list[vps_id]) ++ goto err; ++ } ++ ++ if (ps->vps_list[vps_id] && ++ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) { ++ av_buffer_unref(&vps_buf); ++ } else { ++ remove_vps(ps, vps_id); ++ ps->vps_list[vps_id] = vps_buf; ++ } ++ ++ return 0; ++ ++err: ++ av_buffer_unref(&vps_buf); ++ return AVERROR_INVALIDDATA; ++} ++ ++static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx, ++ const int apply_defdispwin, HEVCRpiSPS * const sps) ++{ ++ VUI backup_vui, * const vui = &sps->vui; ++ GetBitContext backup; ++ int sar_present, alt = 0; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n"); ++ ++ sar_present = get_bits1(gb); ++ if (sar_present) { ++ uint8_t sar_idx = get_bits(gb, 8); ++ if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) ++ vui->sar = vui_sar[sar_idx]; ++ else if (sar_idx == 255) { ++ vui->sar.num = get_bits(gb, 16); ++ vui->sar.den = get_bits(gb, 16); ++ } else ++ av_log(avctx, AV_LOG_WARNING, ++ "Unknown SAR index: %u.\n", sar_idx); ++ } ++ ++ vui->overscan_info_present_flag = get_bits1(gb); ++ if (vui->overscan_info_present_flag) ++ vui->overscan_appropriate_flag = get_bits1(gb); ++ ++ vui->video_signal_type_present_flag = 
get_bits1(gb); ++ if (vui->video_signal_type_present_flag) { ++ vui->video_format = get_bits(gb, 3); ++ vui->video_full_range_flag = get_bits1(gb); ++ vui->colour_description_present_flag = get_bits1(gb); ++ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) ++ sps->pix_fmt = AV_PIX_FMT_YUVJ420P; ++ if (vui->colour_description_present_flag) { ++ vui->colour_primaries = get_bits(gb, 8); ++ vui->transfer_characteristic = get_bits(gb, 8); ++ vui->matrix_coeffs = get_bits(gb, 8); ++ ++ // Set invalid values to "unspecified" ++ if (!av_color_primaries_name(vui->colour_primaries)) ++ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; ++ if (!av_color_transfer_name(vui->transfer_characteristic)) ++ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED; ++ if (!av_color_space_name(vui->matrix_coeffs)) ++ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; ++ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { ++ switch (sps->pix_fmt) { ++ case AV_PIX_FMT_YUV444P: ++ sps->pix_fmt = AV_PIX_FMT_GBRP; ++ break; ++ case AV_PIX_FMT_YUV444P10: ++ sps->pix_fmt = AV_PIX_FMT_GBRP10; ++ break; ++ case AV_PIX_FMT_YUV444P12: ++ sps->pix_fmt = AV_PIX_FMT_GBRP12; ++ break; ++ } ++ } ++ } ++ } ++ ++ vui->chroma_loc_info_present_flag = get_bits1(gb); ++ if (vui->chroma_loc_info_present_flag) { ++ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb); ++ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb); ++ } ++ ++ vui->neutra_chroma_indication_flag = get_bits1(gb); ++ vui->field_seq_flag = get_bits1(gb); ++ vui->frame_field_info_present_flag = get_bits1(gb); ++ ++ // Backup context in case an alternate header is detected ++ memcpy(&backup, gb, sizeof(backup)); ++ memcpy(&backup_vui, vui, sizeof(backup_vui)); ++ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { ++ vui->default_display_window_flag = 0; ++ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n"); ++ } else ++ vui->default_display_window_flag = get_bits1(gb); ++ ++ if (vui->default_display_window_flag) { ++ int vert_mult = 1 + (sps->chroma_format_idc < 2); ++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); ++ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; ++ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; ++ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult; ++ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; ++ ++ if (apply_defdispwin && ++ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "discarding vui default display window, " ++ "original values are l:%u r:%u t:%u b:%u\n", ++ vui->def_disp_win.left_offset, ++ vui->def_disp_win.right_offset, ++ vui->def_disp_win.top_offset, ++ vui->def_disp_win.bottom_offset); ++ ++ vui->def_disp_win.left_offset = ++ vui->def_disp_win.right_offset = ++ vui->def_disp_win.top_offset = ++ vui->def_disp_win.bottom_offset = 0; ++ } ++ } ++ ++timing_info: ++ vui->vui_timing_info_present_flag = get_bits1(gb); ++ ++ if (vui->vui_timing_info_present_flag) { ++ if (get_bits_left(gb) < 66 && !alt) { ++ // The alternate syntax seems to have timing info located ++ // where def_disp_win is normally located ++ av_log(avctx, AV_LOG_WARNING, ++ "Strange VUI timing information, retrying...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++ vui->vui_num_units_in_tick = get_bits_long(gb, 32); ++ vui->vui_time_scale = get_bits_long(gb, 32); ++ if (alt) { ++ av_log(avctx, AV_LOG_INFO,
"Retry got %"PRIu32"/%"PRIu32" fps\n", ++ vui->vui_time_scale, vui->vui_num_units_in_tick); ++ } ++ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb); ++ if (vui->vui_poc_proportional_to_timing_flag) ++ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb); ++ vui->vui_hrd_parameters_present_flag = get_bits1(gb); ++ if (vui->vui_hrd_parameters_present_flag) ++ decode_hrd(gb, 1, sps->max_sub_layers); ++ } ++ ++ vui->bitstream_restriction_flag = get_bits1(gb); ++ if (vui->bitstream_restriction_flag) { ++ if (get_bits_left(gb) < 8 && !alt) { ++ av_log(avctx, AV_LOG_WARNING, ++ "Strange VUI bitstream restriction information, retrying" ++ " from timing information...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++ vui->tiles_fixed_structure_flag = get_bits1(gb); ++ vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb); ++ vui->restricted_ref_pic_lists_flag = get_bits1(gb); ++ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb); ++ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb); ++ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb); ++ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb); ++ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb); ++ } ++ ++ if (get_bits_left(gb) < 1 && !alt) { ++ // XXX: Alternate syntax when sps_range_extension_flag != 0? ++ av_log(avctx, AV_LOG_WARNING, ++ "Overread in VUI, retrying from timing information...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++} ++ ++static void set_default_scaling_list_data(ScalingList * const sl) ++{ ++ int matrixId; ++ ++ for (matrixId = 0; matrixId < 6; matrixId++) { ++ // 4x4 default is 16 ++ memset(sl->sl[0][matrixId], 16, 16); ++ sl->sl_dc[0][matrixId] = 16; // default for 16x16 ++ sl->sl_dc[1][matrixId] = 16; // default for 32x32 ++ } ++ ++ memcpy(sl->sl[1][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[1][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[1][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[1][5], default_scaling_list_inter, 64); ++ ++ memcpy(sl->sl[2][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[2][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][5], default_scaling_list_inter, 64); ++ ++ memcpy(sl->sl[3][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[3][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][5], default_scaling_list_inter, 64); ++} ++ ++static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl, ++ const HEVCRpiSPS * const sps) ++{ ++ uint8_t scaling_list_pred_mode_flag; ++ int32_t scaling_list_dc_coef[2][6]; ++ int size_id, matrix_id, pos; ++ int i; ++ ++ for (size_id = 0; size_id < 4; size_id++) ++ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 
3 : 1)) { ++ scaling_list_pred_mode_flag = get_bits1(gb); ++ if (!scaling_list_pred_mode_flag) { ++ unsigned int delta = get_ue_golomb_long(gb); ++ /* Only need to handle non-zero delta. Zero means default, ++ * which should already be in the arrays. */ ++ if (delta) { ++ // Copy from previous array. ++ delta *= (size_id == 3) ? 3 : 1; ++ if (matrix_id < delta) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid delta in scaling list data: %d.\n", delta); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ memcpy(sl->sl[size_id][matrix_id], ++ sl->sl[size_id][matrix_id - delta], ++ size_id > 0 ? 64 : 16); ++ if (size_id > 1) ++ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta]; ++ } ++ } else { ++ int next_coef, coef_num; ++ int32_t scaling_list_delta_coef; ++ ++ next_coef = 8; ++ coef_num = FFMIN(64, 1 << (4 + (size_id << 1))); ++ if (size_id > 1) { ++ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8; ++ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id]; ++ sl->sl_dc[size_id - 2][matrix_id] = next_coef; ++ } ++ for (i = 0; i < coef_num; i++) { ++ if (size_id == 0) ++ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] + ++ ff_hevc_rpi_diag_scan4x4_x[i]; ++ else ++ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] + ++ ff_hevc_rpi_diag_scan8x8_x[i]; ++ ++ scaling_list_delta_coef = get_se_golomb(gb); ++ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256; ++ sl->sl[size_id][matrix_id][pos] = next_coef; ++ } ++ } ++ } ++ ++ if (sps->chroma_format_idc == 3) { ++ for (i = 0; i < 64; i++) { ++ sl->sl[3][1][i] = sl->sl[2][1][i]; ++ sl->sl[3][2][i] = sl->sl[2][2][i]; ++ sl->sl[3][4][i] = sl->sl[2][4][i]; ++ sl->sl[3][5][i] = sl->sl[2][5][i]; ++ } ++ sl->sl_dc[1][1] = sl->sl_dc[0][1]; ++ sl->sl_dc[1][2] = sl->sl_dc[0][2]; ++ sl->sl_dc[1][4] = sl->sl_dc[0][4]; ++ sl->sl_dc[1][5] = sl->sl_dc[0][5]; ++ } ++ ++ ++ return 0; ++} ++ ++static int map_pixel_format(HEVCRpiSPS * const sps) ++{ ++ const int cfmt = sps->chroma_format_idc; ++ ++ sps->pix_fmt = AV_PIX_FMT_NONE; ++ switch (sps->bit_depth) { ++ case 8: ++ if (cfmt == 1) ++ sps->pix_fmt = AV_PIX_FMT_SAND128; ++ break; ++ case 10: ++ if (cfmt == 1) ++ sps->pix_fmt = AV_PIX_FMT_SAND64_10; ++ break; ++ default: ++ break; ++ } ++ ++ sps->hshift[0] = sps->vshift[0] = 0; ++ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4 ++ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2 ++ ++ sps->pixel_shift = sps->bit_depth > 8 ? 
1 : 0; ++ ++ return 0; ++} ++ ++static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id, ++ const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx) ++{ ++ HEVCRpiWindow *ow; ++ int ret = 0; ++ int log2_diff_max_min_transform_block_size; ++ int bit_depth_chroma, start, vui_present, sublayer_ordering_info; ++ int i; ++ ++ // Coded parameters ++ ++ sps->vps_id = get_bits(gb, 4); ++ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (vps_list && !vps_list[sps->vps_id]) { ++ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n", ++ sps->vps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->max_sub_layers = get_bits(gb, 3) + 1; ++ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) { ++ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n", ++ sps->max_sub_layers); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->temporal_id_nesting_flag = get_bits(gb, 1); ++ ++ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0) ++ return ret; ++ ++ *sps_id = get_ue_golomb_long(gb); ++ if (*sps_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->chroma_format_idc = get_ue_golomb_long(gb); ++ if (sps->chroma_format_idc > 3U) { ++ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->chroma_format_idc == 3) ++ sps->separate_colour_plane_flag = get_bits1(gb); ++ ++ if (sps->separate_colour_plane_flag) ++ sps->chroma_format_idc = 0; ++ ++ sps->width = get_ue_golomb_long(gb); ++ sps->height = get_ue_golomb_long(gb); ++ if ((ret = av_image_check_size(sps->width, ++ sps->height, 0, avctx)) < 0) ++ return ret; ++ ++ if (get_bits1(gb)) { // pic_conformance_flag ++ int vert_mult = 1 + (sps->chroma_format_idc < 2); ++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); ++ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; ++ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; ++ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult; ++ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; ++ ++ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "discarding sps conformance window, " ++ "original values are l:%u r:%u t:%u b:%u\n", ++ sps->pic_conf_win.left_offset, ++ sps->pic_conf_win.right_offset, ++ sps->pic_conf_win.top_offset, ++ sps->pic_conf_win.bottom_offset); ++ ++ sps->pic_conf_win.left_offset = ++ sps->pic_conf_win.right_offset = ++ sps->pic_conf_win.top_offset = ++ sps->pic_conf_win.bottom_offset = 0; ++ } ++ sps->output_window = sps->pic_conf_win; ++ } ++ ++ sps->bit_depth = get_ue_golomb_long(gb) + 8; ++ bit_depth_chroma = get_ue_golomb_long(gb) + 8; ++ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Luma bit depth (%d) is different from chroma bit depth (%d), " ++ "this is unsupported.\n", ++ sps->bit_depth, bit_depth_chroma); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ ret = map_pixel_format(sps); ++ if (ret < 0) ++ return ret; ++ ++ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4; ++ if (sps->log2_max_poc_lsb > 16) { ++ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n", ++ sps->log2_max_poc_lsb - 4); ++ return 
AVERROR_INVALIDDATA; ++ } ++ ++ sublayer_ordering_info = get_bits1(gb); ++ start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1; ++ for (i = start; i < sps->max_sub_layers; i++) { ++ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1; ++ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb); ++ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1; ++ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) { ++ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n", ++ sps->temporal_layer[i].max_dec_pic_buffering - 1U); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) { ++ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n", ++ sps->temporal_layer[i].num_reorder_pics); ++ if (avctx->err_recognition & AV_EF_EXPLODE || ++ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) { ++ return AVERROR_INVALIDDATA; ++ } ++ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1; ++ } ++ } ++ ++ if (!sublayer_ordering_info) { ++ for (i = 0; i < start; i++) { ++ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering; ++ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics; ++ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase; ++ } ++ } ++ ++ sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3; ++ sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb); ++ sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2; ++ log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb); ++ sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size + ++ sps->log2_min_tb_size; ++ ++ if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->log2_diff_max_min_coding_block_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ { ++ const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size; ++ // Not a bitstream limitation, but all profiles ++ if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // Inferred parameters ++ sps->log2_ctb_size = CtbLog2SizeY; ++// sps->log2_min_pu_size = sps->log2_min_cb_size - 1; ++ } ++ ++ sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb); ++ 
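++ // (Illustrative gloss, not part of the original diff.) Worked example for ++ // the geometry derived above: log2_min_cb_size = 3 with ++ // log2_diff_max_min_coding_block_size = 3 gives CtbLog2SizeY = 6, i.e. ++ // 64x64 CTBs, so a 1920x1080 stream infers ctb_width = 30 and ++ // ctb_height = 17 further down. The two transform hierarchy depths parsed ++ // here are later range-checked against log2_ctb_size - log2_min_tb_size.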
sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb); ++ ++ sps->scaling_list_enable_flag = get_bits1(gb); ++ if (sps->scaling_list_enable_flag) { ++ set_default_scaling_list_data(&sps->scaling_list); ++ ++ if (get_bits1(gb)) { ++ ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps); ++ if (ret < 0) ++ return ret; ++ } ++ } ++ ++ sps->amp_enabled_flag = get_bits1(gb); ++ sps->sao_enabled = get_bits1(gb); ++ ++ // Set pcm defaults (0) so we don't have to test _enabled when we ++ // want to use them ++ memset(&sps->pcm, 0, sizeof(sps->pcm)); ++ ++ if (get_bits1(gb)) // pcm_enabled_flag ++ { ++ const unsigned int limit_max_pcm = FFMIN(5, ++ sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size); ++ sps->pcm.bit_depth = get_bits(gb, 4) + 1; ++ sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1; ++ sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3; ++ sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size + ++ get_ue_golomb_long(gb); ++ if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) { ++ av_log(avctx, AV_LOG_ERROR, ++ "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n", ++ sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size || ++ sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) { ++ av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)", ++ sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->pcm.loop_filter_disable_flag = get_bits1(gb); ++ } ++ ++ // Could be based on min_pcm_cb_size but much easier logic if we just stick ++ // with 8 (and costs us little) ++ sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up ++ sps->pcm_height = (sps->height + 7) >> 3; ++ ++ sps->nb_st_rps = get_ue_golomb_long(gb); ++ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) { ++ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n", ++ sps->nb_st_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < sps->nb_st_rps; i++) { ++ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i], ++ sps, 0)) < 0) ++ return ret; ++ } ++ ++ sps->long_term_ref_pics_present_flag = get_bits1(gb); ++ if (sps->long_term_ref_pics_present_flag) { ++ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); ++ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { ++ av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n", ++ sps->num_long_term_ref_pics_sps); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) { ++ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb); ++ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb); ++ } ++ } ++ ++ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb); ++ sps->intra_filters_disable = get_bits1(gb) ? 
0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag ++ sps->vui.sar = (AVRational){0, 1}; ++ vui_present = get_bits1(gb); ++ if (vui_present) ++ decode_vui(gb, avctx, apply_defdispwin, sps); ++ ++ if (get_bits1(gb)) { // sps_extension_flag ++ int sps_extension_flag[1]; ++ for (i = 0; i < 1; i++) ++ sps_extension_flag[i] = get_bits1(gb); ++ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); ++ if (sps_extension_flag[0]) { ++ int extended_precision_processing_flag; ++ int cabac_bypass_alignment_enabled_flag; ++ ++ sps->transform_skip_rotation_enabled_flag = get_bits1(gb); ++ sps->transform_skip_context_enabled_flag = get_bits1(gb); ++ sps->implicit_rdpcm_enabled_flag = get_bits1(gb); ++ ++ sps->explicit_rdpcm_enabled_flag = get_bits1(gb); ++ ++ extended_precision_processing_flag = get_bits1(gb); ++ if (extended_precision_processing_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "extended_precision_processing_flag not yet implemented\n"); ++ ++ if (get_bits1(gb)) // sps->intra_smoothing_disabled_flag ++ sps->intra_filters_disable |= FILTER_EITHER; ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); ++ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); ++ ++ cabac_bypass_alignment_enabled_flag = get_bits1(gb); ++ if (cabac_bypass_alignment_enabled_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "cabac_bypass_alignment_enabled_flag not yet implemented\n"); ++ } ++ } ++ if (apply_defdispwin) { ++ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset; ++ sps->output_window.right_offset += sps->vui.def_disp_win.right_offset; ++ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset; ++ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset; ++ } ++ ++ ow = &sps->output_window; ++ if (ow->left_offset >= INT_MAX - ow->right_offset || ++ ow->top_offset >= INT_MAX - ow->bottom_offset || ++ ow->left_offset + ow->right_offset >= sps->width || ++ ow->top_offset + ow->bottom_offset >= sps->height) { ++ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n", ++ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset); ++ if (avctx->err_recognition & AV_EF_EXPLODE) { ++ return AVERROR_INVALIDDATA; ++ } ++ av_log(avctx, AV_LOG_WARNING, ++ "Displaying the whole video surface.\n"); ++ memset(ow, 0, sizeof(*ow)); ++ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win)); ++ } ++ ++ // Inferred parameters ++ ++ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; ++ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; ++ sps->ctb_size = sps->ctb_width * sps->ctb_height; ++ ++ sps->min_cb_width = sps->width >> sps->log2_min_cb_size; ++ sps->min_cb_height = sps->height >> sps->log2_min_cb_size; ++ sps->min_tb_width = sps->width >> sps->log2_min_tb_size; ++ sps->min_tb_height = sps->height >> sps->log2_min_tb_size; ++ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE; ++ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE; ++ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; ++ ++ sps->qp_bd_offset = 6 * (sps->bit_depth - 8); ++ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? 
sps->bit_depth - 1 : 7)); ++ ++ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || ++ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) { ++ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n", ++ sps->max_transform_hierarchy_depth_inter); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) { ++ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n", ++ sps->max_transform_hierarchy_depth_intra); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "max transform block size out of range: %d\n", ++ sps->log2_max_trafo_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread SPS by %d bits\n", -get_bits_left(gb)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps, int apply_defdispwin) ++{ ++ HEVCRpiSPS *sps; ++ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps)); ++ unsigned int sps_id; ++ int ret; ++ ptrdiff_t nal_size; ++ ++ if (!sps_buf) ++ return AVERROR(ENOMEM); ++ sps = (HEVCRpiSPS*)sps_buf->data; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(sps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(sps->data)); ++ sps->data_size = sizeof(sps->data); ++ } else { ++ sps->data_size = nal_size; ++ } ++ memcpy(sps->data, gb->buffer, sps->data_size); ++ ++ ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id, ++ apply_defdispwin, ++ ps->vps_list, avctx); ++ if (ret < 0) { ++ av_buffer_unref(&sps_buf); ++ return ret; ++ } ++ ++ if (avctx->debug & FF_DEBUG_BITSTREAM) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "Parsed SPS: id %d; coded wxh: %dx%d; " ++ "cropped wxh: %dx%d; pix_fmt: %s.\n", ++ sps_id, sps->width, sps->height, ++ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset), ++ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset), ++ av_get_pix_fmt_name(sps->pix_fmt)); ++ } ++ ++ /* check if this is a repeat of an already parsed SPS, then keep the ++ * original one. 
++ * otherwise drop all PPSes that depend on it */ ++ if (ps->sps_list[sps_id] && ++ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) { ++ av_buffer_unref(&sps_buf); ++ } else { ++ remove_sps(ps, sps_id); ++ ps->sps_list[sps_id] = sps_buf; ++ } ++ ++ return 0; ++} ++ ++static void hevc_pps_free(void *opaque, uint8_t *data) ++{ ++ HEVCRpiPPS *pps = (HEVCRpiPPS*)data; ++ ++ av_freep(&pps->column_width); ++ av_freep(&pps->row_height); ++ av_freep(&pps->col_bd); ++ av_freep(&pps->row_bd); ++ av_freep(&pps->col_idxX); ++ av_freep(&pps->ctb_addr_rs_to_ts); ++ av_freep(&pps->ctb_addr_ts_to_rs); ++ av_freep(&pps->tile_pos_ts); ++ av_freep(&pps->tile_size); ++ av_freep(&pps->tile_id); ++ av_freep(&pps->ctb_ts_flags); ++ ++ av_freep(&pps); ++} ++ ++static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets) ++{ ++ do ++ { ++ const int offset = get_se_golomb_long(gb); ++ if (offset < -12 || offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset); ++ return AVERROR_INVALIDDATA; ++ } ++ *offsets++ = offset; ++ } while (n_minus_1-- != 0); ++ return 0; ++} ++ ++static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) ++{ ++ if (pps->transform_skip_enabled_flag) { ++ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2; ++ } ++ pps->cross_component_prediction_enabled_flag = get_bits1(gb); ++ if (pps->cross_component_prediction_enabled_flag && ++ (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag)) ++ { ++ av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb); ++ if (pps->chroma_qp_offset_list_enabled_flag) { ++ int err; ++ ++ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb); ++ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb); ++ if (pps->chroma_qp_offset_list_len_minus1 > 5) { ++ av_log(avctx, AV_LOG_ERROR, ++ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n"); ++ ++ if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 || ++ (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0) ++ return err; ++ } ++ ++ { ++ const unsigned int max_offset = sps->bit_depth > 10 ? 
sps->bit_depth - 10 : 0; ++ ++ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb); ++ if (pps->log2_sao_offset_scale_luma > max_offset) { ++ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid"); ++ return AVERROR_INVALIDDATA; ++ } ++ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb); ++ if (pps->log2_sao_offset_scale_chroma > max_offset) { ++ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ return(0); ++} ++ ++static inline int setup_pps(AVCodecContext * const avctx, ++ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) ++{ ++ int pic_area_in_ctbs; ++ int i, j, x, y, ctb_addr_rs, tile_id; ++ ++ // Inferred parameters ++ ++ // qp_y -> qp_u/qp_v tables ++ // The tables have at least -24,+24 overrun after adding offset here ++ // which should allow for clipless offsetting ++ ++ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code ++ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0; ++ ++ if (sps->chroma_format_idc == 1) { ++ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; ++ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; ++ } ++ else ++ { ++ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; ++ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; ++ } ++ ++ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd)); ++ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd)); ++ pps->col_idxX = av_malloc_array(sps->ctb_width, sizeof(*pps->col_idxX)); ++ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX) ++ return AVERROR(ENOMEM); ++ ++ if (pps->uniform_spacing_flag) { ++ if (!pps->column_width) { ++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); ++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); ++ } ++ if (!pps->column_width || !pps->row_height) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < pps->num_tile_columns; i++) { ++ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns - ++ (i * sps->ctb_width) / pps->num_tile_columns; ++ } ++ ++ for (i = 0; i < pps->num_tile_rows; i++) { ++ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows - ++ (i * sps->ctb_height) / pps->num_tile_rows; ++ } ++ } ++ ++ { ++ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift); ++ pps->col_bd[0] = 0; ++ pps->tile_wpp_inter_disable = 0; ++ for (i = 0; i < pps->num_tile_columns; i++) ++ { ++ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i]; ++ ++ // Avoid trying tile parallel if the columns don't fall on cache boundaries ++ // (this causes too much pain syncing flushes with the QPU) ++ // Ignore the final (RHS of pic) tile boundary ++ if ((pps->col_bd[i] & td_mask) != 0) { ++ pps->tile_wpp_inter_disable = 1; ++ } ++ } ++ ++ // If we can start the next row before finishing the first line of ++ // this one then we must wait at the end of the tile ++ // * if this happens a lot then there are better but more complicated ++ // conditions that we could
apply ++ if (pps->tile_wpp_inter_disable) { ++ for (i = 0; i < pps->num_tile_rows; i++) ++ { ++ if (pps->row_height[i] <= RPI_MAX_JOBS) { ++ pps->tile_wpp_inter_disable = 2; ++ break; ++ } ++ } ++ } ++ } ++ ++ pps->row_bd[0] = 0; ++ for (i = 0; i < pps->num_tile_rows; i++) ++ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; ++ ++ for (i = 0, j = 0; i < sps->ctb_width; i++) { ++ if (i >= pps->col_bd[j + 1]) ++ j++; ++ pps->col_idxX[i] = j; ++ } ++ ++ /** ++ * 6.5 ++ */ ++ pic_area_in_ctbs = sps->ctb_size; ++ ++ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); ++ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); ++ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); ++ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); ++ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); ++ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); ++ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || ++ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { ++ return AVERROR(ENOMEM); ++ } ++ ++ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags)); ++ ++ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { ++ int tb_x = ctb_addr_rs % sps->ctb_width; ++ int tb_y = ctb_addr_rs / sps->ctb_width; ++ int tile_x = 0; ++ int tile_y = 0; ++ int val = 0; ++ ++ for (i = 0; i < pps->num_tile_columns; i++) { ++ if (tb_x < pps->col_bd[i + 1]) { ++ tile_x = i; ++ break; ++ } ++ } ++ ++ for (i = 0; i < pps->num_tile_rows; i++) { ++ if (tb_y < pps->row_bd[i + 1]) { ++ tile_y = i; ++ break; ++ } ++ } ++ ++ for (i = 0; i < tile_x; i++) ++ val += pps->row_height[tile_y] * pps->column_width[i]; ++ for (i = 0; i < tile_y; i++) ++ val += sps->ctb_width * pps->row_height[i]; ++ ++ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] + ++ tb_x - pps->col_bd[tile_x]; ++ ++ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val; ++ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; ++ } ++ ++ { ++ uint8_t * pflags = pps->ctb_ts_flags; ++ uint16_t * ptid = pps->tile_id; ++ ++ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) ++ { ++ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) ++ { ++ const unsigned int tile_w = pps->column_width[i]; ++ ++ pflags[0] |= CTB_TS_FLAGS_CIREQ; ++ ++ for (x = 0; x != tile_w; ++x) { ++ pflags[x] |= CTB_TS_FLAGS_TOT; ++ } ++ ++ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++) ++ { ++ pflags[0] |= CTB_TS_FLAGS_SOTL; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ { ++ if (pps->column_width[i] != 1) ++ pflags[1] |= CTB_TS_FLAGS_CSAVE; ++ else ++ pflags[0] |= CTB_TS_FLAGS_CIREQ; ++ ++ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0) ++ pflags[0] |= CTB_TS_FLAGS_CLOAD; ++ } ++ ++ for (x = 0; x != tile_w; ++x) ++ *ptid++ = tile_id; ++ ++ pflags += tile_w; ++ pflags[-1] |= CTB_TS_FLAGS_EOTL; ++ if (i + 1 == pps->num_tile_columns) ++ pflags[-1] |= CTB_TS_FLAGS_EOL; ++ } ++ ++ pflags[-1] |= CTB_TS_FLAGS_EOT; ++ } ++ } ++ } ++ ++ { ++ unsigned int ts = 0; ++ for (j = 0; j < pps->num_tile_rows; j++) ++ for (i = 0; i < pps->num_tile_columns; i++) ++ { ++ const unsigned int size = pps->column_width[i] * pps->row_height[j]; ++ pps->tile_size[j * pps->num_tile_columns + i] = size; ++ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts; ++ ts += size; ++ } ++ } ++ ++ return 0; ++} ++ ++int 
ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiParamSets * const ps) ++{ ++ const HEVCRpiSPS *sps = NULL; ++ int i, ret = 0; ++ unsigned int pps_id = 0; ++ ptrdiff_t nal_size; ++ unsigned log2_parallel_merge_level_minus2; ++ ++ AVBufferRef *pps_buf; ++ HEVCRpiPPS *pps = av_mallocz(sizeof(*pps)); ++ ++ if (!pps) ++ return AVERROR(ENOMEM); ++ ++ pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), ++ hevc_pps_free, NULL, 0); ++ if (!pps_buf) { ++ av_freep(&pps); ++ return AVERROR(ENOMEM); ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(pps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(pps->data)); ++ pps->data_size = sizeof(pps->data); ++ } else { ++ pps->data_size = nal_size; ++ } ++ memcpy(pps->data, gb->buffer, pps->data_size); ++ ++ // Default values ++ pps->loop_filter_across_tiles_enabled_flag = 1; ++ pps->num_tile_columns = 1; ++ pps->num_tile_rows = 1; ++ pps->uniform_spacing_flag = 1; ++ pps->disable_dbf = 0; ++ pps->beta_offset = 0; ++ pps->tc_offset = 0; ++ pps->log2_max_transform_skip_block_size = 2; ++ ++ // Coded parameters ++ pps_id = get_ue_golomb_long(gb); ++ if (pps_id >= HEVC_MAX_PPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->sps_id = get_ue_golomb_long(gb); ++ if (pps->sps_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (!ps->sps_list[pps->sps_id]) { ++ av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data; ++ ++ pps->dependent_slice_segments_enabled_flag = get_bits1(gb); ++ pps->output_flag_present_flag = get_bits1(gb); ++ pps->num_extra_slice_header_bits = get_bits(gb, 3); ++ ++ pps->sign_data_hiding_flag = get_bits1(gb); ++ ++ pps->cabac_init_present_flag = get_bits1(gb); ++ ++ pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1; ++ if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) { ++ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; ++ if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) { ++ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->pic_init_qp_minus26 = get_se_golomb(gb); ++ if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "init_qp_minus26 %d is outside the valid range " ++ "[%d, %d].\n", ++ pps->pic_init_qp_minus26, ++ -(26 + sps->qp_bd_offset), 25); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->constrained_intra_pred_flag = get_bits1(gb); ++ pps->transform_skip_enabled_flag = get_bits1(gb); ++ ++ pps->cu_qp_delta_enabled_flag = get_bits1(gb); ++ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size; ++ if (pps->cu_qp_delta_enabled_flag) ++ { ++ const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb); ++ ++ if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) { ++ av_log(avctx, AV_LOG_ERROR, 
"diff_cu_qp_delta_depth %d is invalid\n", ++ diff_cu_qp_delta_depth); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth; ++ } ++ ++ pps->cb_qp_offset = get_se_golomb(gb); ++ if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n", ++ pps->cb_qp_offset); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->cr_qp_offset = get_se_golomb(gb); ++ if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n", ++ pps->cr_qp_offset); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb); ++ ++ pps->weighted_pred_flag = get_bits1(gb); ++ pps->weighted_bipred_flag = get_bits1(gb); ++ ++ pps->transquant_bypass_enable_flag = get_bits1(gb); ++ pps->tiles_enabled_flag = get_bits1(gb); ++ pps->entropy_coding_sync_enabled_flag = get_bits1(gb); ++ ++ if (pps->tiles_enabled_flag) { ++ pps->num_tile_columns = get_ue_golomb_long(gb) + 1; ++ pps->num_tile_rows = get_ue_golomb_long(gb) + 1; ++ if (pps->num_tile_columns <= 0 || ++ pps->num_tile_columns >= sps->width) { ++ av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n", ++ pps->num_tile_columns - 1); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (pps->num_tile_rows <= 0 || ++ pps->num_tile_rows >= sps->height) { ++ av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n", ++ pps->num_tile_rows - 1); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); ++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); ++ if (!pps->column_width || !pps->row_height) { ++ ret = AVERROR(ENOMEM); ++ goto err; ++ } ++ ++ pps->uniform_spacing_flag = get_bits1(gb); ++ if (!pps->uniform_spacing_flag) { ++ uint64_t sum = 0; ++ for (i = 0; i < pps->num_tile_columns - 1; i++) { ++ pps->column_width[i] = get_ue_golomb_long(gb) + 1; ++ sum += pps->column_width[i]; ++ } ++ if (sum >= sps->ctb_width) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum; ++ ++ sum = 0; ++ for (i = 0; i < pps->num_tile_rows - 1; i++) { ++ pps->row_height[i] = get_ue_golomb_long(gb) + 1; ++ sum += pps->row_height[i]; ++ } ++ if (sum >= sps->ctb_height) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum; ++ } ++ pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb); ++ } ++ ++ pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb); ++ ++ pps->deblocking_filter_control_present_flag = get_bits1(gb); ++ if (pps->deblocking_filter_control_present_flag) { ++ pps->deblocking_filter_override_enabled_flag = get_bits1(gb); ++ pps->disable_dbf = get_bits1(gb); ++ if (!pps->disable_dbf) { ++ int beta_offset_div2 = get_se_golomb(gb); ++ int tc_offset_div2 = get_se_golomb(gb) ; ++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6) { ++ av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n", ++ beta_offset_div2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (tc_offset_div2 < -6 || tc_offset_div2 > 6) { ++ av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n", ++ tc_offset_div2); ++ ret = 
AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->beta_offset = 2 * beta_offset_div2; ++ pps->tc_offset = 2 * tc_offset_div2; ++ } ++ } ++ ++ pps->scaling_list_data_present_flag = get_bits1(gb); ++ if (pps->scaling_list_data_present_flag) { ++ set_default_scaling_list_data(&pps->scaling_list); ++ ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps); ++ if (ret < 0) ++ goto err; ++ } ++ pps->lists_modification_present_flag = get_bits1(gb); ++ log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb); ++ if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) { ++ av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n", ++ log2_parallel_merge_level_minus2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2; ++ ++ pps->slice_header_extension_present_flag = get_bits1(gb); ++ ++ if (get_bits1(gb)) { // pps_extension_present_flag ++ int pps_range_extensions_flag = get_bits1(gb); ++ skip_bits(gb, 7); // pps_extension_7bits ++ if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) { ++ if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0) ++ goto err; ++ } ++ } ++ ++ ret = setup_pps(avctx, pps, sps); ++ if (ret < 0) ++ goto err; ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread PPS by %d bits\n", -get_bits_left(gb)); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ remove_pps(ps, pps_id); ++ ps->pps_list[pps_id] = pps_buf; ++ ++ return 0; ++ ++err: ++ av_buffer_unref(&pps_buf); ++ return ret; ++} ++ ++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type) ++{ ++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; ++ int prev_poc_lsb = pocTid0 % max_poc_lsb; ++ int prev_poc_msb = pocTid0 - prev_poc_lsb; ++ int poc_msb; ++ ++ if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2) ++ poc_msb = prev_poc_msb + max_poc_lsb; ++ else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2) ++ poc_msb = prev_poc_msb - max_poc_lsb; ++ else ++ poc_msb = prev_poc_msb; ++ ++ // For BLA picture types, POCmsb is set to 0. ++ if (nal_unit_type == HEVC_NAL_BLA_W_LP || ++ nal_unit_type == HEVC_NAL_BLA_W_RADL || ++ nal_unit_type == HEVC_NAL_BLA_N_LP) ++ poc_msb = 0; ++ ++ return poc_msb + poc_lsb; ++} +diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h +new file mode 100644 +index 0000000000..c725ebb9ca +--- /dev/null ++++ b/libavcodec/rpi_hevc_ps.h +@@ -0,0 +1,449 @@ ++/* ++ * HEVC parameter set parsing ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
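For orientation, ff_hevc_rpi_compute_poc() just above follows the POC derivation of H.265 section 8.3.1: only the POC LSB is coded per picture, and the MSB is reconstructed relative to the last anchor picture (pocTid0). A minimal standalone sketch of that arithmetic, using invented example values (the BLA special case is omitted):

    #include <stdio.h>

    /* Same arithmetic as ff_hevc_rpi_compute_poc() above, minus the
     * BLA special case. */
    static int compute_poc(int log2_max_poc_lsb, int pocTid0, int poc_lsb)
    {
        int max_poc_lsb  = 1 << log2_max_poc_lsb;
        int prev_poc_lsb = pocTid0 % max_poc_lsb;
        int prev_poc_msb = pocTid0 - prev_poc_lsb;
        int poc_msb      = prev_poc_msb;

        if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2)
            poc_msb += max_poc_lsb;          /* LSB wrapped past zero */
        else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2)
            poc_msb -= max_poc_lsb;          /* LSB belongs to the previous cycle */

        return poc_msb + poc_lsb;
    }

    int main(void)
    {
        /* 4-bit LSB (0..15). Anchor POC 14, new LSB 1: the LSB has just
         * wrapped, so the MSB advances and POC = 16 + 1 = 17. */
        printf("%d\n", compute_poc(4, 14, 1));   /* prints 17 */

        /* Anchor POC 17 (MSB 16, LSB 1), new LSB 14: 14 is far behind the
         * anchor, so it is from the previous cycle and POC = 0 + 14 = 14. */
        printf("%d\n", compute_poc(4, 17, 14));  /* prints 14 */
        return 0;
    }

Both branches simply pick whichever full POC lies closest to the anchor, which is why a half-range threshold appears in the comparisons.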
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_PS_H
++#define AVCODEC_RPI_HEVC_PS_H
++
++#include <stdint.h>
++
++#include "libavutil/buffer.h"
++#include "libavutil/pixfmt.h"
++#include "libavutil/rational.h"
++
++#include "avcodec.h"
++#include "get_bits.h"
++#include "hevc.h"
++
++typedef struct ShortTermRPS {
++    unsigned int num_negative_pics;
++    int num_delta_pocs;
++    int rps_idx_num_delta_pocs;
++    int32_t delta_poc[32];
++    uint8_t used[32];
++} ShortTermRPS;
++
++typedef struct LongTermRPS {
++    int poc[32];
++    uint8_t used[32];
++    uint8_t nb_refs;
++} LongTermRPS;
++
++typedef struct RpiSliceHeader {
++    unsigned int pps_id;
++
++    ///< address (in raster order) of the first block in the current slice segment
++    unsigned int slice_segment_addr;
++    ///< address (in raster order) of the first block in the current slice
++    unsigned int slice_addr;
++
++    enum HEVCSliceType slice_type;
++
++    int pic_order_cnt_lsb;
++
++    uint8_t first_slice_in_pic_flag;
++    uint8_t dependent_slice_segment_flag;
++    uint8_t pic_output_flag;
++    uint8_t colour_plane_id;
++
++    ///< RPS coded in the slice header itself is stored here
++    int short_term_ref_pic_set_sps_flag;
++    int short_term_ref_pic_set_size;
++    ShortTermRPS slice_rps;
++    const ShortTermRPS *short_term_rps;
++    int long_term_ref_pic_set_size;
++    LongTermRPS long_term_rps;
++    unsigned int list_entry_lx[2][32];
++
++    uint8_t rpl_modification_flag[2];
++    uint8_t no_output_of_prior_pics_flag;
++    uint8_t slice_temporal_mvp_enabled_flag;
++
++    unsigned int nb_refs[2];
++
++    uint8_t slice_sample_adaptive_offset_flag[3];
++    uint8_t mvd_l1_zero_flag;
++
++    uint8_t cabac_init_flag;
++    uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
++    uint8_t slice_loop_filter_across_slices_enabled_flag;
++    uint8_t collocated_list;
++
++    uint8_t no_dblk_boundary_flags;
++
++    unsigned int collocated_ref_idx;
++
++    int slice_qp_delta;
++    int slice_cb_qp_offset;  // -12, +12
++    int slice_cr_qp_offset;  // -12, +12
++
++    uint8_t cu_chroma_qp_offset_enabled_flag;
++
++    int beta_offset;  ///< beta_offset_div2 * 2
++    int tc_offset;    ///< tc_offset_div2 * 2
++
++    unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
++
++    unsigned *entry_point_offset;
++    int * offset;
++    int * size;
++    int num_entry_point_offsets;
++    int offsets_allocated;
++
++    uint8_t offload_wpp;
++    uint8_t offload_tiles;
++
++    int8_t slice_qp;
++
++    uint8_t luma_log2_weight_denom;
++    uint8_t chroma_log2_weight_denom;
++
++    int16_t luma_weight_l0[16];  // -128, +255
++    int16_t luma_offset_l0[16];
++    int16_t chroma_weight_l0[16][2];
++    int16_t chroma_offset_l0[16][2];
++
++    int16_t luma_weight_l1[16];
++    int16_t luma_offset_l1[16];
++    int16_t chroma_weight_l1[16][2];
++    int16_t chroma_offset_l1[16][2];
++
++} RpiSliceHeader;
++
++typedef struct HEVCRpiWindow {
++    uint16_t left_offset;
++    uint16_t right_offset;
++    uint16_t top_offset;
++    uint16_t bottom_offset;
++} HEVCRpiWindow;
++
++typedef struct VUI {
++    AVRational sar;
++
++    int overscan_info_present_flag;
++    int overscan_appropriate_flag;
++
++    int video_signal_type_present_flag;
++    int video_format;
++    int video_full_range_flag;
++    int colour_description_present_flag;
++    uint8_t colour_primaries;
++    uint8_t transfer_characteristic;
++    uint8_t matrix_coeffs;
++
++    int chroma_loc_info_present_flag;
++    int
chroma_sample_loc_type_top_field; ++ int chroma_sample_loc_type_bottom_field; ++ int neutra_chroma_indication_flag; ++ ++ int field_seq_flag; ++ int frame_field_info_present_flag; ++ ++ int default_display_window_flag; ++ HEVCRpiWindow def_disp_win; ++ ++ int vui_timing_info_present_flag; ++ uint32_t vui_num_units_in_tick; ++ uint32_t vui_time_scale; ++ int vui_poc_proportional_to_timing_flag; ++ int vui_num_ticks_poc_diff_one_minus1; ++ int vui_hrd_parameters_present_flag; ++ ++ int bitstream_restriction_flag; ++ int tiles_fixed_structure_flag; ++ int motion_vectors_over_pic_boundaries_flag; ++ int restricted_ref_pic_lists_flag; ++ int min_spatial_segmentation_idc; ++ int max_bytes_per_pic_denom; ++ int max_bits_per_min_cu_denom; ++ int log2_max_mv_length_horizontal; ++ int log2_max_mv_length_vertical; ++} VUI; ++ ++typedef struct PTLCommon { ++ uint8_t profile_space; ++ uint8_t tier_flag; ++ uint8_t profile_idc; ++ uint8_t profile_compatibility_flag[32]; ++ uint8_t level_idc; ++ uint8_t progressive_source_flag; ++ uint8_t interlaced_source_flag; ++ uint8_t non_packed_constraint_flag; ++ uint8_t frame_only_constraint_flag; ++} PTLCommon; ++ ++typedef struct PTL { ++ PTLCommon general_ptl; ++ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS]; ++ ++ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS]; ++ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS]; ++} PTL; ++ ++typedef struct HEVCRpiVPS { ++ uint8_t vps_temporal_id_nesting_flag; ++ int vps_max_layers; ++ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1 ++ ++ PTL ptl; ++ int vps_sub_layer_ordering_info_present_flag; ++ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS]; ++ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS]; ++ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS]; ++ int vps_max_layer_id; ++ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1 ++ uint8_t vps_timing_info_present_flag; ++ uint32_t vps_num_units_in_tick; ++ uint32_t vps_time_scale; ++ uint8_t vps_poc_proportional_to_timing_flag; ++ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1 ++ int vps_num_hrd_parameters; ++ ++ uint8_t data[4096]; ++ int data_size; ++} HEVCRpiVPS; ++ ++typedef struct ScalingList { ++ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs, ++ * and size ID 3 only has 2 arrays, not 6. 
*/ ++ uint8_t sl[4][6][64]; ++ uint8_t sl_dc[2][6]; ++} ScalingList; ++ ++typedef struct HEVCRpiSPS { ++ unsigned vps_id; ++ uint8_t chroma_format_idc; ++ uint8_t separate_colour_plane_flag; ++ ++ HEVCRpiWindow output_window; ++ ++ HEVCRpiWindow pic_conf_win; ++ ++ uint16_t wp_offset_half_range; // WpOffsetHalfRange ++ ++ uint8_t bit_depth; ++ ++// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth ++ uint8_t pixel_shift; ++ enum AVPixelFormat pix_fmt; ++ ++ unsigned int log2_max_poc_lsb; ++ ++ int max_sub_layers; ++ struct { ++ int max_dec_pic_buffering; ++ int num_reorder_pics; ++ int max_latency_increase; ++ } temporal_layer[HEVC_MAX_SUB_LAYERS]; ++ uint8_t temporal_id_nesting_flag; ++ ++ uint8_t scaling_list_enable_flag; ++ ScalingList scaling_list; ++ ++ unsigned int nb_st_rps; ++ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS]; ++ ++ uint8_t amp_enabled_flag; ++ uint8_t sao_enabled; ++ ++ uint8_t long_term_ref_pics_present_flag; ++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS]; ++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS]; ++ uint8_t num_long_term_ref_pics_sps; ++ ++ struct { ++ uint8_t bit_depth; ++ uint8_t bit_depth_chroma; ++ uint8_t log2_min_pcm_cb_size; ++ uint8_t log2_max_pcm_cb_size; ++ uint8_t loop_filter_disable_flag; ++ } pcm; ++ char sps_temporal_mvp_enabled_flag; ++// char sps_strong_intra_smoothing_enable_flag; -> intra_filtes_disable ++ ++ uint8_t log2_min_cb_size; // 3..6 ++ uint8_t log2_diff_max_min_coding_block_size; ++ uint8_t log2_min_tb_size; // 2..5 ++ uint8_t log2_max_trafo_size; ++ uint8_t log2_ctb_size; // 4..6 ++// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1) ++#define LOG2_MIN_PU_SIZE 2 ++#define LOG2_MIN_CU_SIZE 3 ++ ++ uint8_t max_transform_hierarchy_depth_inter; ++ uint8_t max_transform_hierarchy_depth_intra; ++ ++ char transform_skip_rotation_enabled_flag; ++ char transform_skip_context_enabled_flag; ++ char implicit_rdpcm_enabled_flag; ++ char explicit_rdpcm_enabled_flag; ++// char intra_smoothing_disabled_flag; -> intra_filtes_disable ++ char high_precision_offsets_enabled_flag; ++ char persistent_rice_adaptation_enabled_flag; ++ ++ uint8_t intra_filters_disable; ++ ++ ///< coded frame dimension in various units ++ int width; ++ int height; ++ int ctb_width; ++ int ctb_height; ++ int ctb_size; // Pic size in CTBs not size of a CTB ++ int min_cb_width; ++ int min_cb_height; ++ int min_tb_width; ++ int min_tb_height; ++ int min_pu_width; ++ int min_pu_height; ++ int pcm_width; ++ int pcm_height; ++ int tb_mask; ++ ++ int hshift[3]; ++ int vshift[3]; ++ ++ int qp_bd_offset; ++ ++ uint8_t data[4096]; ++ int data_size; ++ ++ VUI vui; ++ PTL ptl; ++} HEVCRpiSPS; ++ ++#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line ++#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line ++#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line ++#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile ++#define CTB_TS_FLAGS_CSAVE (1U << 4) ++#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request ++#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile ++#define CTB_TS_FLAGS_CLOAD (1U << 7) ++ ++typedef struct HEVCRpiPPS { ++ unsigned int sps_id; ///< seq_parameter_set_id ++ ++ uint8_t sign_data_hiding_flag; ++ ++ uint8_t cabac_init_present_flag; ++ ++ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1 ++ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1 ++ int pic_init_qp_minus26; ++ ++ uint8_t 
constrained_intra_pred_flag; ++ uint8_t transform_skip_enabled_flag; ++ ++ uint8_t cu_qp_delta_enabled_flag; ++ uint8_t log2_min_cu_qp_delta_size; ++ int cb_qp_offset; // -12..12 ++ int cr_qp_offset; // -12..12 ++ const uint8_t * qp_dblk_x[3]; ++ const int8_t * qp_bd_x[3]; ++ ++ uint8_t pic_slice_level_chroma_qp_offsets_present_flag; ++ uint8_t weighted_pred_flag; ++ uint8_t weighted_bipred_flag; ++ uint8_t output_flag_present_flag; ++ uint8_t transquant_bypass_enable_flag; ++ ++ uint8_t dependent_slice_segments_enabled_flag; ++ uint8_t tiles_enabled_flag; ++ uint8_t entropy_coding_sync_enabled_flag; ++ ++ uint8_t tile_wpp_inter_disable; ++ int num_tile_columns; ///< num_tile_columns_minus1 + 1 ++ int num_tile_rows; ///< num_tile_rows_minus1 + 1 ++ uint8_t uniform_spacing_flag; ++ uint8_t loop_filter_across_tiles_enabled_flag; ++ ++ uint8_t seq_loop_filter_across_slices_enabled_flag; ++ ++ uint8_t deblocking_filter_control_present_flag; ++ uint8_t deblocking_filter_override_enabled_flag; ++ uint8_t disable_dbf; ++ int beta_offset; ///< beta_offset_div2 * 2 ++ int tc_offset; ///< tc_offset_div2 * 2 ++ ++ uint8_t scaling_list_data_present_flag; ++ ScalingList scaling_list; ++ ++ uint8_t lists_modification_present_flag; ++ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2 ++ int num_extra_slice_header_bits; ++ uint8_t slice_header_extension_present_flag; ++ uint8_t log2_max_transform_skip_block_size; ++ uint8_t cross_component_prediction_enabled_flag; ++ uint8_t chroma_qp_offset_list_enabled_flag; ++ uint8_t diff_cu_chroma_qp_offset_depth; ++ uint8_t chroma_qp_offset_list_len_minus1; ++ int8_t cb_qp_offset_list[6]; ++ int8_t cr_qp_offset_list[6]; ++ uint8_t log2_sao_offset_scale_luma; ++ uint8_t log2_sao_offset_scale_chroma; ++ ++ // Inferred parameters ++ uint16_t *column_width; ///< ColumnWidth ++ uint16_t *row_height; ///< RowHeight ++ uint16_t *col_bd; ///< ColBd ++ uint16_t *row_bd; ///< RowBd ++ uint16_t *col_idxX; ++ ++ // We can limit these to uint16_t given our other size limits ++ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS ++ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS ++ uint16_t *tile_id; ///< TileId ++ uint16_t *tile_pos_ts; ///< TilePosRS ++ uint16_t *tile_size; ///< TileSize ++ uint8_t * ctb_ts_flags; ++ ++ uint8_t data[4096]; ++ int data_size; ++} HEVCRpiPPS; ++ ++typedef struct HEVCRpiParamSets { ++ /* currently active parameter sets */ ++ const HEVCRpiVPS *vps; ++ const HEVCRpiSPS *sps; ++ const HEVCRpiPPS *pps; ++ ++ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; ++ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; ++ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; ++} HEVCRpiParamSets; ++ ++int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps); ++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps, int apply_defdispwin); ++int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps); ++ ++int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, ++ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header); ++ ++int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id, ++ uint8_t *buf, int buf_size); ++ ++/** ++ * Compute POC of the current frame and return it. 
++ */ ++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type); ++ ++#endif /* AVCODEC_RPI_HEVC_PS_H */ +diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c +new file mode 100644 +index 0000000000..8cc5796cf0 +--- /dev/null ++++ b/libavcodec/rpi_hevc_refs.c +@@ -0,0 +1,485 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "internal.h" ++#include "thread.h" ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++ ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags) ++{ ++ /* frame->frame can be NULL if context init failed */ ++ if (!frame->frame || !frame->frame->buf[0]) ++ return; ++ ++ frame->flags &= ~flags; ++ if (!frame->flags) { ++ ff_thread_release_buffer(s->avctx, &frame->tf); ++ ++ av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL ++ frame->col_mvf = NULL; ++ ++ frame->collocated_ref = NULL; ++ } ++} ++ ++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s) ++{ ++ int i; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ++ HEVC_FRAME_FLAG_SHORT_REF | ++ HEVC_FRAME_FLAG_LONG_REF); ++} ++ ++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s) ++{ ++ int i; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++} ++ ++static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s) ++{ ++ int i, ret; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame * const frame = &s->DPB[i]; ++ if (frame->frame->buf[0]) ++ continue; ++ ++ ret = ff_thread_get_buffer(s->avctx, &frame->tf, ++ AV_GET_BUFFER_FLAG_REF); ++ if (ret < 0) ++ return NULL; ++ ++ frame->col_mvf = NULL; ++ frame->col_mvf_buf = NULL; ++ if (s->used_for_ref && !s->is_irap) ++ { ++ frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool); ++ if (!frame->col_mvf_buf) ++ goto fail; ++ frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data; ++ } ++ ++ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; ++ frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); ++ ++ return frame; ++ ++fail: ++ ff_hevc_rpi_unref_frame(s, frame, ~0); ++ return NULL; ++ } ++ av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n"); ++ return NULL; ++} ++ ++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc) ++{ ++ HEVCRpiFrame *ref; ++ int i; ++ ++ /* check that this POC doesn't already exist */ 
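    /* Added note (illustrative, not from the patch itself): a DPB slot in
     * s->DPB[] stays alive while any of HEVC_FRAME_FLAG_OUTPUT, _SHORT_REF,
     * _LONG_REF or _BUMPING is set; ff_hevc_rpi_unref_frame() above clears
     * the requested flags and releases the frame buffers only once none
     * remain. Within one coded video sequence a POC identifies a frame
     * uniquely, so a duplicate here would make reference lookup
     * (find_ref_idx() later in this file) ambiguous -- hence the hard
     * error in the loop that follows. */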
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ ++ if (frame->frame->buf[0] && frame->sequence == s->seq_decode && ++ frame->poc == poc) { ++ av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n", ++ poc); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ ref = alloc_frame(s); ++ if (!ref) ++ return AVERROR(ENOMEM); ++ ++ *frame = ref->frame; ++ s->ref = ref; ++ ++ if (s->sh.pic_output_flag) ++ ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF; ++ else ++ ref->flags = HEVC_FRAME_FLAG_SHORT_REF; ++ ++ ref->poc = poc; ++ ref->sequence = s->seq_decode; ++ ref->frame->crop_left = s->ps.sps->output_window.left_offset; ++ ref->frame->crop_right = s->ps.sps->output_window.right_offset; ++ ref->frame->crop_top = s->ps.sps->output_window.top_offset; ++ ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset; ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush) ++{ ++ do { ++ int nb_output = 0; ++ int min_poc = INT_MAX; ++ int i, min_idx, ret; ++ ++ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) { ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc && ++ frame->sequence == s->seq_output) { ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ } ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) && ++ frame->sequence == s->seq_output) { ++ nb_output++; ++ if (frame->poc < min_poc || nb_output == 1) { ++ min_poc = frame->poc; ++ min_idx = i; ++ } ++ } ++ } ++ ++ /* wait for more frames before output */ ++ if (!flush && s->seq_output == s->seq_decode && s->ps.sps && ++ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics) ++ return 0; ++ ++ if (nb_output) { ++ HEVCRpiFrame *frame = &s->DPB[min_idx]; ++ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) ++ return 0; ++ ++ ret = av_frame_ref(out, frame->frame); ++ if (frame->flags & HEVC_FRAME_FLAG_BUMPING) ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING); ++ else ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ if (ret < 0) ++ return ret; ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "Output frame with POC %d.\n", frame->poc); ++ return 1; ++ } ++ ++ if (s->seq_output != s->seq_decode) ++ s->seq_output = (s->seq_output + 1) & 0xff; ++ else ++ break; ++ } while (1); ++ ++ return 0; ++} ++ ++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s) ++{ ++ int dpb = 0; ++ int min_poc = INT_MAX; ++ int i; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags) && ++ frame->sequence == s->seq_output && ++ frame->poc != s->poc) { ++ dpb++; ++ } ++ } ++ ++ if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags) && ++ frame->sequence == s->seq_output && ++ frame->poc != s->poc) { ++ if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { ++ min_poc = frame->poc; ++ } ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if (frame->flags & HEVC_FRAME_FLAG_OUTPUT && ++ frame->sequence == s->seq_output && ++ frame->poc <= min_poc) 
{ ++ frame->flags |= HEVC_FRAME_FLAG_BUMPING; ++ } ++ } ++ ++ dpb--; ++ } ++} ++ ++static int init_slice_rpl(HEVCRpiContext *s) ++{ ++ if (s->slice_idx >= s->rpl_tab_size) ++ return AVERROR_INVALIDDATA; ++ ++ s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0; ++ return 0; ++} ++ ++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s) ++{ ++ RpiSliceHeader *sh = &s->sh; ++ ++ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; ++ uint8_t list_idx; ++ int i, j, ret; ++ ++ ret = init_slice_rpl(s); ++ if (ret < 0) ++ return ret; ++ ++ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs + ++ s->rps[LT_CURR].nb_refs)) { ++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ for (list_idx = 0; list_idx < nb_list; list_idx++) { ++ RefPicList rpl_tmp = { { 0 } }; ++ RefPicList *rpl = &s->refPicList[list_idx]; ++ ++ /* The order of the elements is ++ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and ++ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */ ++ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF, ++ list_idx ? ST_CURR_BEF : ST_CURR_AFT, ++ LT_CURR }; ++ ++ /* concatenate the candidate lists for the current frame */ ++ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) { ++ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) { ++ RefPicList *rps = &s->rps[cand_lists[i]]; ++ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) { ++ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j]; ++ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j]; ++ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2; ++ rpl_tmp.nb_refs++; ++ } ++ } ++ } ++ ++ /* reorder the references if necessary */ ++ if (sh->rpl_modification_flag[list_idx]) { ++ for (i = 0; i < sh->nb_refs[list_idx]; i++) { ++ int idx = sh->list_entry_lx[list_idx][i]; ++ ++ if (idx >= rpl_tmp.nb_refs) { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ rpl->list[i] = rpl_tmp.list[idx]; ++ rpl->ref[i] = rpl_tmp.ref[idx]; ++ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx]; ++ rpl->nb_refs++; ++ } ++ } else { ++ memcpy(rpl, &rpl_tmp, sizeof(*rpl)); ++ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]); ++ } ++ ++ if (sh->collocated_list == list_idx && ++ sh->collocated_ref_idx < rpl->nb_refs) ++ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx]; ++ } ++ ++ return 0; ++} ++ ++static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc) ++{ ++ int i; ++ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *ref = &s->DPB[i]; ++ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { ++ if ((ref->poc & LtMask) == poc) ++ return ref; ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *ref = &s->DPB[i]; ++ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) { ++ if (ref->poc == poc || (ref->poc & LtMask) == poc) ++ return ref; ++ } ++ } ++ ++ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s)) ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Could not find ref with POC %d\n", poc); ++ return NULL; ++} ++ ++static void mark_ref(HEVCRpiFrame *frame, int flag) ++{ ++ frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF); ++ frame->flags |= flag; ++} ++ ++static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc) ++{ ++ HEVCRpiFrame *frame; ++ int i, x, y; ++ ++ frame = alloc_frame(s); ++ if (!frame) ++ return NULL; ++ ++ if (!s->ps.sps->pixel_shift) { ++ for (i = 0; frame->frame->buf[i]; 
i++) ++ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1), ++ frame->frame->buf[i]->size); ++ } else { ++ for (i = 0; frame->frame->data[i]; i++) ++ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++) ++ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) { ++ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x, ++ 1 << (s->ps.sps->bit_depth - 1)); ++ } ++ } ++ ++ frame->poc = poc; ++ frame->sequence = s->seq_decode; ++ frame->flags = 0; ++ ++ ff_hevc_rpi_progress_set_all_done(frame); ++ ++ return frame; ++} ++ ++/* add a reference with the given poc to the list and mark it as used in DPB */ ++static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list, ++ int poc, int ref_flag) ++{ ++ HEVCRpiFrame *ref = find_ref_idx(s, poc); ++ ++ if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS) ++ return AVERROR_INVALIDDATA; ++ ++ if (!ref) { ++ ref = generate_missing_ref(s, poc); ++ if (!ref) ++ return AVERROR(ENOMEM); ++ } ++ ++ list->list[list->nb_refs] = ref->poc; ++ list->ref[list->nb_refs] = ref; ++ list->nb_refs++; ++ ++ mark_ref(ref, ref_flag); ++ return 0; ++} ++ ++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s) ++{ ++ const ShortTermRPS *short_rps = s->sh.short_term_rps; ++ const LongTermRPS *long_rps = &s->sh.long_term_rps; ++ RefPicList *rps = s->rps; ++ int i, ret = 0; ++ ++ if (!short_rps) { ++ rps[0].nb_refs = rps[1].nb_refs = 0; ++ return 0; ++ } ++ ++ /* clear the reference flags on all frames except the current one */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ ++ if (frame == s->ref) ++ continue; ++ ++ mark_ref(frame, 0); ++ } ++ ++ for (i = 0; i < NB_RPS_TYPE; i++) ++ rps[i].nb_refs = 0; ++ ++ /* add the short refs */ ++ for (i = 0; i < short_rps->num_delta_pocs; i++) { ++ int poc = s->poc + short_rps->delta_poc[i]; ++ int list; ++ ++ if (!short_rps->used[i]) ++ list = ST_FOLL; ++ else if (i < short_rps->num_negative_pics) ++ list = ST_CURR_BEF; ++ else ++ list = ST_CURR_AFT; ++ ++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF); ++ if (ret < 0) ++ goto fail; ++ } ++ ++ /* add the long refs */ ++ for (i = 0; i < long_rps->nb_refs; i++) { ++ int poc = long_rps->poc[i]; ++ int list = long_rps->used[i] ? LT_CURR : LT_FOLL; ++ ++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF); ++ if (ret < 0) ++ goto fail; ++ } ++ ++fail: ++ /* release any frames that are now unused */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0); ++ ++ return ret; ++} ++ ++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s) ++{ ++ int ret = 0; ++ int i; ++ const ShortTermRPS *rps = s->sh.short_term_rps; ++ LongTermRPS *long_rps = &s->sh.long_term_rps; ++ ++ if (rps) { ++ for (i = 0; i < rps->num_negative_pics; i++) ++ ret += !!rps->used[i]; ++ for (; i < rps->num_delta_pocs; i++) ++ ret += !!rps->used[i]; ++ } ++ ++ if (long_rps) { ++ for (i = 0; i < long_rps->nb_refs; i++) ++ ret += !!long_rps->used[i]; ++ } ++ return ret; ++} +diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c +new file mode 100644 +index 0000000000..cd8149d58e +--- /dev/null ++++ b/libavcodec/rpi_hevc_sei.c +@@ -0,0 +1,368 @@ ++/* ++ * HEVC Supplementary Enhancement Information messages ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2013 Vittorio Giovara ++ * ++ * This file is part of FFmpeg. 
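The reference-list builder ff_hevc_rpi_slice_rpl() above concatenates the ST_CURR_BEF, ST_CURR_AFT and LT_CURR candidate sets (L1 swaps the two short-term sets) and keeps cycling through them until the slice's requested reference count is filled. A simplified standalone sketch with made-up POC values; the real loop additionally caps at HEVC_MAX_REFS and may then apply the slice's explicit reordering:

    #include <stdio.h>

    int main(void)
    {
        int st_curr_bef[] = { 8, 6 };   /* short-term refs, POC before current */
        int st_curr_aft[] = { 12 };     /* short-term refs, POC after current */
        int lt_curr[]     = { 0 };      /* long-term refs */
        int nb_refs = 5, list0[16], n = 0;

        /* L0 order: ST_CURR_BEF, ST_CURR_AFT, LT_CURR; cycle through the
         * candidates until nb_refs entries exist. */
        while (n < nb_refs) {
            for (int i = 0; i < 2 && n < nb_refs; i++) list0[n++] = st_curr_bef[i];
            for (int i = 0; i < 1 && n < nb_refs; i++) list0[n++] = st_curr_aft[i];
            for (int i = 0; i < 1 && n < nb_refs; i++) list0[n++] = lt_curr[i];
        }

        for (int i = 0; i < n; i++)
            printf("%d ", list0[i]);    /* prints: 8 6 12 0 8 */
        printf("\n");
        return 0;
    }

The wrap-around repetition is why a slice may legally request more active references than there are distinct pictures in the RPS.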
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "golomb.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++ ++static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb) ++{ ++ int cIdx, i; ++ uint8_t hash_type; ++ //uint16_t picture_crc; ++ //uint32_t picture_checksum; ++ hash_type = get_bits(gb, 8); ++ ++ for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) { ++ if (hash_type == 0) { ++ s->is_md5 = 1; ++ for (i = 0; i < 16; i++) ++ s->md5[cIdx][i] = get_bits(gb, 8); ++ } else if (hash_type == 1) { ++ // picture_crc = get_bits(gb, 16); ++ skip_bits(gb, 16); ++ } else if (hash_type == 2) { ++ // picture_checksum = get_bits_long(gb, 32); ++ skip_bits(gb, 32); ++ } ++ } ++ return 0; ++} ++ ++static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb) ++{ ++ int i; ++ // Mastering primaries ++ for (i = 0; i < 3; i++) { ++ s->display_primaries[i][0] = get_bits(gb, 16); ++ s->display_primaries[i][1] = get_bits(gb, 16); ++ } ++ // White point (x, y) ++ s->white_point[0] = get_bits(gb, 16); ++ s->white_point[1] = get_bits(gb, 16); ++ ++ // Max and min luminance of mastering display ++ s->max_luminance = get_bits_long(gb, 32); ++ s->min_luminance = get_bits_long(gb, 32); ++ ++ // As this SEI message comes before the first frame that references it, ++ // initialize the flag to 2 and decrement on IRAP access unit so it ++ // persists for the coded video sequence (e.g., between two IRAPs) ++ s->present = 2; ++ return 0; ++} ++ ++static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb) ++{ ++ // Max and average light levels ++ s->max_content_light_level = get_bits_long(gb, 16); ++ s->max_pic_average_light_level = get_bits_long(gb, 16); ++ // As this SEI message comes before the first frame that references it, ++ // initialize the flag to 2 and decrement on IRAP access unit so it ++ // persists for the coded video sequence (e.g., between two IRAPs) ++ s->present = 2; ++ return 0; ++} ++ ++static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb) ++{ ++ get_ue_golomb_long(gb); // frame_packing_arrangement_id ++ s->present = !get_bits1(gb); ++ ++ if (s->present) { ++ s->arrangement_type = get_bits(gb, 7); ++ s->quincunx_subsampling = get_bits1(gb); ++ s->content_interpretation_type = get_bits(gb, 6); ++ ++ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag ++ skip_bits(gb, 3); ++ s->current_frame_is_frame0_flag = get_bits1(gb); ++ // frame0_self_contained_flag, frame1_self_contained_flag ++ skip_bits(gb, 2); ++ ++ if (!s->quincunx_subsampling && s->arrangement_type != 5) ++ skip_bits(gb, 16); // frame[01]_grid_position_[xy] ++ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte ++ skip_bits1(gb); // 
frame_packing_arrangement_persistence_flag ++ } ++ skip_bits1(gb); // upsampled_aspect_ratio_flag ++ return 0; ++} ++ ++static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb) ++{ ++ s->present = !get_bits1(gb); ++ ++ if (s->present) { ++ s->hflip = get_bits1(gb); // hor_flip ++ s->vflip = get_bits1(gb); // ver_flip ++ ++ s->anticlockwise_rotation = get_bits(gb, 16); ++ skip_bits1(gb); // display_orientation_persistence_flag ++ } ++ ++ return 0; ++} ++ ++static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps, ++ void *logctx, int size) ++{ ++ HEVCSEIPictureTiming *h = &s->picture_timing; ++ HEVCRpiSPS *sps; ++ ++ if (!ps->sps_list[s->active_seq_parameter_set_id]) ++ return(AVERROR(ENOMEM)); ++ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data; ++ ++ if (sps->vui.frame_field_info_present_flag) { ++ int pic_struct = get_bits(gb, 4); ++ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; ++ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) { ++ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n"); ++ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD; ++ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) { ++ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n"); ++ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD; ++ } ++ get_bits(gb, 2); // source_scan_type ++ get_bits(gb, 1); // duplicate_flag ++ skip_bits1(gb); ++ size--; ++ } ++ skip_bits_long(gb, 8 * size); ++ ++ return 0; ++} ++ ++static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb, ++ int size) ++{ ++ int flag; ++ int user_data_type_code; ++ int cc_count; ++ ++ if (size < 3) ++ return AVERROR(EINVAL); ++ ++ user_data_type_code = get_bits(gb, 8); ++ if (user_data_type_code == 0x3) { ++ skip_bits(gb, 1); // reserved ++ ++ flag = get_bits(gb, 1); // process_cc_data_flag ++ if (flag) { ++ skip_bits(gb, 1); ++ cc_count = get_bits(gb, 5); ++ skip_bits(gb, 8); // reserved ++ size -= 2; ++ ++ if (cc_count && size >= cc_count * 3) { ++ const uint64_t new_size = (s->a53_caption_size + cc_count ++ * UINT64_C(3)); ++ int i, ret; ++ ++ if (new_size > INT_MAX) ++ return AVERROR(EINVAL); ++ ++ /* Allow merging of the cc data from two fields. 
*/ ++ ret = av_reallocp(&s->a53_caption, new_size); ++ if (ret < 0) ++ return ret; ++ ++ for (i = 0; i < cc_count; i++) { ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ } ++ skip_bits(gb, 8); // marker_bits ++ } ++ } ++ } else { ++ int i; ++ for (i = 0; i < size - 1; i++) ++ skip_bits(gb, 8); ++ } ++ ++ return 0; ++} ++ ++static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb, ++ int size) ++{ ++ uint32_t country_code; ++ uint32_t user_identifier; ++ ++ if (size < 7) ++ return AVERROR(EINVAL); ++ size -= 7; ++ ++ country_code = get_bits(gb, 8); ++ if (country_code == 0xFF) { ++ skip_bits(gb, 8); ++ size--; ++ } ++ ++ skip_bits(gb, 8); ++ skip_bits(gb, 8); ++ ++ user_identifier = get_bits_long(gb, 32); ++ ++ switch (user_identifier) { ++ case MKBETAG('G', 'A', '9', '4'): ++ return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size); ++ default: ++ skip_bits_long(gb, size * 8); ++ break; ++ } ++ return 0; ++} ++ ++static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx) ++{ ++ int num_sps_ids_minus1; ++ int i; ++ unsigned active_seq_parameter_set_id; ++ ++ get_bits(gb, 4); // active_video_parameter_set_id ++ get_bits(gb, 1); // self_contained_cvs_flag ++ get_bits(gb, 1); // num_sps_ids_minus1 ++ num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1 ++ ++ if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) { ++ av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ active_seq_parameter_set_id = get_ue_golomb_long(gb); ++ if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id); ++ return AVERROR_INVALIDDATA; ++ } ++ s->active_seq_parameter_set_id = active_seq_parameter_set_id; ++ ++ for (i = 1; i <= num_sps_ids_minus1; i++) ++ get_ue_golomb_long(gb); // active_seq_parameter_set_id[i] ++ ++ return 0; ++} ++ ++static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb) ++{ ++ s->present = 1; ++ s->preferred_transfer_characteristics = get_bits(gb, 8); ++ return 0; ++} ++ ++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, ++ int type, int size) ++{ ++ switch (type) { ++ case 256: // Mismatched value from HM 8.1 ++ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); ++ case HEVC_SEI_TYPE_FRAME_PACKING: ++ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb); ++ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION: ++ return decode_nal_sei_display_orientation(&s->display_orientation, gb); ++ case HEVC_SEI_TYPE_PICTURE_TIMING: ++ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size); ++ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO: ++ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb); ++ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO: ++ return decode_nal_sei_content_light_info(&s->content_light, gb); ++ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS: ++ return decode_nal_sei_active_parameter_sets(s, gb, logctx); ++ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35: ++ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size); ++ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS: ++ return 
decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb); ++ default: ++ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type); ++ skip_bits_long(gb, 8 * size); ++ return 0; ++ } ++} ++ ++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ int type, int size) ++{ ++ switch (type) { ++ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: ++ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); ++ default: ++ av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type); ++ skip_bits_long(gb, 8 * size); ++ return 0; ++ } ++} ++ ++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, ++ const HEVCRpiParamSets * const ps, const int nal_unit_type) ++{ ++ int payload_type = 0; ++ int payload_size = 0; ++ int byte = 0xFF; ++ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n"); ++ ++ while (byte == 0xFF) { ++ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255) ++ return AVERROR_INVALIDDATA; ++ byte = get_bits(gb, 8); ++ payload_type += byte; ++ } ++ byte = 0xFF; ++ while (byte == 0xFF) { ++ if (get_bits_left(gb) < 8 + 8LL*payload_size) ++ return AVERROR_INVALIDDATA; ++ byte = get_bits(gb, 8); ++ payload_size += byte; ++ } ++ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { ++ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size); ++ } else { /* nal_unit_type == NAL_SEI_SUFFIX */ ++ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size); ++ } ++} ++ ++static int more_rbsp_data(GetBitContext *gb) ++{ ++ return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80; ++} ++ ++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ const HEVCRpiParamSets *ps, int type) ++{ ++ int ret; ++ ++ do { ++ ret = decode_nal_sei_message(gb, logctx, s, ps, type); ++ if (ret < 0) ++ return ret; ++ } while (more_rbsp_data(gb)); ++ return 1; ++} ++ ++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s) ++{ ++ s->a53_caption.a53_caption_size = 0; ++ av_freep(&s->a53_caption.a53_caption); ++} +diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h +new file mode 100644 +index 0000000000..d4ac348df9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_sei.h +@@ -0,0 +1,135 @@ ++/* ++ * HEVC Supplementary Enhancement Information messages ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
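decode_nal_sei_message() above reads the SEI payload header as specified in H.265 D.2.1: payload_type and payload_size are each coded as a run of 0xFF bytes (contributing 255 apiece) followed by one terminating byte below 0xFF. A tiny standalone sketch of that coding with an invented byte sequence:

    #include <stdio.h>

    /* Accumulate one SEI payload_type / payload_size value. */
    static int read_sei_value(const unsigned char **p)
    {
        int v = 0;
        while (**p == 0xFF)
            v += *(*p)++;   /* each 0xFF byte contributes 255 */
        v += *(*p)++;       /* final byte < 0xFF terminates the run */
        return v;
    }

    int main(void)
    {
        /* 0xFF 0xFF 0x04 -> 255 + 255 + 4 = 514 */
        const unsigned char buf[] = { 0xFF, 0xFF, 0x04 };
        const unsigned char *p = buf;
        printf("%d\n", read_sei_value(&p));  /* prints 514 */
        return 0;
    }

This open-ended coding is also why the decoder's loop re-checks get_bits_left() on every byte rather than trusting a declared length.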
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_SEI_H
++#define AVCODEC_RPI_HEVC_SEI_H
++
++#include <stdint.h>
++
++#include "libavutil/md5.h"
++
++#include "get_bits.h"
++
++/**
++ * SEI message types
++ */
++typedef enum {
++    HEVC_SEI_TYPE_BUFFERING_PERIOD = 0,
++    HEVC_SEI_TYPE_PICTURE_TIMING = 1,
++    HEVC_SEI_TYPE_PAN_SCAN_RECT = 2,
++    HEVC_SEI_TYPE_FILLER_PAYLOAD = 3,
++    HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4,
++    HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5,
++    HEVC_SEI_TYPE_RECOVERY_POINT = 6,
++    HEVC_SEI_TYPE_SCENE_INFO = 9,
++    HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15,
++    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
++    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17,
++    HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19,
++    HEVC_SEI_TYPE_POST_FILTER_HINT = 22,
++    HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23,
++    HEVC_SEI_TYPE_FRAME_PACKING = 45,
++    HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47,
++    HEVC_SEI_TYPE_SOP_DESCRIPTION = 128,
++    HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129,
++    HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130,
++    HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131,
++    HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132,
++    HEVC_SEI_TYPE_SCALABLE_NESTING = 133,
++    HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134,
++    HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137,
++    HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144,
++    HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
++} HEVC_SEI_Type;
++
++typedef struct HEVCSEIPictureHash {
++    uint8_t md5[3][16];
++    uint8_t is_md5;
++} HEVCSEIPictureHash;
++
++typedef struct HEVCSEIFramePacking {
++    int present;
++    int arrangement_type;
++    int content_interpretation_type;
++    int quincunx_subsampling;
++    int current_frame_is_frame0_flag;
++} HEVCSEIFramePacking;
++
++typedef struct HEVCSEIDisplayOrientation {
++    int present;
++    int anticlockwise_rotation;
++    int hflip, vflip;
++} HEVCSEIDisplayOrientation;
++
++typedef struct HEVCSEIPictureTiming {
++    int picture_struct;
++} HEVCSEIPictureTiming;
++
++typedef struct HEVCSEIA53Caption {
++    int a53_caption_size;
++    uint8_t *a53_caption;
++} HEVCSEIA53Caption;
++
++typedef struct HEVCSEIMasteringDisplay {
++    int present;
++    uint16_t display_primaries[3][2];
++    uint16_t white_point[2];
++    uint32_t max_luminance;
++    uint32_t min_luminance;
++} HEVCSEIMasteringDisplay;
++
++typedef struct HEVCSEIContentLight {
++    int present;
++    uint16_t max_content_light_level;
++    uint16_t max_pic_average_light_level;
++} HEVCSEIContentLight;
++
++typedef struct HEVCSEIAlternativeTransfer {
++    int present;
++    int preferred_transfer_characteristics;
++} HEVCSEIAlternativeTransfer;
++
++typedef struct HEVCSEIContext {
++    HEVCSEIPictureHash picture_hash;
++    HEVCSEIFramePacking frame_packing;
++    HEVCSEIDisplayOrientation display_orientation;
++    HEVCSEIPictureTiming picture_timing;
++    HEVCSEIA53Caption a53_caption;
++    HEVCSEIMasteringDisplay mastering_display;
++    HEVCSEIContentLight content_light;
++    int active_seq_parameter_set_id;
++    HEVCSEIAlternativeTransfer alternative_transfer;
++} HEVCSEIContext;
++
++struct HEVCRpiParamSets;
++
++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++                               const struct HEVCRpiParamSets *ps, int type);
++
++/**
++ * Reset SEI values that are stored on the Context.
++ * e.g. Caption data that was extracted during NAL
++ * parsing.
++ *
++ * @param s HEVCRpiContext.
++ */ ++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s); ++ ++#endif /* AVCODEC_RPI_HEVC_SEI_H */ +diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c +new file mode 100644 +index 0000000000..23b49a99ae +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.c +@@ -0,0 +1,1537 @@ ++#include "rpi_hevc_shader.h" ++ ++#ifdef _MSC_VER ++ #include ++ /* cast through uintptr_t to avoid warnings */ ++ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) ++#else ++ #define POINTER_TO_UINT(X) ((unsigned int)(X)) ++#endif ++ ++#ifdef __cplusplus ++extern "C" { /* the types are probably wrong... */ ++#endif ++#ifdef __cplusplus ++} ++#endif ++ ++#ifdef _MSC_VER ++__declspec(align(8)) ++#elif defined(__GNUC__) ++__attribute__((aligned(8))) ++#endif ++unsigned int ff_hevc_rpi_shader[] = { ++// ::mc_setup_c_q0 ++// ::mc_start ++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c_qn ++/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif ++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch ++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num ++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif ++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000118] */ 
0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
++/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
++/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
++// :1
++/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c_p
++/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
++/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
++/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c_p_l1
++/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
++/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
++/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
++/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c_b
++/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
++/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
++/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
++/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
++/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
++/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
++/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
++/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
++/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
++/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
++/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
++/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
++/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
++/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
++/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
++/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
++/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
++/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
++/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
++/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
++/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
++// :1
++/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
++/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
++/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
++/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
++/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
++/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
++/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
++/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
++/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
++/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
++/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
++/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
++/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
++/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
++/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
++/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
++/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
++/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
++/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
++/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
++/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
++/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
++/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
++/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
++/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_sync_q0
++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q1
++/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q2
++/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q3
++/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
++/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync_q4
++/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q5
++/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q6
++/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q7
++/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync_q8
++/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q9
++/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q10
++/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
++// ::mc_sync_q11
++/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
++/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_qn
++// ::mc_exit_y_qn
++/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_q0
++// ::mc_exit_y_q0
++/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
++/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
++/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
++/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y_q0
++/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_y_qn
++/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
++/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
++/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
++/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
++/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
++/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
++/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
++/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
++/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
++/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
++// :1
++/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++// :per_block_setup_8
++/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
++/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
++/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
++/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
++/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
++/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
++/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
++/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
++/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
++/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
++/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
++/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
++/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
++/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
++/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
++/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
++/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
++/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
++/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
++/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
++/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
++/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
++/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
++/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
++// ::mc_filter_y_pxx
++/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
++/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
++/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
++/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
++/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
++/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_bxx
++/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
++/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
++/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
++// :1
++/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
++/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
++/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
++/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
++/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
++/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
++/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
++/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
++/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
++/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
++/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
++/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
++/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
++/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
++/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
++/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
++/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
++/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
++/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
++/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
++/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
++/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
++/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
++/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
++/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
++/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
++/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_p00
++/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
++/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
++/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
++/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
++/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
++/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
++/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
++/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
++/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
++// :1
++/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
++/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
++/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_b00
++/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
++/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
++/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
++/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
++/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
++/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
++/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
++/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
++/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
++/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
++/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
++/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
++/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
++/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_setup_c10_q0
++/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
++// ::mc_setup_c10_qn
++/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
++/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
++/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
++/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
++/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
++/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
++/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
++/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
++/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
++/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
++/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
++/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
++/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
++// :1
++/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
++/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
++/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++// ::mc_filter_c10_p
++/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
++/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
++/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
++/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
++/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
++/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
++/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
++/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
++/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
++/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
++/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
++/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
++/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
++/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
++/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
++/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_p_l1
++/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
++/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
++/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
++/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
++/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
++/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
++/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
++/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
++/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
++/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
++/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
++/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
++/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
++// :1
++/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
++/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
++/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
++/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
++/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_b ++/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* 
[0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 ++/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b ++/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif ++/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y ++/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add ++/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 ++/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val ++/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 ++/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 ++/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d ++/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif ++/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d ++// :1 ++/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 ++/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next ++/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 ++/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 ++/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn 
r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 ++/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 ++/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax ++/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 ++/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 ++/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a ++/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b ++/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b ++/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 ++/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 ++/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 ++/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 ++/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 ++/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 
++/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_sync10_q0 ++/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q1 ++/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q2 ++/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q3 ++/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q4 ++/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q5 ++/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q6 ++/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002088] */ 
0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q7 ++/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q8 ++/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q9 ++/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q10 ++/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q11 ++/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_q0 ++// ::mc_exit_y10_q0 ++/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_qn ++// ::mc_exit_y10_qn ++/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00002228] 
*/ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y10_q0 ++/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y10_qn ++/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif ++/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 ++/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 ++/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 ++/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 ++/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif ++/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch ++/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add 
rb_base2, ra11, r0 ++/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_10 ++/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002518] */ 
0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) ++/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add ++/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val ++/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 ++/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d ++/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c ++/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ++/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif ++/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d ++/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c ++/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d ++/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c ++/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d ++/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 ++/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif ++/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 ++// ::mc_filter_y10_pxx ++/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 ++/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00002660] */ 
0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height ++/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; 
mov.ifz rb_base2, rb_base2_next ++/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_p00 ++/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num ++/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 ++/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif ++/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a ++/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif ++/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif ++/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif ++/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base ++// :1 ++/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, 
ra_y_next ; ldtmu0 ++/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 ++/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b ++/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_bxx ++/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 ++/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00002a10] */ 
0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 ++/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 ++/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 ++/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch ++/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* 
[0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 ++/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b ++/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_b00 ++/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1 ++/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 ++/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax ++/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 ++/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 
++/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_end ++}; ++#ifdef __HIGHC__ ++#pragma Align_to(8, ff_hevc_rpi_shader) ++#endif +diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h +new file mode 100644 +index 0000000000..79651c9b6c +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.h +@@ -0,0 +1,63 @@ ++#ifndef rpi_hevc_shader_H ++#define rpi_hevc_shader_H ++ ++extern unsigned int ff_hevc_rpi_shader[]; ++ ++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) ++#define mc_start (ff_hevc_rpi_shader + 0) ++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) ++#define mc_filter_c_p (ff_hevc_rpi_shader + 134) ++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260) ++#define mc_filter_c_b (ff_hevc_rpi_shader + 386) ++#define mc_sync_q0 (ff_hevc_rpi_shader + 580) ++#define mc_sync_q1 (ff_hevc_rpi_shader + 598) ++#define mc_sync_q2 (ff_hevc_rpi_shader + 610) ++#define mc_sync_q3 (ff_hevc_rpi_shader + 622) ++#define mc_sync_q4 (ff_hevc_rpi_shader + 634) ++#define mc_sync_q5 (ff_hevc_rpi_shader + 652) ++#define mc_sync_q6 (ff_hevc_rpi_shader + 664) ++#define mc_sync_q7 (ff_hevc_rpi_shader + 676) ++#define mc_sync_q8 (ff_hevc_rpi_shader + 688) ++#define mc_sync_q9 (ff_hevc_rpi_shader + 706) ++#define mc_sync_q10 (ff_hevc_rpi_shader + 718) ++#define mc_sync_q11 (ff_hevc_rpi_shader + 730) ++#define mc_exit_c_qn (ff_hevc_rpi_shader + 742) ++#define mc_exit_y_qn (ff_hevc_rpi_shader + 742) ++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760) ++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760) ++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780) ++#define mc_setup_y_qn (ff_hevc_rpi_shader + 782) ++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014) ++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140) ++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272) ++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358) ++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432) ++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434) ++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562) ++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684) ++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806) ++#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996) ++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014) ++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026) ++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038) ++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050) ++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068) ++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080) ++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092) ++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104) ++#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122) ++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134) ++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146) ++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158) ++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158) ++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178) ++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178) ++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196) ++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198) ++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440) ++#define mc_filter_y10_p00 
(ff_hevc_rpi_shader + 2566)
++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654)
++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786)
++#define mc_end (ff_hevc_rpi_shader + 2860)
++
++#endif
+diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm
+new file mode 100644
+index 0000000000..af5b59e181
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.qasm
+@@ -0,0 +1,1850 @@
++# Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++# All rights reserved.
++#
++# Redistribution and use in source and binary forms, with or without
++# modification, are permitted provided that the following conditions are met:
++# * Redistributions of source code must retain the above copyright
++# notice, this list of conditions and the following disclaimer.
++# * Redistributions in binary form must reproduce the above copyright
++# notice, this list of conditions and the following disclaimer in the
++# documentation and/or other materials provided with the distribution.
++# * Neither the name of the copyright holder nor the
++# names of its contributors may be used to endorse or promote products
++# derived from this software without specific prior written permission.
++#
++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++#
++# Written by Peter de Rivaz, John Cox
++
++
++
++# Inter pred asm
++#
++# Logic here should be good to 14 bits without modification
++# but only 8 & 10 are currently instantiated & tested
++# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow
++# in _p00 & _b00
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems; ra regs can only be rotated through their
++# local 4. As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++
++# Number limits in P/B calculation
++#
++# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier
++# we offset our intermediates s.t. they always end up +ve before the next
++# multiply (may be -ve whilst summing but that doesn't matter).
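++#
++# A quick numeric sketch of why the bias matters (illustrative only; the
++# values are taken from the range calc below): mul24 reads both operands
++# as unsigned 24-bit, so a -ve intermediate such as -0x580 would be seen
++# as 0xfffa80 and the product would be garbage. Keeping each multiplicand
++# biased into a +ve range (e.g. via FIR_OFFSET) makes every mul24 exact,
++# and the bias is compensated for afterwards in adj_wt_off.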
++#
++# Range calc for up to 14 bits (Y-B pred):
++#
++# denom: [0, 7]
++# bmax = (1 << bits) - 1
++# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
++#
++# wt_mul: [-128, 255]
++# wt_off = off * 2 + 1: [-bmax, bmax]
++#
++# pel: [0, bmax]
++# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
++# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
++# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
++# mul_t - (V_l0 + V_l1) * 128: [-0xc28e00, 0x18396e4]
++# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
++# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
++#
++# This all looks good and is mostly bit-depth independent - and as we manage
++# to do unsigned multiplies everywhere (now) this should be good for any bit
++# depth up to 14 (we could probably do 16 - but that requires a few tweaks
++# to the shifts we don't currently have logic for)
++
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# As the test for read-next is in the main part of the Luma loop (rather than
++# the preload FIFO part) we are limited to min_luma_height - 1
++# Min_luma_height is 4 so we can only have a preload of 3
++# Beware that min_chroma_height (and width) is 2 so we can't do the same trick
++# in chroma without abandoning preload pretty much entirely (which would be bad)
++#
++# Timing tests vs preload of 4 suggest this doesn't hurt us much
++# Could have preread 4 for Chroma but when tested it didn't help
++
++.set PREREAD, 3
++
++# Offset added (effectively) at the exit of the H FIR filter
++# This is enough to force the result +ve
++# Is good if it is a power of 2 as that allows for >> without loss
++#
++# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
++# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
++# Round up to next power of 2
++
++.set FIR_OFFSET, 0x4000
++
++# Block heights - 8 & 16 are the only numbers we currently support
++
++.set C_BLK_HEIGHT_8, 16
++.set C_BLK_HEIGHT_16, 8
++.set Y_BLK_HEIGHT_8, 16
++.set Y_BLK_HEIGHT_16, 8
++
++# QPU counts - depend on block size
++# If we have a 2-byte format & block_size > 8 then we can only afford
++# 8 QPUs
++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
++
++.set N_QPU_8, 12
++.set N_QPU_16, 12
++
++# Value to add to the weight multiplier to convert it into an unsigned value
++# Should be a power of two for convenience
++
++.set LOG2_MUL_ADD, 14
++.set MUL_ADD, (1 << LOG2_MUL_ADD)
++
++# Fixed denom (max that it can be set to)
++.set DENOM, 7
++
++# register allocation
++#
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++
++# ra4-11
++# V FIFO / temp / free
++
++# -- free -- ra12
++
++# -- free -- ra13
++
++# -- free -- ra14
++
++# -- free -- ra15
++
++# uniform: width:height
++.set ra_width_height, ra16
++.set ra_width, ra16.16b
++.set ra_height, ra16.16a
++
++# y:y2 same layout as y_y2_next so we can update both together
++.set ra_y_y2, ra17
++.set ra_y2, ra17.16a
++.set ra_y, ra17.16b
++
++# uniform: L1 weight (U on left, V on right)
++# Only used in Y B
++.set ra_wt_off_mul_l1, ra18
++.set ra_wt_off_l1, ra18.16b
++.set ra_wt_mul_l1, ra18.16a
++
++# y_next:y2_next same layout as y_y2 so we can update both together
++.set ra_y_y2_next, ra19
++.set ra_y_next, ra19.16b
++.set ra_y2_next, ra19.16a
++
++# Setup: consts - subdivide a single register
++.set ra_kff800100, ra20
++.set ra_k256, ra20.16a
++.set ra_k0, ra20.8a
++.set ra_k1, ra20.8b
++.set ra_k128, ra20.8c
++.set ra_k255, ra20.8d
++
++# Loop: xshifts
++.set ra_xshift, ra21.16a
++.set ra_xshift_next, ra21.16b
++
++# Loop var: L0 weight (U on left, V on right)
++# _off_ is not used in loop as we want to modify it before use
++.set ra_wt_off_mul_l0, ra22
++.set ra_wt_mul_l0, ra22.16a
++.set ra_wt_off_l0, ra22.16b
++
++# Max pel value (for 8 bit we can get away with sat ops but not 9+)
++# * Could merge with rb_pmask. For 10 bit logically pmask needs 0xff in the
++# 2nd byte but as the source should never be > 3 there 0x3ff should do
++.set ra_blk_height_pmax, ra23
++.set ra_pmax, ra23.16a
++.set ra_blk_height, ra23.8c
++# --free -- ra23.8d
++
++# Loop: src frame base (L0)
++.set ra_base, ra24
++
++# Misc offsets
++.set ra_fir_off_val_wt_den_p7, ra25
++.set ra_wt_den_p7, ra25.8a
++# -- free -- ra25.8b
++.set ra_fir_off_val, ra25.16b
++
++# As it happens these constants are the same
++.if FIR_OFFSET == MUL_ADD
++# Weight multiplier unsigned add
++.set ra_kmul_add, ra_fir_off_val
++.else
++.error "FIR_OFFSET != MUL_ADD: Need new register & init"
++.endif
++
++# Loop: next src frame base (L0)
++.set ra_base_next, ra26
++
++# Loop: height<<23 + width<<16 + vdw_setup_0
++.set ra_dma0, ra27
++
++# Loop: destination address
++.set ra_dest, ra28
++
++# Setup: Dup of rb_ef
++# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul
++# (top bits are ignored by mul24)
++.set ra_ef, ra29
++
++# Use an even-numbered register as a link register to avoid corrupting flags
++.set ra_link, ra30
++
++# -- free -- ra31
++
++.set rb_xshift2, rb0
++.set rb_xshift2_next, rb1
++
++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
++.set rb_elem_x, rb2
++
++# El Flags
++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
++# Duped into ra_ef as sometimes that is easier to use
++.set rb_ef, rb3
++
++# rb4-11
++# Loop: V filter FIFO or V filter coeff
++
++# Loop var: offset to add before shift (round + weighting offsets)
++# Exact value varies by loop
++.set rb_wt_off, rb12
++
++# -- free -- rb13
++
++# -- free -- rb14
++
++# Loop: src frame base (L1)
++.set rb_base2, rb15
++
++# Line pitch (128 for sand128)
++.set rb_pitch, rb16
++
++# Loop count - 2 (set up TMU for next xfer)
++.set rb_i_tmu, rb17
++
++# Loop count for min(height, 16)
++# Y will reset & loop again if height > 16
++.set rb_lcount, rb18
++
++# frame_base2_next
++.set rb_base2_next, rb19
++
++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
++# offset to the slice
++.set rb_xpitch, rb20
++
++# These 3 consts each save 1 instruction in Y loop setup
++# so whilst they are worthwhile they should be the 1st to die if we need
++# another b reg
++.set rb_y_coeffs_2, rb21 # 0x050b0a00
++.set rb_y_coeffs_3, rb22 # 0x11283a40
++.set rb_y_coeffs_5, rb23 # 0x0a0b0500
++
++# Setup: 0xff (8-bit) / 0xffff (9+ bit)
++.set rb_pmask, rb24
++
++# vdw_setup_1(dst_pitch)
++.set rb_dma1_base, rb25
++
++# Setup: pic width - 1
++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
++.set rb_max_x, rb26
++
++# vdw_setup_0 (depends on QPU number)
++.set rb_dma0_base, rb27
++
++# Setup: vw_setup value to reset VPM write pointer
++.set rb_vpm_init, rb28
++
++# Loop: vdw_setup_1(dst_pitch-width) = stride
++.set rb_dma1, rb29
++
++# Setup: pic_height - 1
++.set rb_max_y, rb30
++
++# Setup: FIR H offset
++.set rb_fir_off_h, rb31
++
++
++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
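The shift-count wrap described just above is worth pinning down, since the .set values that follow depend on it. A throwaway host-side C check (illustrative only; it masks rather than shifting, since shifting by a negative count is undefined in C):

#include <assert.h>

int main(void)
{
    /* The QPU only looks at the bottom 5 bits of a shift count, so
     * -16 acts as 16, -11 as 21, -9 as 23 and -2 as 30 - exactly
     * what the i_shift* definitions below rely on. */
    assert((-16 & 31) == 16);
    assert((-11 & 31) == 21);
    assert((-9  & 31) == 23);
    assert((-2  & 31) == 30);
    return 0;
}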
++.set i_shift16, -16 ++.set i_shift21, -11 ++.set i_shift23, -9 ++.set i_shift30, -2 ++ ++# Much of the setup code is common between Y & C ++# Macros that express this - obviously these can't be overlapped ++# so are probably unsuitable for loop code ++ ++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma ++ mov r2, qpu_num ++.if v_bit_depth <= 8 ++ # 8 bit version ++ asr r1, r2, 2 ++ shl r1, r1, 6 ++ and r0, r2, 3 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit ++ add r_vpm, r0, r1 # VPM 8bit storage ++ ++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later ++ shl r0, r0, 5 ++ ++.else ++ # 16 bit version ++ # Limited to 8 QPUs if blk height > 8 ++ asr r1, r2, 1 ++.if v_blk_height <= 8 ++ shl r1, r1, 4 ++.else ++ shl r1, r1, 5 ++.endif ++ and r0, r2, 1 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR ++ add r_vpm, r0, r1 ++ ++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into ++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) ++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later ++ shl r0, r0, 6 ++.endif ++ add r_dma, r0, r1 # DMA out ++.endm ++ ++ ++.macro m_setup_q0 ++ srel -, 12 ++.endm ++ ++# Code start label ++::mc_start ++ ++################################################################################ ++# mc_setup_c ++# ++# typedef struct qpu_mc_pred_c_s_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint32_t pic_cw; // C Width (== Y width / 2) ++# uint32_t pic_ch; // C Height (== Y Height / 2) ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# int16_t y2; ++# int16_t x2; ++# uint32_t base2; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_s_t; ++ ++.macro m_setup_c, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_pmask, 0xff ++.set v_blk_height, C_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 2 ++.set v_pmask, 0xffff ++.set v_blk_height, C_BLK_HEIGHT_16 ++.endif ++ ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base ++ ++# Read image dimensions ++ sub r0, unif, 1 # pic c width ++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes ++ sub rb_max_y, unif, 1 # pic c height ++ ++# load constants ++ mov ra_kff800100, 0xff800100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++ ++# get source pitch ++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2 ++ mov rb_pitch, unif # stride1 ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 ++ ++ and r0, 1, elem_num ++ nop ; mul24 r0, r0, 5 ++.if v_bit_depth <= 8 ++ add rb_elem_x, r0, elem_num ++.else ++ add r0, r0, elem_num ++ add rb_elem_x, r0, r0 ++.endif ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base ++ ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice ++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y ++ min r0, r0, rb_max_x ++ ++# Get shift ++# Shift will always calculate as 0 for 9+ bit ++# Ideally we can optimize the shift out 
of the code in these cases but for now ++# it is tidier to leave it in ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.else ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++.endif ++ ++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to ++ ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2 ++ add ra_base, ra_base, r0 ++ ++# Compute part of VPM to use for DMA output ++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++# And again for L1, but only worrying about frame2 stuff ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# rb_base2 ends up with t1s base ++ ++ shl r0, ra0.16b, v_x_shift ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2 ++ min r0, r0, rb_max_x ++ ++# Get shift (already zero if 9+ bit so ignore) ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ; mov r3, PREREAD ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r2, ra_y2 ++ add rb_base2, rb_base2, r0 ; mov r0, ra_y ++ ++# Do preloads ++# r0 = ra_y, r2 = ra_y2, r3 = PREREAD ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, rb_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b ++ ++ mov ra_link, unif # link ++# touch registers to keep simulator happy (and fills in delay slots) ++ mov ra4, 0 ; mov rb4, 0 ++ bra -, ra_link ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 ++# >>> ra_link ++.endm ++ ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ m_setup_c 8 ++ ++################################################################################ ++# ++# mc_filter_c_p ++# ++# typedef struct qpu_mc_pred_c_p_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint16_t h; ++# uint16_t w; ++# uint32_t coeffs_x; ++# uint32_t coeffs_y; ++# uint32_t wo_u; ++# uint32_t wo_v; ++# uint32_t dst_addr_c; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_p_t; ++ ++.macro m_filter_c_p, v_tmu, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_x_mul, 4 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_tmu == 0 ++.set vrx_xshift, rb_xshift2 # b side more convienient ++.set vrx_xshift_next, ra_xshift_next ++.set vra_y_next, ra_y_next ++.set vrx_base_next, ra_base_next ++.set vra_y, ra_y ++.set vra_base, ra_base ++.set vr_txs, t0s ++.else ++.set vrx_xshift, ra_xshift # a side more convienient ++.set vrx_xshift_next, rb_xshift2_next ++.set vra_y_next, ra_y2_next ++.set vrx_base_next, rb_base2_next ++.set vra_y, ra_y2 ++.set vra_base, rb_base2 ++.set vr_txs, t1s ++.endif ++ ++# denom shift values ++.set 
i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++# per-channel shifts were calculated on the *previous* invocation ++# get base addresses and per-channel shifts for *next* invocation ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base ++ ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 ++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs ++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++ ++.if v_bit_depth <= 8 ++ shl vrx_xshift_next, r0, 3 ++ and r0, r0, -4 ++.endif ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ add vrx_base_next, r3, r0 ; mov r1, ra_height ++ ++# set up VPM write ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight ++ ++# Misc final setup... ++ ++ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr ++ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight ++ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4) ++ mov rb11, ra3.8d ; mov ra_link, unif # ; Link ++ ++# r5 = -4 (loop counter) ++# ra_wt_mul_l0 = weight L0 + 128 (now unsigned) ++# rb_wt_off = (offset * 2 + 1) << (wt_den + 5) ++# rb31 = FIR value offset ++ ++# FIFO: rb4, ra5, rb6, ra7 ++# Coeffs in ra3.8a, ra3.8b, rb10, rb11 ++ ++# We want (r0r1) ++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... ++# We fetch (after shift) ++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
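As a rough scalar model of the weighting applied at the bottom of this loop: multiply by the (now unsigned) weight, add the precomputed rb_wt_off constant, shift down by i_wt_den_p6 and clamp. The sketch below folds away the FIR/mul24 offset bookkeeping (which cancels) and uses hypothetical names; it is not bit-exact to the QPU, but it shows how the "offset * 2 + 1" encoding packs the offset and the round-to-nearest bias into one constant.

/* Hedged scalar shape of the P weighting; wo is the off*2+1 field. */
static int wt_pel(int pel, int wt, int wo, int denom, int bits)
{
    const int shift = denom + 14 - bits;          /* i_wt_den_p6 */
    const int bmax  = (1 << bits) - 1;
    /* wo << (shift - 1) == (off << shift) + (1 << (shift - 1)),
     * i.e. the offset plus a round-to-nearest term in one constant,
     * matching "rb_wt_off = (offset * 2 + 1) << (wt_den + 5)" above. */
    int v = (pel * wt + (wo << (shift - 1))) >> shift;
    return v < 0 ? 0 : v > bmax ? bmax : v;       /* clamp as ra_k0/ra_pmax do */
}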
++ ++:1 ++# retrieve texture results and pick out bytes ++# then submit two more texture requests ++ ++.if v_tmu == 0 ++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++.else ++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 ++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay] ++.endif ++ ++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++ min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++ ++ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++.if v_tmu == 0 ++ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes ++.else ++ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes ++.endif ++ ++# apply horizontal filter ++# The filter coeffs for the two halves of this are the same (unlike in the ++# Y case) so it doesn't matter which ra0 we get them from ++# Also as the two halves are locked together we don't need to separate the 1st ++# r0 mul or the last r1 mul as they are valid for all QPUs ++ ++ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ ++# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift) ++# We would like to save the r5->r4 shift but we need a delay slot ++# for both r7 & r6 which we can't find anything to put in if we have ++# already multiplied r4 & r5! 
++ brr.anyn -, r:1b ++ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post ++ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post ++ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++# >>> .anyn 1b ++ ++ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay] ++ sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ++ ++ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop ++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop ++ brr.anyn -, r:1b ++ asr r1, r1, i_wt_den_p6 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_c_p ++ m_filter_c_p 0, 8 ++ ++::mc_filter_c_p_l1 ++ m_filter_c_p 1, 8 ++ ++################################################################################ ++# ++# mc_filter_c_b ++# ++# typedef struct qpu_mc_pred_c_b_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint16_t h; ++# uint16_t w; ++# uint32_t coeffs_x1; ++# uint32_t coeffs_y1; ++# int16_t weight_u1; ++# int16_t weight_v1; ++# int16_t y2; ++# int16_t x2; ++# uint32_t base2; ++# uint32_t coeffs_x2; ++# uint32_t coeffs_y2; ++# uint32_t wo_u2; ++# uint32_t wo_v2; ++# uint32_t dst_addr_c; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_b_t; ++ ++.macro m_filter_c_b, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++.set v_x_mul, (1 << v_x_shift) ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++# per-channel shifts were calculated on the *previous* invocation ++ ++# get base addresses and per-channel shifts for *next* invocation ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base ++ ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs ++ ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.endif ++ ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, 
r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++ ++# set up VPM write ++ ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight ++ ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x ++ ++# L1 - uniform layout could possibly be optimized ++ ++ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<>> .anyn 1b ++ ++ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0 ++ sub.setf -, r5, rb_lcount ; mov r0, ra4 ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ add r1, r1, r0 ; mul24 r0, ra7, rb7 ++ ++ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1 ++ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1 ++ sub r2, r2, r0 ++ ++ shr r1, r1, 6 ++ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop ++ ++ brr.anyn -, r:1b ++ asr r1, r1, ra_wt_den_p7 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_c_b ++ m_filter_c_b 8 ++ ++################################################################################ ++# Exit code used by both Luma & Chroma so place between them to avoid I-cache ++# conflicts ++ ++.macro m_exit_drain ++.if PREREAD == 2 ++# Special case 2 as loop is wasteful ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ nop ; nop ; ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 ++.else ++ mov.setf r3, PREREAD - 1 ++:1 ++ brr.anynz -, r:1b ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ sub.setf r3, r3, 1 ++ # >>> ++ mov -, vw_wait ++.endif ++.endm ++ ++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) ++# All qpus start at the beginning and after that (group - 1) must have finished ++# before (group) can start ++# ++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain ++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - ++# lockup otherwise) ++# ++# There is some, currently ill defined, potential lockup if we have the VDM active ++# whilst doing sem stuff so we wait first. 
?? QPU stall from sem stalls VDM pipe too ?? ++# ++# The code stalled when I had many waiters on a single sem so we have a ++# "ripple" of srels to restart. Unsure why, may have been bug, but this works ++# and we currently have both the memory & sems to support it. ++.macro m_sync_q, n_qpu, n_quads ++# Do not generate code for qpu >= quads * 4 - fns should never be called ++.if n_qpu < n_quads * 4 ++ mov ra_link, unif # Can only branch to an a reg (not r0) ++ mov -, vw_wait # [ra_link delay] ++ ++.set n_sem_sync, n_qpu - (n_qpu % 4) ++.set n_sem_in, n_qpu ++.set n_sem_out, n_qpu + 1 ++ ++.if n_qpu % 4 == 0 ++ ++.set n_sem_quad_in, 12 + n_qpu / 4 ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) ++ ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ bra -, ra_link ++ sacq -, n_sem_quad_in ++ srel -, n_sem_out ++ srel -, n_sem_quad_out ++ ++.else ++ bra -, ra_link ++ srel -, n_sem_sync ++ sacq -, n_sem_in ++.if n_sem_out % 4 != 0 ++ srel -, n_sem_out ++.else ++ nop ++.endif ++.endif ++.endif ++.endm ++ ++.set v_quads8, N_QPU_8 / 4 ++ ++::mc_sync_q0 ++ m_sync_q 0, v_quads8 ++::mc_sync_q1 ++ m_sync_q 1, v_quads8 ++::mc_sync_q2 ++ m_sync_q 2, v_quads8 ++::mc_sync_q3 ++ m_sync_q 3, v_quads8 ++::mc_sync_q4 ++ m_sync_q 4, v_quads8 ++::mc_sync_q5 ++ m_sync_q 5, v_quads8 ++::mc_sync_q6 ++ m_sync_q 6, v_quads8 ++::mc_sync_q7 ++ m_sync_q 7, v_quads8 ++::mc_sync_q8 ++ m_sync_q 8, v_quads8 ++::mc_sync_q9 ++ m_sync_q 9, v_quads8 ++::mc_sync_q10 ++ m_sync_q 10, v_quads8 ++::mc_sync_q11 ++ m_sync_q 11, v_quads8 ++ ++# mc_exit() ++# Chroma & Luma the same now ++ ++.macro m_exit_qn ++ m_exit_drain ++ nop ; nop ; thrend ++ nop ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_qn ++::mc_exit_y_qn ++ m_exit_qn ++ ++ ++ ++# mc_interrupt_exit12() ++ ++.macro m_exit_q0 ++ m_exit_drain ++ sacq -, 12 ++ nop ; nop ; thrend ++ mov interrupt, 1 ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_q0 ++::mc_exit_y_q0 ++ m_exit_q0 ++ ++# LUMA CODE ++ ++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
++# For P frames we make the second x,y coordinates offset by +8 ++ ++ ++################################################################################ ++# mc_setup ++# ++# typedef struct qpu_mc_pred_y_s_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t pic_h; ++# uint16_t pic_w; ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_s_t; ++ ++.macro m_setup_y, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_pmask, 0xff ++.set v_blk_height, Y_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 1 ++.set v_pmask, 0xffff ++.set v_blk_height, Y_BLK_HEIGHT_16 ++.endif ++ ++ ++ # Need to save these because we need to know the frame dimensions before computing texture coordinates ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ ++ ++# load constants ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base ++ ++ mov ra_kff800100, 0xff800100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++ mov rb_y_coeffs_2, 0x050b0a00 ++ mov rb_y_coeffs_3, 0x11283a40 ++ mov rb_y_coeffs_5, 0x0a0b0500 ++ ++# Compute part of VPM to use ++ ++# Read image dimensions ++ mov ra3, unif # width_height ++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2 ++.if v_x_shift == 0 ++ sub rb_max_x, ra3.16b, 1 ++.else ++ sub r0, ra3.16b, 1 ++ shl rb_max_x, r0, v_x_shift ++.endif ++ sub rb_max_y, ra3.16a, 1 ++ mov r3, elem_num ; mov rb_pitch, unif # stride1 ++ ++# get destination pitch ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] ++ or rb_dma1_base, r1, rb_pitch ++ ++# Compute base address for first and second access ++ add r0, ra0.16b, r3 # Load x + elem_num ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ ++# X is byte offset - we can only load words - mask ++ ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base, ra9, r0 ++ ++ # r3 still contains elem_num ++ add r0, ra1.16b, r3 # Load x ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ ++ # r2 still contains mask ++ and r0, r0, -4 ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add rb_base2, ra11, r0 ++ ++# Do preloads ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, rb_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b ++ ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++ mov ra_link, unif # Next fn ++ ++# touch vertical context to keep simulator happy ++ mov ra8, 0 ; mov rb8, 0 # [ra_link delay] ++ bra -, ra_link ++ mov ra9, 0 ; mov rb9, 0 ++ mov ra10, 0 ; mov rb10, 0 ++ mov ra11, 0 ; mov rb11, 0 ++# >>> ra_link ++.endm ++ 
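Both the chroma and luma setup macros end with the same preload idiom: clamp the row index into the picture, scale by the line pitch, and queue the resulting address on t0s/t1s. A scalar sketch of that addressing (names illustrative), which also shows where the free edge extension comes from:

#include <stddef.h>
#include <stdint.h>

/* Model of the max/min + mul24 row addressing in the preload loops. */
static const uint8_t *src_row(const uint8_t *base, int y,
                              int max_y /* rb_max_y */,
                              int pitch /* rb_pitch */)
{
    if (y < 0)     y = 0;        /* max r1, r0, 0         */
    if (y > max_y) y = max_y;    /* min r1, r1, rb_max_y  */
    return base + (size_t)y * (size_t)pitch;
}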
++::mc_setup_y_q0
++ m_setup_q0
++::mc_setup_y_qn
++ m_setup_y 8
++
++################################################################################
++#
++# Start of per-block setup code
++# P and B blocks share the same setup code to save on Icache space
++
++# get base addresses and per-channel shifts for *next* invocation
++# per-channel shifts were calculated on the *previous* invocation
++
++# 1st 3 instructions of per_block_setup in branch delay
++#
++# typedef struct qpu_mc_pred_y_p_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t h;
++# uint16_t w;
++# uint32_t mymx21;
++# uint32_t wo1;
++# uint32_t wo2;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p_t;
++#
++
++.macro m_luma_setup, v_bit_depth
++# Hack - QASM may well have label pasting but I have no idea how...
++.if v_bit_depth == 8
++ brr ra_link, r:per_block_setup_8
++.elif v_bit_depth == 10
++ brr ra_link, r:per_block_setup_10
++.endif
++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
++ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++.endm
++
++.macro m_per_block_setup, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
++ add ra_base_next, ra_base_next, r0 # [ra1 delay]
++
++ add r0, ra1.16b, r3 # Load x2
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
++ add rb_base2_next, rb_base2_next, r0
++
++# get width,height of block (unif load above), r1 = width * pel_size
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
++ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++ add rb_lcount, r0, (7-8)
++ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val
++ add r0, r0, r1 # Combine width and height of destination area
++ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val
++ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
++
++# get filter coefficients and discard unused B frame values
++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
++ shl ra8, r0, 3 ; mov rb5, ra_k255
++
++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
++
++# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val
++# but
I can't see a way of doing that that is cheap enough to be worth it ++ ++# Picked out in a slightly random order to space out uniform loads ++ ++ # 1 ++ mov r1, 0x01040400 # [ra8 delay] ++ ror ra2.8b, r1, ra8.8d ++ ror ra0.8b, r1, ra8.8c ++ # 2 ++ ror ra2.8c, rb_y_coeffs_2, ra8.8d ++ ror ra0.8c, rb_y_coeffs_2, ra8.8c ++ # 0 ++ mov r1,0x00010100 # -ve [ra8 delay] ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset ++ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++ # 7 ++ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000 ++ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address ++ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++ # 3 ++ ror ra2.8d, rb_y_coeffs_3, ra8.8d ++ ror ra0.8d, rb_y_coeffs_3, ra8.8c ++ # 5 ++ ror ra3.8b, rb_y_coeffs_5, ra8.8d ++ ror ra1.8b, rb_y_coeffs_5, ra8.8c ++ # 6 ++ mov r1,0x04040100 ++ ror ra3.8c, r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val ++ ++ bra -, ra_link ++ # 4 ++ mov r1,0x3a281100 ++ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val ++ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 ++# >>> branch ra_link ++ ++# r5 = -8 ++# r2 = fir_off_val ++# r3 = 128 ++.endm ++ ++:per_block_setup_8 ++ m_per_block_setup 8 ++ ++ ++ ++################################################################################ ++# ++# mc_filter_y_pxx ++# ++# Setup (& therefore uniform struct) shared with _bxx ++# Struct in m_luma_setup ++# ++# We can have 2 separate P reqs here as long as they mate to generate a ++# rectangular output block (i.e. h0 = h1, w0 = 8) ++# ++# At this point we have already issued PREREAD pairs of texture requests for the current block ++ ++.macro m_filter_y_pxx, v_bit_depth ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++ m_luma_setup v_bit_depth ++ ++ shl r1, ra_wt_off_l0, i_wt_den_p5 ++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul ++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++ ++# retrieve texture results and pick out bytes ++# then submit two more texture requests ++ ++# This loop is identical to the B loop from here ---> ++:1 ++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++ ++ max r2, ra_y, 0 ; mov r1, 0 ++ min r2, r2, rb_max_y ; mov r3, ra_k1 ++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++ add t0s, ra_base, r2 ; mov rb5, rb6 ++ shr r0, r4, ra_xshift ; mov rb6, rb7 ++ ++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes ++ shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++ add t1s, rb_base2, r2 ; mov ra8, ra9 ++ ++# apply horizontal filter ++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 
13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ ++ brr.anyn -, r:1b ++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++ # >>> .anyn 1b (r5 + r5) ++ ++ # apply vertical filter and write to VPM ++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb8 ++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++ add r1, r1, r0 ; mul24 r0, ra11, rb11 ++# <--- to here ++ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height ++ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++ ++ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) ++ ++ brr.anyn -, r:1b ++ asr r1, r1, i_wt_den_p6 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> branch.anyn 1b (r5 - rb_lcount) ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_pxx ++ m_filter_y_pxx 8 ++ ++ ++################################################################################ ++ ++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) ++# ++# Setup (& therefore uniform struct) shared with _pxx ++# Struct in m_luma_setup ++# ++# l0 calc in els 0-7, L1 in 8-15 ++# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh) ++# ++# At this point we have already issued PREREAD pairs of texture requests for the current block ++ ++.macro m_filter_y_bxx, v_bit_depth ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++ m_luma_setup v_bit_depth ++ ++ shl r1, ra_wt_off_l0, i_wt_den_p6 ++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++ ++# This loop is identical to the P loop from here ---> ++:1 ++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++ ++ max r2, ra_y, 0 ; mov r1, 0 ++ min r2, r2, rb_max_y ; mov r3, ra_k1 ++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++ add t0s, ra_base, r2 ; mov rb5, rb6 ++ shr r0, r4, 
ra_xshift ; mov rb6, rb7 ++ ++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes ++ shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++ add t1s, rb_base2, r2 ; mov ra8, ra9 ++ ++# apply horizontal filter ++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ ++ brr.anyn -, r:1b ++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++ # >>> .anyn 1b (r5 + r5) ++ ++ # apply vertical filter and write to VPM ++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb8 ++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++ add r1, r1, r0 ; mul24 r0, ra11, rb11 ++# <--- to here ++ sub r1, r1, ra4 ++ sub r1, r1, r0 ; mov r2, rb_wt_off ++ ++ asr r1, r1, 6 ++ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++ add r1, r1, r2 ; mov r0, r1 << 8 ++ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height ++ ++ brr.anyn -, r:1b ++ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) ++# >>> branch.anyn 1b (r5 - rb_lcount) ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed block_height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link (ra_height - remaining height) ++ ++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 ++ 
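One step in the combine above deserves a note: "mov r0, r1 << 8" is a 16-element vector rotate, not a bit shift (the earlier comments about registers being "rotated through all 16 elems" refer to the same mechanism). L0 partial sums live in elements 0-7 and L1 sums in elements 8-15, so rotating by 8 lines the two halves up and a single add merges them; only elements 0-7 are ever stored. A scalar model, with illustrative names:

#include <stdint.h>

/* After this, v[0..7] hold L0+L1; v[8..15] are don't-care ("tosh"). */
static void combine_halves(int32_t v[16])
{
    for (int i = 0; i < 8; i++)
        v[i] += v[i + 8];
}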
++################################################################################ ++# ++# typedef struct qpu_mc_pred_y_p00_s { ++# qpu_mc_src_t next_src1; ++# uint16_t h; ++# uint16_t w; ++# uint32_t wo1; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p00_t; ++ ++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r0, elem_num # y_x ++ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ ++ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height ++ min r0, r0, rb_max_x ; mov ra_width_height, unif ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ++ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write ++ ++# get width,height of block (unif load above) ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 # Combine width and height of destination area ++ shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link ++ add ra_dma0, r0, rb_dma0_base ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, DENOM + 8 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 ++ 
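For context on how these entry points are driven: the host builds a packed stream of the qpu_mc_pred_* records declared in rpi_hevc_shader_cmd.h (added later in this patch), and chains commands by patching the previous record's trailing next_fn word via qpu_mc_link_set(). A hypothetical host-side emitter for the p00 case is sketched below; the helper, its arguments, and the pointer bump at the end are assumptions, while the struct fields and qpu_mc_link_set() come from the header.

#include <stdint.h>
#include "rpi_hevc_shader_cmd.h"   /* added later in this patch */

/* Sketch only: emit one no-filter P command at slot u and chain the
 * previous command to the QPU function code fn. */
static qpu_mc_pred_cmd_t *emit_y_p00(qpu_mc_pred_cmd_t *const u, const uint32_t fn,
                                     int16_t x, int16_t y, qpu_mc_src_addr_t src,
                                     uint16_t w, uint16_t h,
                                     uint32_t wo, qpu_mc_dst_addr_t dst)
{
    qpu_mc_pred_y_p00_t *const cmd = &u->y.p00;

    cmd->next_src1.x = x;
    cmd->next_src1.y = y;
    cmd->next_src1.base = src;
    cmd->w = w;
    cmd->h = h;
    cmd->wo1 = wo;            /* weight/offset packed as the .16a/.16b unif reads expect */
    cmd->dst_addr = dst;
    qpu_mc_link_set(u, fn);   /* next_fn is the last word of the *previous* command */
    return (qpu_mc_pred_cmd_t *)(cmd + 1);   /* next free slot (assumed packed stream) */
}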
++################################################################################ ++ ++.macro m_filter_y_b00, v_bit_depth ++# luma setup does a fair bit more than we need calculating filter coeffs ++# that we will never use but it saves I-cache to use it (also simple!) ++ m_luma_setup v_bit_depth ++ ++# Fix up vals that were expecting a filter (somewhat icky) ++ mov r2, 1 ++ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want ++ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero ++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++ ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++ ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 ++ ++################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct map) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 
++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ ++ ++::mc_end ++# Do not add code here because mc_end must appear after all other code. +diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h +new file mode 100644 +index 0000000000..89711d776b +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_cmd.h +@@ -0,0 +1,165 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#ifndef RPI_SHADER_CMD_H ++#define RPI_SHADER_CMD_H ++ ++#pragma pack(push, 4) ++ ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
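++// (Under emulation the "addresses" handed to the shader code are real
++// host pointers; on real hardware they are 32-bit VideoCore bus
++// addresses, hence the plain uint32_t variants in the #else branch.)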
++typedef const uint8_t * qpu_mc_src_addr_t; ++typedef uint8_t * qpu_mc_dst_addr_t; ++#else ++typedef uint32_t qpu_mc_src_addr_t; ++typedef uint32_t qpu_mc_dst_addr_t; ++#endif ++ ++typedef struct qpu_mc_src_s ++{ ++ int16_t y; ++ int16_t x; ++ qpu_mc_src_addr_t base; ++} qpu_mc_src_t; ++ ++ ++typedef struct qpu_mc_pred_c_p_s { ++ qpu_mc_src_t next_src; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_p_t; ++ ++typedef struct qpu_mc_pred_c_b_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x1; ++ uint32_t coeffs_y1; ++ int16_t weight_u1; ++ int16_t weight_v1; ++ qpu_mc_src_t next_src2; ++ uint32_t coeffs_x2; ++ uint32_t coeffs_y2; ++ uint32_t wo_u2; ++ uint32_t wo_v2; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_b_t; ++ ++typedef struct qpu_mc_pred_c_s_s { ++ qpu_mc_src_t next_src1; ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ qpu_mc_src_t next_src2; ++ uint32_t next_fn; ++} qpu_mc_pred_c_s_t; ++ ++typedef struct qpu_mc_pred_c_s { ++ union { ++ qpu_mc_pred_c_p_t p; ++ qpu_mc_pred_c_b_t b; ++ qpu_mc_pred_c_s_t s; ++ }; ++} qpu_mc_pred_c_t; ++ ++ ++typedef struct qpu_mc_pred_y_p_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p_t; ++ ++typedef struct qpu_mc_pred_y_p00_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t wo1; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p00_t; ++ ++typedef struct qpu_mc_pred_y_s_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t next_fn; ++} qpu_mc_pred_y_s_t; ++ ++typedef struct qpu_mc_pred_sync_s { ++ uint32_t next_fn; ++} qpu_mc_pred_sync_t; ++ ++// Only a useful structure in that it allows us to return something other than a void * ++typedef struct qpu_mc_pred_y_s { ++ union { ++ qpu_mc_pred_y_p_t p; ++ qpu_mc_pred_y_p00_t p00; ++ qpu_mc_pred_y_s_t s; ++ }; ++} qpu_mc_pred_y_t; ++ ++typedef union qpu_mc_pred_cmd_u { ++ qpu_mc_pred_y_t y; ++ qpu_mc_pred_c_t c; ++ qpu_mc_pred_sync_t sync; ++} qpu_mc_pred_cmd_t; ++ ++static void inline qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn) ++{ ++ // Link is last el of previous cmd ++ ((uint32_t *)cmd)[-1] = fn; ++} ++ ++#define QPU_MC_PRED_N_Y8 12 ++#define QPU_MC_PRED_N_C8 12 ++ ++#define QPU_MC_PRED_N_Y10 12 ++#define QPU_MC_PRED_N_C10 12 ++ ++#define QPU_MC_DENOM 7 ++ ++#pragma pack(pop) ++ ++#endif ++ +diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c +new file mode 100644 +index 0000000000..77d8366eb8 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template.c +@@ -0,0 +1,88 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "rpi_hevc_shader_cmd.h" ++#include "rpi_hevc_shader_template.h" ++ ++typedef struct shader_track_s ++{ ++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ const struct qpu_mc_src_s *last_l0; ++ const struct qpu_mc_src_s *last_l1; ++ uint32_t width; // pic_width * PW ++ uint32_t height; ++ uint32_t stride2; ++ uint32_t stride1; ++} shader_track_t; ++ ++static int wtoidx(const unsigned int w) ++{ ++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ return pel_weight[w]; ++} ++ ++static const int fctom(uint32_t x) ++{ ++ int rv; ++ // As it happens we can take the 2nd filter term & divide it by 8 ++ // (dropping fractions) to get the fractional move ++ rv = 8 - ((x >> 11) & 0xf); ++ av_assert2(rv >= 0 && rv <= 7); ++ return rv; ++} ++ ++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++{ ++ return (x << shl) >> shr; ++} ++ ++static inline int woff_p(HEVCRpiContext *const s, int32_t x) ++{ ++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int woff_b(HEVCRpiContext *const s, int32_t x) ++{ ++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int wweight(int32_t x) ++{ ++ return ext(x, 16, 16); ++} ++ ++ ++#define PW 1 ++#include "rpi_hevc_shader_template_fn.h" ++ ++#undef PW ++#define PW 2 ++#include "rpi_hevc_shader_template_fn.h" ++ +diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h +new file mode 100644 +index 0000000000..0fc5a45e9f +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template.h +@@ -0,0 +1,49 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. 
++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++ ++struct HEVCRpiContext; ++struct HEVCRpiInterPredEnv; ++ ++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_sand_dump8(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++void rpi_sand_dump16(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++#endif ++ +diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h +new file mode 100644 +index 0000000000..10c163a4b9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template_fn.h +@@ -0,0 +1,502 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++#define PATCH_STRIDE (16 * PW) ++ ++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { ++ const pixel s = *(const pixel *)src; ++ pixel * d = (pixel *)dst; ++ for (unsigned int j = 0; j < w; j += PW) { ++ *d++ = s; ++ } ++ } ++} ++ ++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride) { ++ memcpy(dst, src, w); ++ } ++} ++ ++static void FUNC(get_patch_y)(const shader_track_t * const st, ++ uint8_t * dst, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > st->width) { ++ if (x >= st->width) ++ x = st->width - PW; ++ dr = (x + w) - st->width; ++ w = st->width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > st->height) { ++ if (y >= st->height) ++ y = st->height - 1; ++ db = (y + h) - st->height; ++ h = st->height - y; ++ } ++ ++ dst += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); ++ if (dr != 0) ++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); ++ w += dl + dr; ++ dst -= dl; ++ ++ if (dt != 0) ++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); ++ if (db != 0) ++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); ++} ++ ++ ++ ++static void FUNC(get_patch_c)(const shader_track_t * const st, ++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ const int width = st->width; ++ const int height = st->height; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > width) { ++ if (x >= width) ++ x = width - PW; ++ dr = (x + w) - width; ++ w = width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > height) { ++ if (y >= height) ++ y = height - 1; ++ db = (y + h) - height; ++ h = height - y; ++ } ++ ++ dst_u += dl + dt * dst_stride; ++ dst_v += dl + dt * dst_stride; ++ 
FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++    // Edge dup
++    if (dl != 0)
++    {
++        FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
++        FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
++    }
++    if (dr != 0)
++    {
++        FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
++        FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
++    }
++    w += dl + dr;
++    dst_u -= dl;
++    dst_v -= dl;
++
++    if (dt != 0)
++    {
++        FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
++        FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
++    }
++    if (db != 0)
++    {
++        FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
++        FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
++    }
++}
++
++// x, y, w, h in pixels
++// stride1, stride2 in bytes
++void FUNC(rpi_sand_dump)(const char * const name,
++                         const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++{
++    const int mask = stride2 == 0 ? ~0 : stride1 - 1;
++
++    printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++
++    if (is_c) {
++        x *= 2;
++        w *= 2;
++    }
++
++    for (int i = y; i != y + h; ++i) {
++        for (int j = x; j != x + w; ++j) {
++            const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
++            char sep = is_c && (j & 1) == 0 ? ':' : ' ';
++#if PW == 1
++            if (j < 0 || i < 0)
++                printf("..%c", sep);
++            else
++                printf("%02x%c", *(const pixel*)p, sep);
++#else
++            if (j < 0 || i < 0)
++                printf("...%c", sep);
++            else
++                printf("%03x%c", *(const pixel*)p, sep);
++#endif
++        }
++        printf("\n");
++    }
++}
++
++
++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
++                                const HEVCRpiInterPredEnv *const ipe_y,
++                                const HEVCRpiInterPredEnv *const ipe_c)
++{
++    for (int c_idx = 0; c_idx < 2; ++c_idx)
++    {
++        const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
++        shader_track_t tracka[QPU_N_MAX] = {{NULL}};
++        unsigned int exit_n = 0;
++
++        if (ipe == NULL || !ipe->used) {
++            continue;
++        }
++
++        do {
++            for (unsigned int i = 0; i != ipe->n; ++i) {
++                const HEVCRpiInterPredQ * const q = ipe->q + i;
++                shader_track_t * const st = tracka + i;
++                const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
++
++                for (;;) {
++                    const uint32_t link = (cmd == q->qpu_mc_base) ?
q->code_setup : ((uint32_t *)cmd)[-1]; ++ ++ if (link == q->code_setup) { ++ if (c_idx == 0) { ++ // Luma ++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++ ++ st->height = c->pic_h; ++ st->width = c->pic_w * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else { ++ // Chroma ++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++ ++ st->height = c->pic_ch; ++ st->width = c->pic_cw * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ } ++ else if (link == s->qpu.y_pxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ const int w1 = FFMIN(c->w, 8); ++ const int w2 = c->w - w1; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ if (w2 > 0) { ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ } ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ if (w2 > 0) { ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ } ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_bxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_p00) { ++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, 
PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ ++ st->last_l0 = &c->next_src1; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_b00) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ av_assert0(c->w <= 16 && c->h <= 64); ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( ++ patch_y3, patch_y1, PATCH_STRIDE, ++ c->h, 0, 0, c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), 0, 0, c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx_l1) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l1 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_bxx) { ++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; ++ const int mx1 = fctom(c->coeffs_x1); ++ const int my1 = fctom(c->coeffs_y1); 
++ const int mx2 = fctom(c->coeffs_x2); ++ const int my2 = fctom(c->coeffs_y2); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; ++ uint8_t patch_v1[PATCH_STRIDE * 72]; ++ uint8_t patch_u2[PATCH_STRIDE * 72]; ++ uint8_t patch_v2[PATCH_STRIDE * 72]; ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, ++ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2), ++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, ++ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2), ++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == q->code_sync) { ++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); ++ break; ++ } ++ else if (link == q->code_exit) { ++ // We expect exit to occur without other sync ++ av_assert0(i == exit_n); ++ ++exit_n; ++ break; ++ } ++ else { ++ av_assert0(0); ++ } ++ } ++ ++ st->qpu_mc_curr = cmd; ++ } ++ } while (exit_n == 0); ++ } ++} ++ ++#undef FUNC ++#undef pixel ++ +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s +new file mode 100644 +index 0000000000..3caef20137 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform.s +@@ -0,0 +1,444 @@ ++# ****************************************************************************** ++# Argon Design Ltd. ++# (c) Copyright 2015 Argon Design Ltd. All rights reserved. ++# ++# Module : HEVC ++# Author : Peter de Rivaz ++# ****************************************************************************** ++ ++# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack) ++# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions) ++.set USE_STACK, 0 ++ ++# Lines that fail to assemble start with #: ++# The script insert_magic_opcodes.sh inserts the machine code directly for these. 
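++# (For an example of the mechanism, see the USE_STACK == 0 path of
++# hevc_trans_32x32 below: "#:version r8" and "#:add r8,pc,intermediate_results-$"
++# are each followed by their hand-inserted .half encodings.)
++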
++# HEVC VPU Transform
++#
++# Transform matrix can be thought of as
++#   output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++#   (a b c d) (1 2  2  1)
++#             (3 4 -4 -3)
++#             (5 6  6  5)
++#             (7 8 -8 -7)
++#
++#   x=(a c)(1 2) = 1a+5c 2a+6c
++#          (5 6)
++#
++#   y=(b d)(3 4) = 3b+7d 4b+8d
++#          (7 8)
++#
++#   u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++#   v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++#   Final results are (u , v[::-1])
++#
++#
++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++# Apply the even matrix first and stop before rounding
++# Then apply the odd matrix in a full manner:
++#
++# First step is to compute partial products with the first input (16 cycles)
++# 1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
++# 2a 4b 6c 8d
++# 2a -4b 6c -8d
++# 1a -3b 5c -7d
++#
++# Second step is to sum partial products into final position (8 cycles)
++# 1a+3b+5c+7d
++# 2a+4b+6c+8d
++# 2a-4b+6c-8d
++# 1a-3b+5c-7d
++#
++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++# For 8x8 we could compute two in parallel.
++#
++#
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in HX(32,0)
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32
++# num32: number of 32x32 transforms
++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
++#
++
++.equ TRANS_SHIFT, 20 - BIT_DEPTH
++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
++.equ TRANS_ASL2, 16 - TRANS_SHIFT
++
++
++hevc_trans_16x16:
++  push r6-r15, lr # TODO cut down number of used registers
++  mov r14,r3 # coeffs32
++  mov r15,r4 # num32
++  mov r3, 16*2 # Stride of transMatrix2 in bytes
++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++  # Now use r0 to describe which matrix we are working on.
++  # Allows us to prefetch the next block of coefficients for efficiency.
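++  # Worked check of the EXAMPLE block above, taking (a b c d) = (1 2 3 4):
++  #   x = (1a+5c, 2a+6c) = (16, 20)
++  #   y = (3b+7d, 4b+8d) = (34, 40)
++  #   u = x+y = (50, 60),  v = x-y = (-18, -20)
++  # The full product (a b c d) * matrix is (50, 60, -20, -18), i.e. (u, v[::-1]).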
++ mov r0,0 # This describes the location where we read our coefficients from ++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) ++ mov r7,16*16*2 # Total block size ++ mov r8,64*16 # Value used to swap from current to next VRF location ++ mov r4,64 # Constant used for rounding first pass ++ mov r5,TRANS_RND2 # Constant used for rounding second pass ++ ++ sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack ++ ++ add r11,sp,64 # Space for 32 bytes before, and rounding ++ lsr r11,5 ++ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32 ++ ++ lsr r10, r2, 16 # Number of compressed blocks stored in top short ++ extu r2,16 ++ # At start of block r0,r1 point to the current block (that has already been loaded) ++ # r0 VRF location of current block ++ # r1 address of current block ++ # r2 number of 16*16 transforms to do ++ # r3 Stride of coefficients (==32) ++ # r4 TRANS_RND1 (64) ++ # r5 TRANS_RND2 ++ # r6 temporary used inside col_trans16 ++ # r7 16*16*2 total bytes in block ++ # r8 64*16 VRF switch locations ++ # r9 temporary in unpack_coeff for index ++ # r10 number of 16x16 transforms using compression ++ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer) ++ # r12 temporary counter in unpack_coeff ++ # r13 ++ # r14 Save information for 32 bit transform (coeffs location) ++ # r15 Save information for 32 bit transform (number of transforms) ++ cmp r2,0 ++ beq done16x16s ++block_loop: ++ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests ++ cmp r10,0 ++ mov r6, r1 ++ beq not_compressed ++ sub r10, 1 ++ bl unpack16x16 ++not_compressed: ++ #mov r6,r1 # DEBUG without compress ++ vldh HX(0++,0)+r0,(r6 += r3) REP 16 ++ #eor r0,r8 ++ #add r1,r7 ++ # Prefetch the next block ++ #bl unpack16x16 ++ #vldh HX(0++,0)+r0,(r6 += r3) REP 16 ++ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG ++ #eor r0,r8 ++ #sub r1,r7 ++ ++ # Transform the current block ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? ++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position ++ ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) ++ ++ # Save results - note there has been a transposition during the processing so we save columns ++ vsth VX(0,32++)+r0, (r1 += r3) REP 16 ++ ++ # Move onto next block ++ eor r0,r8 ++ add r1,r7 ++ ++ addcmpbgt r2,-1,0,block_loop ++done16x16s: ++ ++ add sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack ++ # Now go and do any 32x32 transforms ++ b hevc_trans_32x32 ++ ++ pop r6-r15, pc ++# This returns a value in r6 that says where to load the data from. ++# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it. 
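++# Packed {value,index} coefficient format, as implied by the loop below:
++# each entry is a 32-bit word holding the 16-bit coefficient value in its top
++# half and the destination index in its low 8 bits (low 10 bits in the 32x32
++# variant); a zero word terminates the list, and after 8 entries the next 16
++# shorts of packed data are fetched.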
++unpack16x16: ++# Clear out destination ++ vmov HX(0,0)+r0,0 ++ mov r6, r11 ++ vsth HX(0,0)+r0,(r6 += r3) REP 16 ++ mov r5, r1 # Moving pointer to input coefficients ++unpack_outer_loop: ++ # Loop until we find the end ++ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous? ++ sub r6,r11,32 ++ #add r6,pc,packed_data-$ # Packed data ++ vsth HX(0,0)+r0,(r6) # Store into packed data ++ mov r12,0 ++unpack_loop: ++ ld r4,(r6) ++ add r6,r6,4 ++ lsr r9,r4,16 # r9 is destination value ++ cmp r4,0 # {value,index} ++ extu r4,8 ++ beq done_unpack ++ sth r9,(r11, r4) ++ addcmpblt r12,1,8,unpack_loop ++# # Read next 16 ++ add r5,32 ++ b unpack_outer_loop ++done_unpack: ++# # Set new load location ++ mov r6, r11 ++ #add r6,pc,unpacked_data-$ ++# # Restore constants ++ mov r4,64 ++ mov r5,TRANS_RND2 ++# pop r6-r15, pc ++ b lr ++ ++# r1,r2,r3 r7,r8 should be preserved ++# HX(0++,0)+r0 is the block to be transformed ++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients ++# Use HY(48,0) for intermediate results ++# r0 can be used, but should be returned to its original value at the end ++col_trans_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++col_trans_odd_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_odd_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_odd_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++# r1/r10 input pointer ++# r0,r4,r5,r6 free ++# r8/r9 output storage ++# ++# Store packed coefficients at r9-32 ++# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows) ++unpack32x32: ++# Clear out destination ++ vmov HX(0,0),0 ++ add r0, r9, 32*32*2 # Unpacked buffer ++ mov r4, 32 ++ vsth HX(0,0),(r0 += r4) REP 64 ++unpack_outer_loop32: ++ # Loop until we find the end ++ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous? ++ sub r6,r9,32 ++ #add r6,pc,packed_data-$ # Packed data ++ vsth HX(0,0),(r6) # Store into packed data ++ mov r8,0 ++unpack_loop32: ++ ld r4,(r6) ++ add r6,r6,4 ++ lsr r5,r4,16 # r5 is destination value ++ cmp r4,0 # {value,index} ++ extu r4,10 ++ beq done_unpack ++ sth r5,(r0, r4) ++ addcmpblt r8,1,8,unpack_loop32 ++# # Read next 16 ++ add r1,32 ++ b unpack_outer_loop32 ++done_unpack32: ++ b lr ++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done in low 16, number of packed in high 16 ++# ++# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first! 
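++# e.g. num = (2 << 16) | 5 requests 5 transforms of which the last 2 are
++# packed: the 3 unpacked blocks are processed first, and unpacking starts
++# once the count remaining (r2) is no more than the packed count (r15).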
++hevc_trans_32x32: ++ mov r1,r14 # coeffs ++ mov r2,r15 # num ++ lsr r15,r15,16 # Number that are packed ++ extu r2,16 # Total number ++ ++ # Fetch odd transform matrix ++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ ++.if USE_STACK ++ # Stack base allocation ++ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking ++ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it ++ add r8,sp,63 ++ lsr r8,5 ++ lsl r8,5 ++.else ++#:version r8 ++ .half 0x00e8 #AUTOINSERTED ++ btst r8,16 ++#:add r8,pc,intermediate_results-$ ++ .half 0xbfe8 ++ .half intermediate_results-($-2) ++ beq on_vpu1 ++ add r8,r8,32*32*2*2+16*2 # Move to secondary storage ++on_vpu1: ++.endif ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++ ++ cmp r2,0 ++ beq done32x32s ++block_loop32: ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed) ++ cmp r2,r15 ++ bgt not_compressed_32 ++ bl unpack32x32 ++ add r1,r9,32*32*2 # Uncompressed into temporary storage ++ mov r8,r9 # Transform into here ++not_compressed_32: ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++done32x32s: ++ ++.if USE_STACK ++ add sp,sp,32*32*4+64# Restore stack ++.endif ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 
,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++.if USE_STACK == 0 ++ .balign 32 ++ ++# .space directives generate 0's in the bin so avoid unnecessary padding by ++# just setting to appropriate value ++.equ intermediate_results, $+16*2 ++ ++# Layout goes: ++# ++#packed_buffer: ++# .space 16*2 ++#intermediate_results: ++# .space 32*32*2 ++#unpacked_buffer: ++# .space 32*32*2 ++# ++#packed_buffer2: ++# .space 16*2 ++#intermediate_results2: ++# .space 32*32*2 ++#unpacked_buffer2: ++# .space 32*32*2 ++.endif ++ ++ +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..1c364492d0 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h +@@ -0,0 +1,94 @@ ++static const unsigned char rpi_hevc_transform10 [] = { ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 ++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 ++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 ++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 ++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 ++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 ++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 ++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 ++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 ++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 ++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 ++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 ++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 ++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 ++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 ++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 ++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 ++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 ++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 ++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 ++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 ++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 ++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 ++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 ++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 ++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 ++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 ++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 ++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 ++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 ++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 ++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 ++0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 ++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 ++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 ++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 ++0x00, 
0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 ++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 ++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 ++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 ++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 ++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 ++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 ++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 ++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 ++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 ++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 ++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 ++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 ++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 ++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 ++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 ++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 ++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 ++0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 ++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 ++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 ++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 ++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 ++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 ++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 ++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 ++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 ++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 ++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 ++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 ++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 ++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 ++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 ++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 ++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 ++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 ++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 ++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 ++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 ++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 ++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 ++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 ++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..1128a2c054 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,94 @@ ++static const unsigned char rpi_hevc_transform8 [] = { ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 ++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 ++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 ++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 ++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 
0x00, // 0020 ++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 ++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 ++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 ++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 ++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 ++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 ++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 ++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 ++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 ++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 ++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 ++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 ++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 ++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 ++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 ++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 ++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 ++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 ++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 ++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 ++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 ++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 ++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 ++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 ++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 ++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 ++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 ++0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 ++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 ++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 ++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 ++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 ++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 ++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 ++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 ++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 ++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 ++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 ++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 ++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 ++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 ++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 ++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 ++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 ++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 ++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 ++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 ++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 ++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 ++0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 ++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 ++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 ++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 ++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 ++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 
++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 ++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 ++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 ++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 ++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 ++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 ++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 ++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 ++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 ++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 ++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 ++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 ++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 ++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 ++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 ++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 ++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 ++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 ++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++}; +diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c +new file mode 100644 +index 0000000000..5e28b3978f +--- /dev/null ++++ b/libavcodec/rpi_hevcdec.c +@@ -0,0 +1,6132 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Mickael Raulet ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Wassim Hamidouche ++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/common.h" ++#include "libavutil/display.h" ++#include "libavutil/internal.h" ++#include "libavutil/mastering_display_metadata.h" ++#include "libavutil/md5.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/stereo3d.h" ++ ++#include "decode.h" ++#include "bswapdsp.h" ++#include "bytestream.h" ++#include "golomb.h" ++#include "hevc.h" ++#include "rpi_hevc_data.h" ++#include "rpi_hevc_parse.h" ++#include "rpi_hevcdec.h" ++#include "rpi_hevc_cabac_fns.h" ++#include "profiles.h" ++#include "hwconfig.h" ++ ++#include "rpi_zc_frames.h" ++#include "rpi_qpu.h" ++#include "rpi_hevc_shader.h" ++#include "rpi_hevc_shader_cmd.h" ++#include "rpi_hevc_shader_template.h" ++#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#include "pthread.h" ++#include ++ ++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards ++ ++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) ++ ++#ifndef av_mod_uintp2 ++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) ++{ ++ return a & ((1 << p) - 1); ++} ++# define av_mod_uintp2 av_mod_uintp2_c ++#endif ++ ++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); ++ ++#define MC_DUMMY_X (-32) ++#define MC_DUMMY_Y (-32) ++ ++// UV & Y both have min 4x4 pred (no 2x2 chroma) ++// Allow for even spread +1 for setup, +1 for rounding ++// As we have load sharing this can (in theory) be exceeded so we have to ++// check after each CTU, but it is a good base size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) ++#define QPU_C_CMD_PER_CTU_MAX (8 * 8) ++ ++#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) ++ ++#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP) ++#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) ++ ++#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) ++#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) ++ ++// Total cmds to allocate - allow for slack & setup ++#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX) ++#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX) ++ ++#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2)) ++#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2)) ++ ++// The QPU code for UV blocks only works up to a block width of 8 ++#define RPI_CHROMA_BLOCK_WIDTH 8 ++ ++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) ++ ++ ++// Actual filter goes -ve, +ve, +ve, -ve using these values ++static const uint32_t rpi_filter_coefs[8] = { ++ ENCODE_COEFFS( 0, 64, 0, 0), ++ ENCODE_COEFFS( 2, 58, 10, 2), ++ ENCODE_COEFFS( 4, 54, 16, 2), ++ ENCODE_COEFFS( 6, 46, 28, 4), ++ ENCODE_COEFFS( 4, 36, 36, 4), ++ ENCODE_COEFFS( 4, 28, 46, 6), ++ ENCODE_COEFFS( 2, 16, 54, 4), ++ ENCODE_COEFFS( 2, 10, 58, 2) ++}; ++ ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, 
mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn ++}; ++ ++static const int * const inter_pred_exit_c10_qpu[12] = { ++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn ++}; ++ ++static const int * const inter_pred_exit_y_qpu[12] = { ++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn ++}; ++ ++static const int * const inter_pred_exit_y10_qpu[12] = { ++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn ++}; ++ ++typedef struct ipe_chan_info_s ++{ ++ const uint8_t bit_depth; ++ const uint8_t n; ++ const int * const * setup_fns; ++ const int * const * sync_fns; ++ const int * const * exit_fns; ++} ipe_chan_info_t; ++ ++typedef struct ipe_init_info_s ++{ ++ ipe_chan_info_t luma; ++ ipe_chan_info_t chroma; ++} ipe_init_info_t; ++ ++static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a) ++{ ++ switch (ln) ++ { ++ default: // normally 0 ++ *b = a; ++ break; ++ case 1: ++ a |= a << 8; ++ *(uint16_t *)b = a; ++ b += stride; ++ *(uint16_t *)b = a; ++ break; ++ case 2: ++ a |= a << 8; ++ a |= a << 16; ++ *(uint32_t *)b = a; ++ b += stride; ++ *(uint32_t *)b = a; ++ b += stride; ++ *(uint32_t *)b = a; ++ b += stride; ++ *(uint32_t *)b = a; ++ break; ++ case 3: ++ { ++ unsigned int i; ++ uint64_t d; ++ a |= a << 8; ++ a |= a << 16; ++ d = ((uint64_t)a << 32) | a; ++ for (i = 0; i != 8; ++i, b += stride) ++ *(uint64_t *)b = d; ++ break; ++ } ++ case 4: ++ { ++ unsigned int i; ++ uint64_t d; ++ a |= a << 8; ++ a |= a << 16; ++ d = ((uint64_t)a << 32) | a; ++ for (i = 
0; i != 16; ++i, b += stride)
++            {
++                *(uint64_t *)b = d;
++                *(uint64_t *)(b + 8) = d;
++            }
++            break;
++        }
++    }
++}
++
++// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3
++// (4 not required)
++static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
++{
++    switch (ln)
++    {
++        default: // 0 or -1
++            *b_u = a;
++            *b_l = a;
++            break;
++        case 1:
++            a |= a << 8;
++            *(uint16_t *)b_u = a;
++            *(uint16_t *)b_l = a;
++            break;
++        case 2:
++            a |= a << 8;
++            a |= a << 16;
++            *(uint32_t *)b_u = a;
++            *(uint32_t *)b_l = a;
++            break;
++        case 3:
++            a |= a << 8;
++            a |= a << 16;
++            *(uint32_t *)b_u = a;
++            *(uint32_t *)(b_u + 4) = a;
++            *(uint32_t *)b_l = a;
++            *(uint32_t *)(b_l + 4) = a;
++            break;
++        case 4:
++            a |= a << 8;
++            a |= a << 16;
++            *(uint32_t *)b_u = a;
++            *(uint32_t *)(b_u + 4) = a;
++            *(uint32_t *)(b_u + 8) = a;
++            *(uint32_t *)(b_u + 12) = a;
++            *(uint32_t *)b_l = a;
++            *(uint32_t *)(b_l + 4) = a;
++            *(uint32_t *)(b_l + 8) = a;
++            *(uint32_t *)(b_l + 12) = a;
++            break;
++    }
++}
++
++static void zap_cabac_stash(uint8_t * b, const int ln)
++{
++    switch (ln)
++    {
++        default: // 0
++            *b = 0;
++            break;
++        case 1:
++            *(uint16_t *)b = 0;
++            break;
++        case 2:
++            *(uint32_t *)b = 0;
++            break;
++        case 3:
++            *(uint32_t *)b = 0;
++            *(uint32_t *)(b + 4) = 0;
++            break;
++    }
++}
++
++
++
++// Set a small square block of bits in a bitmap
++// Bits must be aligned on their size boundary (which will be true of all split CBs)
++static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
++{
++    unsigned int n;
++    const unsigned int sh = (x & 7);
++
++    f += (x >> 3);
++
++    av_assert2(ln <= 3);
++    av_assert2((x & ((1 << ln) - 1)) == 0);
++
++    switch (ln)
++    {
++        default: // 1
++            f[0] |= 1 << sh;
++            break;
++        case 1: // 3 * 2
++            n = 3 << sh;
++            f[0] |= n;
++            f[stride] |= n;
++            break;
++        case 2: // 0xf * 4
++            n = 0xf << sh;
++            f[0] |= n;
++            f[stride] |= n;
++            f[stride * 2] |= n;
++            f[stride * 3] |= n;
++            break;
++        case 3: // 0xff * 8
++            for (n = 0; n != 8; ++n, f += stride)
++                *f = 0xff;
++            break;
++    }
++}
++
++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
++    { // 8
++        .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
++        .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
++    },
++    { // 9
++        .luma = {0},
++        .chroma = {0}
++    },
++    { // 10
++        .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
++        .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
++    }
++
++};
++
++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
++{
++    const unsigned int n = ici->n;
++    const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3;  // Round down to word
++
++    ipe->n = n;
++    ipe->max_fill = q1_size - ipe->min_gap;
++    for(unsigned int i = 0; i < n; i++) {
++        HEVCRpiInterPredQ * const q = ipe->q + i;
++        q->qpu_mc_curr = q->qpu_mc_base =
++            (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
++        q->code_setup = qpu_fn(ici->setup_fns[i]);
++        q->code_sync = qpu_fn(ici->sync_fns[i]);
++        q->code_exit = qpu_fn(ici->exit_fns[i]);
++    }
++}
++
++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
++{
++    av_assert0(bit_depth >= 8 && bit_depth <= 16);
++
++    rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
++}
++
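++// Illustrative reference (not part of the original source): set_bytes()
++// above is an unrolled, width-optimised version of a plain
++// (1 << ln) x (1 << ln) byte fill, i.e. equivalent to
++//
++//   static void set_bytes_ref(uint8_t * b, unsigned int stride, int ln, unsigned int a)
++//   {
++//       const unsigned int n = 1u << ln;  // ln = log2 of the square's side, 0..4
++//       for (unsigned int i = 0; i != n; ++i, b += stride)
++//           memset(b, a & 0xff, n);
++//   }
++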
++static inline unsigned int utmod(const unsigned int x, const unsigned int n)
++{
++    return x >= n ? x - n : x;
++}
++
++// returns pq->job_n++
++static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
++{
++    unsigned int const x2 = pq->job_n;
++    pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);
++    return x2;
++}
++
++static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
++{
++    pq->terminate = 0;
++    pq->job_n = 0;
++    pq->context = s;
++    pq->worker = worker;
++    pq->psem_out = psem_out;
++    pq->pass_n = n;
++    pq->started = 0;
++    sem_init(&pq->sem_in, 0, 0);
++}
++
++static void pass_queue_kill(HEVCRpiPassQueue * const pq)
++{
++    sem_destroy(&pq->sem_in);
++}
++
++static inline void rpi_sem_wait(sem_t * const sem)
++{
++    while (sem_wait(sem) != 0) {
++        av_assert0(errno == EINTR);
++    }
++}
++
++static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
++{
++    sem_post(&pq->sem_in);
++}
++
++static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    // Do the various passes - common with the worker code
++    for (unsigned int i = 0; i != RPI_PASSES; ++i) {
++        s->passq[i].worker(s, jb);
++    }
++}
++
++
++#if 0
++static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
++{
++    int x;
++    sem_getvalue((sem_t *)&jbc->sem_out, &x);
++    printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
++}
++#endif
++
++
++static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJob * jb;
++    HEVCRpiJobGlobal * const jbg = jbc->jbg;
++
++    pthread_mutex_lock(&jbg->lock);
++    // Check local 1st
++    if ((jb = jbc->jb1) != NULL)
++    {
++        // Only 1 - very easy :-)
++        jbc->jb1 = NULL;
++    }
++    else
++    {
++        // Now look for global free chain
++        if ((jb = jbg->free1) != NULL)
++        {
++            // Found one - unlink it
++            jbg->free1 = jb->next;
++            jb->next = NULL;
++        }
++        else
++        {
++            // Out of places to look - wait for one to become free - add to Qs
++
++            // Global
++            // If "good" lc then add after the last "good" el in the chain
++            // otherwise add to the tail
++            if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
++            {
++                // Add to end as we had to wait last time or wait Q empty
++                if ((lc->jw_prev = jbg->wait_tail) == NULL)
++                    jbg->wait_head = lc;
++                else
++                    lc->jw_prev->jw_next = lc;
++                lc->jw_next = NULL;
++                jbg->wait_tail = lc;
++            }
++            else
++            {
++                // This is a "good" lc that we need to poke into the middle
++                // of the Q
++                // We know that the Q isn't empty and there is at least one
++                // !last_progress_good el in it from the previous test
++
++                HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
++
++                if (p == NULL)
++                {
++                    // No current good els - add to head
++                    lc->jw_next = jbg->wait_head;
++                    jbg->wait_head = lc;
++                }
++                else
++                {
++                    lc->jw_next = p->jw_next;
++                    p->jw_next = lc;
++                }
++
++                lc->jw_next->jw_prev = lc;
++                lc->jw_prev = p;
++            }
++
++            // If "good" then we are now the last good waiting el
++            if (lc->last_progress_good)
++                jbg->wait_good = lc;
++
++            // Local
++            if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
++                jbc->lcw_head = lc;
++            else
++                lc->ljw_prev->ljw_next = lc;
++            lc->ljw_next = NULL;
++            jbc->lcw_tail = lc;
++        }
++    }
++
++    pthread_mutex_unlock(&jbg->lock);
++
++    if (jb == NULL) // Need to wait
++    {
++        rpi_sem_wait(&lc->jw_sem);
++        jb = lc->jw_job; // Set by free code
++    }
++
++    return jb;
++}
++
++
++static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
++{
++    HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock
++    HEVCRpiJobCtl * jbc = jb->jbc_local;
++    HEVCRpiLocalContext * lc = NULL;
++
++    pthread_mutex_lock(&jbg->lock);
++
++    if (jbc != NULL)
++    {
++        av_assert1(jbc->jb1 == NULL);
++
++        // Release to Local if nothing waiting there
++        if ((lc = jbc->lcw_head) == NULL)
++            jbc->jb1 = jb;
++    }
++    else
++    {
++        // Release to global if nothing waiting there
++        if ((lc = jbg->wait_head) == NULL)
++        {
++            jb->next = jbg->free1;
++            jbg->free1 = jb;
++        }
++        else
++        {
++            // ? seems somehow mildly ugly...
++            jbc = lc->context->jbc;
++        }
++    }
++
++    if (lc != NULL)
++    {
++        // Something was waiting
++
++        // Unlink
++        // Global
++        if (lc->jw_next == NULL)
++            jbg->wait_tail = lc->jw_prev;
++        else
++            lc->jw_next->jw_prev = lc->jw_prev;
++
++        if (lc->jw_prev == NULL)
++            jbg->wait_head = lc->jw_next;
++        else
++            lc->jw_prev->jw_next = lc->jw_next;
++
++        // Local
++        if (lc->ljw_next == NULL)
++            jbc->lcw_tail = lc->ljw_prev;
++        else
++            lc->ljw_next->ljw_prev = lc->ljw_prev;
++
++        if (lc->ljw_prev == NULL)
++            jbc->lcw_head = lc->ljw_next;
++        else
++            lc->ljw_prev->ljw_next = lc->ljw_next;
++
++        // Update good if required
++        if (jbg->wait_good == lc)
++            jbg->wait_good = lc->jw_prev;
++
++        // Prod
++        lc->jw_job = jb;
++        sem_post(&lc->jw_sem);
++    }
++
++    pthread_mutex_unlock(&jbg->lock);
++}
++
++static void job_lc_kill(HEVCRpiLocalContext * const lc)
++{
++    sem_destroy(&lc->jw_sem);
++}
++
++static void job_lc_init(HEVCRpiLocalContext * const lc)
++{
++    lc->jw_next = NULL;
++    lc->jw_prev = NULL;
++    lc->ljw_next = NULL;
++    lc->ljw_prev = NULL;
++    lc->jw_job = NULL;
++    sem_init(&lc->jw_sem, 0, 0);
++}
++
++// Returns:
++// 0 if we have waited for MV or expect to wait for recon
++// 1 if we haven't waited for MV & do not need to wait for recon
++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
++{
++    if (jb->waited) // reset by rpi_begin
++        return 0;
++    for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
++    {
++        if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
++            ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
++            return 0;
++    }
++    return 1;
++}
++
++// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJobCtl *const jbc = s->jbc;
++    HEVCRpiJob * const jb = lc->jb0;
++
++    av_assert1(jb != NULL);
++
++    if (jb->ctu_ts_last < 0) {
++        return;
++    }
++
++    lc->last_progress_good = progress_good(s, jb);
++    jb->waited = !lc->last_progress_good;
++    lc->jb0 = NULL;
++
++    if (s->offload_recon)
++    {
++        pthread_mutex_lock(&jbc->in_lock);
++        jbc->offloadq[jbc->offload_in] = jb;
++        jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
++        pthread_mutex_unlock(&jbc->in_lock);
++
++        pass_queue_submit_job(s->passq + 0); // Consumes job eventually
++    }
++    else
++    {
++        pass_queue_do_all(s, jb); // Consumes job before return
++    }
++}
++
++
++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
++// available to receive the next job.
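++//
++// (Added sketch, not in the original patch: the expected use from a
++// slice/tile decoding loop, using only functions defined in this file:
++//
++//     worker_pass0_ready(s, lc);  // wait for a free job, set up lc->jb0
++//     // ... decode CTUs into lc->jb0 ...
++//     worker_submit_job(s, lc);   // hand the full job to the pass queues
++//
++// with worker_wait(s, lc) called once at end of frame to drain the lot.)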
++// ++// Now safe against multiple callers - needed for tiles ++// "normal" and WPP will only call here one at a time ++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ ++ // It is legit for us to already have a job allocated - do nothing in this case ++ if (lc->jb0 != NULL) ++ return; ++ ++ if (s->offload_recon) ++ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much ++ ++ lc->jb0 = job_alloc(jbc, lc); ++ ++ rpi_begin(s, lc->jb0, lc->ts); ++} ++ ++// Free up a job without submission ++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ if (jb == NULL) { ++ return; ++ } ++ ++ lc->jb0 = NULL; ++ ++ job_free(jbc, jb); ++ ++ // If offload then poke sem_out too ++ if (s->offload_recon) { ++ sem_post(&jbc->sem_out); ++ } ++} ++ ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++// Slightly icky as there is no clean way to wait for a sem to count up ++// Not reentrant - call on main thread only ++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ int i = 0; ++ ++ // We shouldn't reach here with an unsubmitted job ++ av_assert1(lc->jb0 == NULL); ++ ++ // If no offload then there can't be anything to wait for ++ if (!s->offload_recon) { ++ return; ++ } ++ ++ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) ++ { ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ rpi_sem_wait(&jbc->sem_out); ++ } ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ sem_post(&jbc->sem_out); ++ } ++ } ++} ++ ++static void * pass_worker(void *arg) ++{ ++ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; ++ HEVCRpiContext *const s = pq->context; ++ ++ for (;;) ++ { ++ rpi_sem_wait(&pq->sem_in); ++ ++ if (pq->terminate) ++ break; ++ ++ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]); ++ // * should really set jb->passes_done here ++ ++ sem_post(pq->psem_out); ++ } ++ return NULL; ++} ++ ++static void pass_queues_start_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); ++ pqs[i].started = 1; ++ } ++} ++ ++static void pass_queues_term_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ pqs[i].terminate = 1; ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ if (pqs[i].started) ++ sem_post(&pqs[i].sem_in); ++ } ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ if (pqs[i].started) { ++ pthread_join(pqs[i].thread, NULL); ++ pqs[i].started = 0; ++ } ++ } ++} ++ ++static void pass_queues_kill_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ pass_queue_kill(pqs + i); ++} ++ ++ ++static void worker_pic_free_one(HEVCRpiJob * const jb) ++{ ++ // Free coeff stuff - allocation not the same for all buffers ++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if (cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++} ++ ++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) ++{ ++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ ++ if 
(gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
++        goto fail;
++    cf->s[2].buf = (int16_t *)cf->gptr.arm;
++    cf->s[3].buf = cf->s[2].buf + coeff_count;
++
++    // Must be 64 byte aligned for our zero zapping code so over-allocate &
++    // round
++    if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
++        goto fail;
++    cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
++    return 0;
++
++fail:
++    av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
++    worker_pic_free_one(jb);
++    return -1;
++}
++
++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
++{
++    unsigned int i;
++    for (i = 0; i != 4; ++i) {
++        cf->s[i].n = 0;
++#if RPI_COMPRESS_COEFFS
++        cf->s[i].packed = 1;
++        cf->s[i].packed_n = 0;
++#endif
++    }
++}
++
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
++{
++    HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
++    int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
++    cfe->n += n;
++    return coeffs;
++}
++
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++                                     const HEVCRpiFrame * const ref, const int val, const int field)
++{
++    if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
++        HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
++        HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
++        sem_t * sem = NULL;
++
++        av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++        if (((volatile int *)ref->tf.progress->data)[field] < val) {
++            HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
++
++            av_assert1(pwait->req == -1 && pwait->next == NULL);
++            jb->waited = 1; // Remember that we had to wait for later scheduling
++
++            pwait->req = val;
++            pwait->next = NULL;
++            if (pstate->first == NULL)
++                pstate->first = pwait;
++            else
++                pstate->last->next = pwait;
++            pstate->last = pwait;
++            sem = &pwait->sem;
++        }
++        pthread_mutex_unlock(&pstate->lock);
++
++        if (sem != NULL) {
++            rpi_sem_wait(sem);
++        }
++    }
++}
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
++{
++    HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
++
++    ((int *)s->ref->tf.progress->data)[field] = val;
++
++    av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++    {
++        HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
++        HEVCRpiFrameProgressWait * pwait;
++
++        while ((pwait = *ppwait) != NULL) {
++            if (pwait->req > val)
++            {
++                ppwait = &pwait->next;
++                pstate->last = pwait;
++            }
++            else
++            {
++                *ppwait = pwait->next;
++                pwait->req = -1;
++                pwait->next = NULL;
++                sem_post(&pwait->sem);
++            }
++        }
++    }
++    pthread_mutex_unlock(&pstate->lock);
++}
++
++static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
++{
++    pstate->first = NULL;
++    pstate->last = NULL;
++    pthread_mutex_init(&pstate->lock, NULL);
++}
++
++static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++    pwait->req = -1;
++    pwait->next = NULL;
++    sem_init(&pwait->sem, 0, 0);
++}
++
++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
++{
++    av_assert1(pstate->first == NULL);
++    pthread_mutex_destroy(&pstate->lock);
++}
++
++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++    sem_destroy(&pwait->sem);
++}
++
++
++/**
++ * NOTE: Each function hls_foo corresponds to the function foo in the
++ * specification
(HLS stands for High Level Syntax). ++ */ ++ ++/** ++ * Section 5.7 ++ */ ++ ++// Realloc the entry point arrays ++static int alloc_entry_points(RpiSliceHeader * const sh, const int n) ++{ ++ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0) ++ { ++ // Round up alloc to multiple of 32 ++ int a = (n + 31) & ~31; ++ ++ // We don't care about the previous contents so probably fastest to simply discard ++ av_freep(&sh->entry_point_offset); ++ av_freep(&sh->offset); ++ av_freep(&sh->size); ++ ++ if (a != 0) ++ { ++ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned)); ++ sh->offset = av_malloc_array(a, sizeof(int)); ++ sh->size = av_malloc_array(a, sizeof(int)); ++ ++ if (!sh->entry_point_offset || !sh->offset || !sh->size) { ++ sh->num_entry_point_offsets = 0; ++ sh->offsets_allocated = 0; ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ sh->offsets_allocated = a; ++ } ++ ++ return 0; ++} ++ ++/* free everything allocated by pic_arrays_init() */ ++static void pic_arrays_free(HEVCRpiContext *s) ++{ ++ av_freep(&s->sao); ++ av_freep(&s->deblock); ++ ++ av_freep(&s->cabac_stash_up); ++ s->cabac_stash_left = NULL; // freed with _up ++ ++ av_freep(&s->mvf_up); ++ av_freep(&s->mvf_left); ++ ++ av_freep(&s->is_pcm); ++ av_freep(&s->is_intra_store); ++ s->is_intra = NULL; ++ av_freep(&s->rpl_tab); ++ s->rpl_tab_size = 0; ++ ++ av_freep(&s->qp_y_tab); ++ av_freep(&s->tab_slice_address); ++ av_freep(&s->filter_slice_edges); ++ ++ av_freep(&s->bs_horizontal); ++ s->bs_vertical = NULL; // freed with H ++ av_freep(&s->bsf_stash_left); ++ av_freep(&s->bsf_stash_up); ++ ++ av_freep(&s->rpl_up); ++ av_freep(&s->rpl_left); ++ ++ alloc_entry_points(&s->sh, 0); ++ ++ av_buffer_pool_uninit(&s->col_mvf_pool); ++} ++ ++/* allocate arrays that depend on frame dimensions */ ++static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps) ++{ ++ const unsigned int log2_min_cb_size = sps->log2_min_cb_size; ++ const unsigned int width = sps->width; ++ const unsigned int height = sps->height; ++ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) * ++ ((height >> log2_min_cb_size) + 1); ++ const unsigned int ctb_count = sps->ctb_size; ++ ++ { ++ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); ++ unsigned int h = ((height + 15) & ~15); ++ ++ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size ++ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols ++ } ++ ++ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly ++ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); ++ if (!s->sao || !s->deblock) ++ goto fail; ++ ++ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3)); ++ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3); ++ if (s->cabac_stash_up == NULL) ++ goto fail; ++ ++ // Round width up to max ctb size ++ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); ++ // * Only needed if we have H tiles ++ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); ++ ++ // We can overread by 1 line & one byte in deblock so alloc & zero ++ // We don't need to zero the extra @ start of frame as it will never be ++ // written ++ s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); ++ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); ++ if (s->is_pcm == NULL || 
s->is_intra_store == NULL) ++ goto fail; ++ ++ s->filter_slice_edges = av_mallocz(ctb_count); ++ s->tab_slice_address = av_malloc_array(ctb_count, ++ sizeof(*s->tab_slice_address)); ++ s->qp_y_tab = av_malloc_array(pic_size_in_cb, ++ sizeof(*s->qp_y_tab)); ++ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) ++ goto fail; ++ ++ s->bs_horizontal = av_mallocz(s->bs_size * 2); ++ s->bs_vertical = s->bs_horizontal + s->bs_size; ++ if (s->bs_horizontal == NULL) ++ goto fail; ++ ++ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up)); ++ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left)); ++ if (s->rpl_left == NULL || s->rpl_up == NULL) ++ goto fail; ++ ++ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || ++ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL) ++ goto fail; ++ ++ s->col_mvf_stride = (width + 15) >> 4; ++ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField), ++ av_buffer_allocz); ++ if (s->col_mvf_pool == NULL) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ pic_arrays_free(s); ++ return AVERROR(ENOMEM); ++} ++ ++static void default_pred_weight_table(HEVCRpiContext * const s) ++{ ++ unsigned int i; ++ const unsigned int wt = 1 << QPU_MC_DENOM; ++ s->sh.luma_log2_weight_denom = 0; ++ s->sh.chroma_log2_weight_denom = 0; ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { ++ s->sh.luma_weight_l0[i] = wt; ++ s->sh.luma_offset_l0[i] = 0; ++ s->sh.chroma_weight_l0[i][0] = wt; ++ s->sh.chroma_weight_l0[i][1] = wt; ++ s->sh.chroma_offset_l0[i][0] = 0; ++ s->sh.chroma_offset_l0[i][1] = 0; ++ } ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { ++ s->sh.luma_weight_l1[i] = wt; ++ s->sh.luma_offset_l1[i] = 0; ++ s->sh.chroma_weight_l1[i][0] = wt; ++ s->sh.chroma_weight_l1[i][1] = wt; ++ s->sh.chroma_offset_l1[i][0] = 0; ++ s->sh.chroma_offset_l1[i][1] = 0; ++ } ++} ++ ++static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb, ++ const unsigned int refs, ++ int16_t * luma_weight, int16_t * luma_offset, ++ int16_t * chroma_weight, int16_t * chroma_offset) ++{ ++ unsigned int luma_flags; ++ unsigned int chroma_flags; ++ unsigned int i; ++ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8); ++ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range; ++ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM; ++ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM; ++ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom); ++ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom); ++ ++ if (refs == 0) ++ return 0; ++ ++ luma_flags = get_bits(gb, refs); ++ chroma_flags = ctx_cfmt(s) == 0 ? 
0 : get_bits(gb, refs); ++ i = 1 << (refs - 1); ++ ++ do ++ { ++ if ((luma_flags & i) != 0) ++ { ++ const int delta_weight = get_se_golomb(gb); ++ const int offset = get_se_golomb(gb); ++ if (delta_weight < -128 || delta_weight > 127 || ++ offset < -wp_offset_half_range || offset >= wp_offset_half_range) ++ { ++ return AVERROR_INVALIDDATA; ++ } ++ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift); ++ *luma_offset++ = offset << wp_offset_bd_shift; ++ } ++ else ++ { ++ *luma_weight++ = luma_weight_base; ++ *luma_offset++ = 0; ++ } ++ ++ if ((chroma_flags & i) != 0) ++ { ++ unsigned int j; ++ for (j = 0; j != 2; ++j) ++ { ++ const int delta_weight = get_se_golomb(gb); ++ const int delta_offset = get_se_golomb(gb); ++ ++ if (delta_weight < -128 || delta_weight > 127 || ++ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range) ++ { ++ return AVERROR_INVALIDDATA; ++ } ++ ++ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift); ++ *chroma_offset++ = av_clip( ++ wp_offset_half_range + delta_offset - ++ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom), ++ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift; ++ } ++ } ++ else ++ { ++ *chroma_weight++ = chroma_weight_base; ++ *chroma_weight++ = chroma_weight_base; ++ *chroma_offset++ = 0; ++ *chroma_offset++ = 0; ++ } ++ } while ((i >>= 1) != 0); ++ ++ return 0; ++} ++ ++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) ++{ ++ int err; ++ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb); ++ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb); ++ ++ if (luma_log2_weight_denom > 7 || ++ chroma_log2_weight_denom > 7) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n", ++ luma_log2_weight_denom, chroma_log2_weight_denom); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; ++ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; ++ ++ if ((err = get_weights(s, gb, s->sh.nb_refs[L0], ++ s->sh.luma_weight_l0, s->sh.luma_offset_l0, ++ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 || ++ (err = get_weights(s, gb, s->sh.nb_refs[L1], ++ s->sh.luma_weight_l1, s->sh.luma_offset_l1, ++ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n"); ++ return err; ++ } ++ ++ return 0; ++} ++ ++static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb) ++{ ++ const HEVCRpiSPS *sps = s->ps.sps; ++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; ++ int prev_delta_msb = 0; ++ unsigned int nb_sps = 0, nb_sh; ++ int i; ++ ++ rps->nb_refs = 0; ++ if (!sps->long_term_ref_pics_present_flag) ++ return 0; ++ ++ if (sps->num_long_term_ref_pics_sps > 0) ++ nb_sps = get_ue_golomb_long(gb); ++ nb_sh = get_ue_golomb_long(gb); ++ ++ if (nb_sps > sps->num_long_term_ref_pics_sps) ++ return AVERROR_INVALIDDATA; ++ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc)) ++ return AVERROR_INVALIDDATA; ++ ++ rps->nb_refs = nb_sh + nb_sps; ++ ++ for (i = 0; i < rps->nb_refs; i++) { ++ uint8_t delta_poc_msb_present; ++ ++ if (i < nb_sps) { ++ uint8_t lt_idx_sps = 0; ++ ++ if (sps->num_long_term_ref_pics_sps > 1) ++ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps)); ++ ++ rps->poc[i] = 
sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps]; ++ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps]; ++ } else { ++ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb); ++ rps->used[i] = get_bits1(gb); ++ } ++ ++ delta_poc_msb_present = get_bits1(gb); ++ if (delta_poc_msb_present) { ++ int64_t delta = get_ue_golomb_long(gb); ++ int64_t poc; ++ ++ if (i && i != nb_sps) ++ delta += prev_delta_msb; ++ ++ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb; ++ if (poc != (int32_t)poc) ++ return AVERROR_INVALIDDATA; ++ rps->poc[i] = poc; ++ prev_delta_msb = delta; ++ } ++ } ++ ++ return 0; ++} ++ ++static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps, ++ const HEVCRpiSPS *sps) ++{ ++ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; ++ const HEVCRpiWindow *ow = &sps->output_window; ++ unsigned int num = 0, den = 0; ++ ++ avctx->pix_fmt = sps->pix_fmt; ++ avctx->coded_width = sps->width; ++ avctx->coded_height = sps->height; ++ avctx->width = sps->width - ow->left_offset - ow->right_offset; ++ avctx->height = sps->height - ow->top_offset - ow->bottom_offset; ++ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics; ++ avctx->profile = sps->ptl.general_ptl.profile_idc; ++ avctx->level = sps->ptl.general_ptl.level_idc; ++ ++ ff_set_sar(avctx, sps->vui.sar); ++ ++ if (sps->vui.video_signal_type_present_flag) ++ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG ++ : AVCOL_RANGE_MPEG; ++ else ++ avctx->color_range = AVCOL_RANGE_MPEG; ++ ++ if (sps->vui.colour_description_present_flag) { ++ avctx->color_primaries = sps->vui.colour_primaries; ++ avctx->color_trc = sps->vui.transfer_characteristic; ++ avctx->colorspace = sps->vui.matrix_coeffs; ++ } else { ++ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; ++ avctx->color_trc = AVCOL_TRC_UNSPECIFIED; ++ avctx->colorspace = AVCOL_SPC_UNSPECIFIED; ++ } ++ ++ if (vps->vps_timing_info_present_flag) { ++ num = vps->vps_num_units_in_tick; ++ den = vps->vps_time_scale; ++ } else if (sps->vui.vui_timing_info_present_flag) { ++ num = sps->vui.vui_num_units_in_tick; ++ den = sps->vui.vui_time_scale; ++ } ++ ++ if (num != 0 && den != 0) ++ av_reduce(&avctx->framerate.den, &avctx->framerate.num, ++ num, den, 1 << 30); ++} ++ ++static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps) ++{ ++ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts; ++ ++ // Admit to no h/w formats ++ ++ *fmt++ = sps->pix_fmt; ++ *fmt = AV_PIX_FMT_NONE; ++ ++ return pix_fmts[0] == AV_PIX_FMT_NONE ? 
AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts);
++}
++
++static int is_sps_supported(const HEVCRpiSPS * const sps)
++{
++    return av_rpi_is_sand_format(sps->pix_fmt) &&
++        sps->width <= HEVC_RPI_MAX_WIDTH &&
++        sps->height <= HEVC_RPI_MAX_HEIGHT;
++}
++
++static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
++                   const enum AVPixelFormat pix_fmt)
++{
++    int ret;
++
++    pic_arrays_free(s);
++    s->ps.sps = NULL;
++    s->ps.vps = NULL;
++
++    if (sps == NULL)
++        return 0;
++
++    if (!is_sps_supported(sps))
++        return AVERROR_DECODER_NOT_FOUND;
++
++    ret = pic_arrays_init(s, sps);
++    if (ret < 0)
++        goto fail;
++
++    export_stream_params(s->avctx, &s->ps, sps);
++
++    s->avctx->pix_fmt = pix_fmt;
++
++    ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth);
++    ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
++
++    // * We don't support cross_component_prediction_enabled_flag but as that
++    // must be 0 unless we have 4:4:4 there is no point testing for it as we
++    // only deal with sand which is never 4:4:4
++    // [support wouldn't be hard]
++
++    rpi_hevc_qpu_set_fns(s, sps->bit_depth);
++
++    av_freep(&s->sao_pixel_buffer_h[0]);
++    av_freep(&s->sao_pixel_buffer_v[0]);
++
++    if (sps->sao_enabled)
++    {
++        const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
++        unsigned int c_idx;
++        size_t vsize[3] = {0};
++        size_t hsize[3] = {0};
++
++        for(c_idx = 0; c_idx < c_count; c_idx++) {
++            int w = sps->width >> ctx_hshift(s, c_idx);
++            int h = sps->height >> ctx_vshift(s, c_idx);
++            // ctb height & width are a min of 8 so this must be a multiple of 16
++            // so no point rounding up!
++            hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
++            vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
++        }
++
++        // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
++        // when we have plaited chroma
++        s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
++        s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
++        s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
++        s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
++        s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
++        s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
++    }
++
++    s->ps.sps = sps;
++    s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++
++    return 0;
++
++fail:
++    pic_arrays_free(s);
++    s->ps.sps = NULL;
++    return ret;
++}
++
++static inline int qp_offset_valid(const int qp_offset)
++{
++    return qp_offset >= -12 && qp_offset <= 12;
++}
++
++static int hls_slice_header(HEVCRpiContext * const s)
++{
++    GetBitContext * const gb = &s->HEVClc->gb;
++    RpiSliceHeader * const sh = &s->sh;
++    int i, ret;
++
++    // Coded parameters
++    sh->first_slice_in_pic_flag = get_bits1(gb);
++    if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++        if (IS_IDR(s))
++            ff_hevc_rpi_clear_refs(s);
++    }
++    sh->no_output_of_prior_pics_flag = 0;
++    if (IS_IRAP(s))
++        sh->no_output_of_prior_pics_flag = get_bits1(gb);
++
++    sh->pps_id = get_ue_golomb_long(gb);
++    if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
++        av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
++        return AVERROR_INVALIDDATA;
++    }
++    if (!sh->first_slice_in_pic_flag &&
++        s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
++        av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
++        return AVERROR_INVALIDDATA;
++    }
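++    // (Added note: HEVC - clause 7.4.7.1 - requires every slice segment of
++    // a picture to reference the same PPS, which is what the check above
++    // enforces; only the first slice of a picture may switch to a new PPS.)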
++ s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data; ++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1) ++ sh->no_output_of_prior_pics_flag = 1; ++ ++ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { ++ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; ++ const HEVCRpiSPS *last_sps = s->ps.sps; ++ enum AVPixelFormat pix_fmt; ++ ++ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { ++ if (sps->width != last_sps->width || sps->height != last_sps->height || ++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering != ++ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering) ++ sh->no_output_of_prior_pics_flag = 0; ++ } ++ ff_hevc_rpi_clear_refs(s); ++ ++ ret = set_sps(s, sps, sps->pix_fmt); ++ if (ret < 0) ++ return ret; ++ ++ pix_fmt = get_format(s, sps); ++ if (pix_fmt < 0) ++ return pix_fmt; ++ ++// ret = set_sps(s, sps, pix_fmt); ++// if (ret < 0) ++// return ret; ++ ++ s->avctx->pix_fmt = pix_fmt; ++ ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ } ++ ++ sh->dependent_slice_segment_flag = 0; ++ if (!sh->first_slice_in_pic_flag) { ++ int slice_address_length; ++ ++ if (s->ps.pps->dependent_slice_segments_enabled_flag) ++ sh->dependent_slice_segment_flag = get_bits1(gb); ++ ++ slice_address_length = av_ceil_log2(s->ps.sps->ctb_size); ++ sh->slice_segment_addr = get_bitsz(gb, slice_address_length); ++ if (sh->slice_segment_addr >= s->ps.sps->ctb_size) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid slice segment address: %u.\n", ++ sh->slice_segment_addr); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (!sh->dependent_slice_segment_flag) { ++ sh->slice_addr = sh->slice_segment_addr; ++ s->slice_idx++; ++ } ++ } else { ++ sh->slice_segment_addr = sh->slice_addr = 0; ++ s->slice_idx = 0; ++ s->slice_initialized = 0; ++ } ++ ++ if (!sh->dependent_slice_segment_flag) { ++ s->slice_initialized = 0; ++ ++ for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++) ++ skip_bits(gb, 1); // slice_reserved_undetermined_flag[] ++ ++ sh->slice_type = get_ue_golomb_long(gb); ++ if (!(sh->slice_type == HEVC_SLICE_I || ++ sh->slice_type == HEVC_SLICE_P || ++ sh->slice_type == HEVC_SLICE_B)) { ++ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n", ++ sh->slice_type); ++ return AVERROR_INVALIDDATA; ++ } ++ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) { ++ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // when flag is not present, picture is inferred to be output ++ sh->pic_output_flag = 1; ++ if (s->ps.pps->output_flag_present_flag) ++ sh->pic_output_flag = get_bits1(gb); ++ ++ if (s->ps.sps->separate_colour_plane_flag) ++ sh->colour_plane_id = get_bits(gb, 2); ++ ++ if (!IS_IDR(s)) { ++ int poc, pos; ++ ++ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb); ++ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type); ++ if (!sh->first_slice_in_pic_flag && poc != s->poc) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc); ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return AVERROR_INVALIDDATA; ++ poc = s->poc; ++ } ++ s->poc = poc; ++ ++ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb); ++ pos = get_bits_left(gb); ++ if (!sh->short_term_ref_pic_set_sps_flag) { ++ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1); ++ if (ret < 0) ++ 
return ret; ++ ++ sh->short_term_rps = &sh->slice_rps; ++ } else { ++ int numbits, rps_idx; ++ ++ if (!s->ps.sps->nb_st_rps) { ++ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ numbits = av_ceil_log2(s->ps.sps->nb_st_rps); ++ rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0; ++ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx]; ++ } ++ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb); ++ ++ pos = get_bits_left(gb); ++ ret = decode_lt_rps(s, &sh->long_term_rps, gb); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n"); ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return AVERROR_INVALIDDATA; ++ } ++ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb); ++ ++ if (s->ps.sps->sps_temporal_mvp_enabled_flag) ++ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb); ++ else ++ sh->slice_temporal_mvp_enabled_flag = 0; ++ } else { ++ s->sh.short_term_rps = NULL; ++ s->poc = 0; ++ } ++ ++ /* 8.3.1 */ ++ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 && ++ s->nal_unit_type != HEVC_NAL_TRAIL_N && ++ s->nal_unit_type != HEVC_NAL_TSA_N && ++ s->nal_unit_type != HEVC_NAL_STSA_N && ++ s->nal_unit_type != HEVC_NAL_RADL_N && ++ s->nal_unit_type != HEVC_NAL_RADL_R && ++ s->nal_unit_type != HEVC_NAL_RASL_N && ++ s->nal_unit_type != HEVC_NAL_RASL_R) ++ s->pocTid0 = s->poc; ++ ++ if (s->ps.sps->sao_enabled) { ++ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb); ++ if (ctx_cfmt(s) != 0) { ++ sh->slice_sample_adaptive_offset_flag[1] = ++ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb); ++ } ++ } else { ++ sh->slice_sample_adaptive_offset_flag[0] = 0; ++ sh->slice_sample_adaptive_offset_flag[1] = 0; ++ sh->slice_sample_adaptive_offset_flag[2] = 0; ++ } ++ ++ sh->nb_refs[L0] = sh->nb_refs[L1] = 0; ++ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) { ++ int nb_refs; ++ ++ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active; ++ ++ if (get_bits1(gb)) { // num_ref_idx_active_override_flag ++ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1; ++ } ++ if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) { ++ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n", ++ sh->nb_refs[L0], sh->nb_refs[L1]); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->rpl_modification_flag[0] = 0; ++ sh->rpl_modification_flag[1] = 0; ++ nb_refs = ff_hevc_rpi_frame_nb_refs(s); ++ if (!nb_refs) { ++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) { ++ sh->rpl_modification_flag[0] = get_bits1(gb); ++ if (sh->rpl_modification_flag[0]) { ++ for (i = 0; i < sh->nb_refs[L0]; i++) ++ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs)); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ sh->rpl_modification_flag[1] = get_bits1(gb); ++ if (sh->rpl_modification_flag[1] == 1) ++ for (i = 0; i < sh->nb_refs[L1]; i++) ++ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs)); ++ } ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->mvd_l1_zero_flag = get_bits1(gb); ++ ++ if (s->ps.pps->cabac_init_present_flag) ++ sh->cabac_init_flag = get_bits1(gb); ++ else ++ sh->cabac_init_flag = 0; ++ ++ sh->collocated_ref_idx = 0; ++ if 
(sh->slice_temporal_mvp_enabled_flag) {
++            sh->collocated_list = L0;
++            if (sh->slice_type == HEVC_SLICE_B)
++                sh->collocated_list = !get_bits1(gb);
++
++            if (sh->nb_refs[sh->collocated_list] > 1) {
++                sh->collocated_ref_idx = get_ue_golomb_long(gb);
++                if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
++                    av_log(s->avctx, AV_LOG_ERROR,
++                           "Invalid collocated_ref_idx: %d.\n",
++                           sh->collocated_ref_idx);
++                    return AVERROR_INVALIDDATA;
++                }
++            }
++        }
++
++        if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) ||
++            (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B))
++        {
++            if ((ret = pred_weight_table(s, gb)) != 0)
++                return ret;
++        }
++        else
++        {
++            // Give us unit weights
++            default_pred_weight_table(s);
++        }
++
++        sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
++        if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
++            av_log(s->avctx, AV_LOG_ERROR,
++                   "Invalid number of merging MVP candidates: %d.\n",
++                   sh->max_num_merge_cand);
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    sh->slice_qp_delta = get_se_golomb(gb);
++
++    if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
++        sh->slice_cb_qp_offset = get_se_golomb(gb);
++        sh->slice_cr_qp_offset = get_se_golomb(gb);
++        if (!qp_offset_valid(sh->slice_cb_qp_offset) ||
++            !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) ||
++            !qp_offset_valid(sh->slice_cr_qp_offset) ||
++            !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset))
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps: %d/%d; slice: %d/%d)\n",
++                   s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset,
++                   sh->slice_cb_qp_offset, sh->slice_cr_qp_offset);
++            return AVERROR_INVALIDDATA;
++        }
++    } else
++    {
++        sh->slice_cb_qp_offset = 0;
++        sh->slice_cr_qp_offset = 0;
++    }
++
++    if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
++        sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
++    else
++        sh->cu_chroma_qp_offset_enabled_flag = 0;
++
++    if (s->ps.pps->deblocking_filter_control_present_flag) {
++        int deblocking_filter_override_flag = 0;
++
++        if (s->ps.pps->deblocking_filter_override_enabled_flag)
++            deblocking_filter_override_flag = get_bits1(gb);
++
++        if (deblocking_filter_override_flag) {
++            sh->disable_deblocking_filter_flag = get_bits1(gb);
++            if (!sh->disable_deblocking_filter_flag) {
++                int beta_offset_div2 = get_se_golomb(gb);
++                int tc_offset_div2 = get_se_golomb(gb);
++                if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
++                    tc_offset_div2 < -6 || tc_offset_div2 > 6) {
++                    av_log(s->avctx, AV_LOG_ERROR,
++                           "Invalid deblock filter offsets: %d, %d\n",
++                           beta_offset_div2, tc_offset_div2);
++                    return AVERROR_INVALIDDATA;
++                }
++                sh->beta_offset = beta_offset_div2 * 2;
++                sh->tc_offset = tc_offset_div2 * 2;
++            }
++        } else {
++            sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
++            sh->beta_offset = s->ps.pps->beta_offset;
++            sh->tc_offset = s->ps.pps->tc_offset;
++        }
++    } else {
++        sh->disable_deblocking_filter_flag = 0;
++        sh->beta_offset = 0;
++        sh->tc_offset = 0;
++    }
++
++    if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
++        (sh->slice_sample_adaptive_offset_flag[0] ||
++         sh->slice_sample_adaptive_offset_flag[1] ||
++         !sh->disable_deblocking_filter_flag)) {
++        sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++    } else {
++        sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
++    }
++    sh->no_dblk_boundary_flags =
++        (sh->slice_loop_filter_across_slices_enabled_flag ?
0 : ++ BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) | ++ (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 : ++ BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE); ++ ++ ++ } else if (!s->slice_initialized) { ++ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->num_entry_point_offsets = 0; ++ sh->offload_wpp = 0; ++ sh->offload_tiles = 0; ++ ++ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { ++ unsigned num_entry_point_offsets = get_ue_golomb_long(gb); ++ // It would be possible to bound this tighter but this here is simpler ++ if (num_entry_point_offsets > get_bits_left(gb)) { ++ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->num_entry_point_offsets = num_entry_point_offsets; ++ if (sh->num_entry_point_offsets > 0) { ++ int offset_len = get_ue_golomb_long(gb) + 1; ++ ++ if (offset_len < 1 || offset_len > 32) { ++ sh->num_entry_point_offsets = 0; ++ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); ++ return ret; ++ } ++ ++ for (i = 0; i < sh->num_entry_point_offsets; i++) { ++ uint32_t val_minus1 = get_bits_long(gb, offset_len); ++ if (val_minus1 > (1 << 28)) ++ { ++ // We can declare offsets of > 2^28 bad without loss of generality ++ // Will check actual bounds wrt NAL later, but this keeps ++ // the values within bounds we can deal with easily ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size ++ } ++ ++ // Do we want to offload this ++ if (s->threads_type != 0) ++ { ++ sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) && ++ s->ps.pps->num_tile_columns > 1; ++ // * We only cope with WPP in a single column ++ // Probably want to deal with that case as tiles rather than WPP anyway ++ // ?? 
Not actually sure that the main code deals with WPP + multi-col correctly ++ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->ps.pps->num_tile_columns == 1; ++ } ++ } ++ } ++ ++ if (s->ps.pps->slice_header_extension_present_flag) { ++ unsigned int length = get_ue_golomb_long(gb); ++ if (length*8LL > get_bits_left(gb)) { ++ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < length; i++) ++ skip_bits(gb, 8); // slice_header_extension_data_byte ++ } ++ ++ // Inferred parameters ++ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta; ++ if (sh->slice_qp > 51 || ++ sh->slice_qp < -s->ps.sps->qp_bd_offset) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The slice_qp %d is outside the valid range " ++ "[%d, 51].\n", ++ sh->slice_qp, ++ -s->ps.sps->qp_bd_offset); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Overread slice header by %d bits\n", -get_bits_left(gb)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->slice_initialized = 1; ++ return 0; ++} ++ ++static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry) ++{ ++ RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width; ++ int c_idx, i; ++ ++ if (s->sh.slice_sample_adaptive_offset_flag[0] || ++ s->sh.slice_sample_adaptive_offset_flag[1]) { ++ if ((lc->ctb_avail & AVAIL_L) != 0) ++ { ++ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); ++ if (sao_merge_left_flag) { ++ *sao = sao[-1]; ++ return; ++ } ++ } ++ if ((lc->ctb_avail & AVAIL_U) != 0) ++ { ++ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); ++ if (sao_merge_up_flag) { ++ *sao = sao[-(int)s->ps.sps->ctb_width]; ++ return; ++ } ++ } ++ } ++ ++ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) { ++ const unsigned int log2_sao_offset_scale = c_idx == 0 ? 
s->ps.pps->log2_sao_offset_scale_luma : ++ s->ps.pps->log2_sao_offset_scale_chroma; ++ int offset_abs[4]; ++ char offset_sign[4] = {0}; ++ ++ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { ++ sao->type_idx[c_idx] = SAO_NOT_APPLIED; ++ continue; ++ } ++ ++ if (c_idx == 2) { ++ sao->type_idx[2] = sao->type_idx[1]; ++ sao->eo_class[2] = sao->eo_class[1]; ++ } else { ++ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc); ++ } ++ ++ // ** Could use BY22 here quite plausibly - this is all bypass stuff ++ // though only per CTB so not very timing critical ++ ++ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) ++ continue; ++ ++ for (i = 0; i < 4; i++) ++ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc); ++ ++ if (sao->type_idx[c_idx] == SAO_BAND) { ++ for (i = 0; i < 4; i++) { ++ if (offset_abs[i] != 0) ++ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc); ++ } ++ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc); ++ } else if (c_idx != 2) { ++ sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc); ++ } ++ ++ // Inferred parameters ++ sao->offset_val[c_idx][0] = 0; ++ for (i = 0; i < 4; i++) { ++ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; ++ if (sao->type_idx[c_idx] == SAO_EDGE) { ++ if (i > 1) ++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; ++ } else if (offset_sign[i]) { ++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; ++ } ++ } ++ } ++} ++ ++#if 0 ++static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) { ++ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4 ++ ++ if (log2_res_scale_abs_plus1 != 0) { ++ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx); ++ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * ++ (1 - 2 * res_scale_sign_flag); ++ } else { ++ lc->tu.res_scale_val = 0; ++ } ++ ++ ++ return 0; ++} ++#endif ++ ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) ++{ ++ return jb->intra.cmds + jb->intra.n++; ++} ++ ++#define A0(x, y, U, L, UL, UR, DL) \ ++ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? 
AVAIL_DL : 0)) ++ ++#define A1(x, y, U, L, UL, UR, DL) \ ++ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) ++ ++#define A2(x, y, U, L, UL, UR, DL) \ ++ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) ++ ++#define A3(x, y, U, L, UL, UR, DL) \ ++ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) ++ ++#define A4(x, y, U, L, UL, UR, DL) \ ++ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) ++ ++static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; ++ ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) ++{ ++ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size; ++ const unsigned int tb_x = x & ~ctb_mask; ++ const unsigned int tb_y = y & ~ctb_mask; ++ const unsigned int ctb_avail = lc->ctb_avail; ++ ++ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; ++ ++ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); ++ ++ // This deals with both the U & L edges ++ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0) ++ f |= AVAIL_UL; ++ ++ if (x + w < lc->end_of_ctb_x) ++ f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR; ++ else if (tb_y == 0) ++ f |= (ctb_avail & AVAIL_UR); ++#if AVAIL_S_U - AVAIL_S_UR < 0 ++#error Shift problem ++#endif ++ ++ // Never any D if Y beyond eoctb ++ if (y + h < lc->end_of_ctb_y) ++ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL; ++#if AVAIL_S_DL - AVAIL_S_L < 0 ++#error Shift problem ++#endif ++ ++// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, ++// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], ++// lc->end_of_ctb_x, lc->end_of_ctb_y); ++ ++ return f; ++} ++ ++#undef A0 ++#undef A1 ++#undef A2 ++#undef A3 ++#undef A4 ++ ++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, ++ unsigned int avail) ++{ ++ // If rpi_enabled then sand - U & V done on U call ++ if (c_idx <= 1) ++ { ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); ++ cmd->type = RPI_PRED_INTRA + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->avail = avail; ++ cmd->i_pred.x = x0; ++ cmd->i_pred.y = y0; ++ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ ++// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); ++ } ++} ++ ++#define CBF_CB0_S 0 ++#define CBF_CB1_S 1 // CB1 must be CB0 + 1 ++#define CBF_CR0_S 2 ++#define CBF_CR1_S 3 ++ ++#define CBF_CB0 (1 << CBF_CB0_S) ++#define CBF_CR0 (1 << CBF_CR0_S) ++#define CBF_CB1 (1 << CBF_CB1_S) ++#define CBF_CR1 (1 << CBF_CR1_S) ++ ++// * Only good for chroma_idx == 1 ++static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size, const unsigned int log2_trafo_size, ++ const unsigned int blk_idx, const int cbf_luma, ++ const unsigned int cbf_chroma) ++{ ++ const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1); ++ const unsigned int x0_c = x0 & ~7; ++ const unsigned int y0_c = y0 & ~7; ++ ++ enum ScanType scan_idx = SCAN_DIAG; ++ enum ScanType scan_idx_c = SCAN_DIAG; ++ ++ if (lc->cu.pred_mode == MODE_INTRA) ++ { ++ const unsigned int trafo_size = 1 << log2_trafo_size; ++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size); ++ ++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail); ++ ++ if (log2_trafo_size > 2) ++ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail); ++ else if (blk_idx == 3) ++ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8)); ++ ++ if (log2_trafo_size < 4) { ++ if (lc->tu.intra_pred_mode >= 6 && ++ lc->tu.intra_pred_mode <= 14) { ++ scan_idx = SCAN_VERT; ++ } else if (lc->tu.intra_pred_mode >= 22 && ++ lc->tu.intra_pred_mode <= 30) { ++ scan_idx = SCAN_HORIZ; ++ } ++ ++ if (lc->tu.intra_pred_mode_c >= 6 && ++ lc->tu.intra_pred_mode_c <= 14) { ++ scan_idx_c = SCAN_VERT; ++ } else if (lc->tu.intra_pred_mode_c >= 22 && ++ lc->tu.intra_pred_mode_c <= 30) { ++ scan_idx_c = SCAN_HORIZ; ++ } ++ } ++ } ++ ++ if (!cbf_luma && cbf_chroma == 0) ++ return 0; ++ ++ if (lc->tu.is_cu_qp_delta_wanted) ++ { ++ const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc); ++ const unsigned int cb_mask = ~0U << log2_cb_size; ++ ++ if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) || ++ qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1))) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The cu_qp_delta %d is outside the valid range " ++ "[%d, %d].\n", ++ qp_delta, ++ -(26 + (s->ps.sps->qp_bd_offset >> 1)), ++ (25 + (s->ps.sps->qp_bd_offset >> 1))); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ lc->tu.is_cu_qp_delta_wanted = 0; ++ lc->tu.cu_qp_delta = qp_delta; ++ ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask); ++ } ++ ++ // * Not main profile & untested due to no conform streams ++ if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma && ++ !lc->cu.cu_transquant_bypass_flag) { ++ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc); ++ if (cu_chroma_qp_offset_flag) { ++ int cu_chroma_qp_offset_idx = 0; ++ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { ++ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc); ++ } ++ lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; ++ lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; ++ } ++ lc->tu.cu_chroma_qp_offset_wanted = 0; ++ } ++ ++ if (cbf_luma) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); ++ ++ if (log2_trafo_size > 2 || blk_idx == 3) ++ { ++ if ((cbf_chroma & CBF_CB0) != 0) ++ 
ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, ++ log2_trafo_size_c, scan_idx_c, 1); ++ if ((cbf_chroma & CBF_CR0) != 0) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, ++ log2_trafo_size_c, scan_idx_c, 2); ++ } ++ ++ return 0; ++} ++ ++static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size) ++{ ++ set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3); ++} ++ ++ ++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_trafo_size, ++ const unsigned int trafo_depth, const unsigned int blk_idx, ++ const unsigned int cbf_c0) ++{ ++ // When trafo_size == 2 hls_transform_unit uses c0 so put in c1 ++ unsigned int cbf_c1 = cbf_c0; ++ int split_transform_flag; ++ int ret; ++ ++ if (lc->cu.intra_split_flag) { ++ if (trafo_depth == 1) { ++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx]; ++ if (ctx_cfmt(s) == 3) { ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx]; ++ } else { ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; ++ } ++ } ++ } else { ++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0]; ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; ++ } ++ ++ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size && ++ log2_trafo_size > s->ps.sps->log2_min_tb_size && ++ trafo_depth < lc->cu.max_trafo_depth && ++ !(lc->cu.intra_split_flag && trafo_depth == 0)) ++ { ++ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size); ++ } else { ++ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && ++ lc->cu.pred_mode == MODE_INTER && ++ lc->cu.part_mode != PART_2Nx2N && ++ trafo_depth == 0; ++ ++ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size || ++ (lc->cu.intra_split_flag && trafo_depth == 0) || ++ inter_split; ++ } ++ ++ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) ++ { ++ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3); ++ cbf_c1 = 0; ++ ++ if ((cbf_c0 & CBF_CB0) != 0) ++ { ++ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S; ++ if (wants_c1) ++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S; ++ } ++ ++ if ((cbf_c0 & CBF_CR0) != 0) ++ { ++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S; ++ if (wants_c1) ++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S; ++ } ++ } ++ ++ if (split_transform_flag) { ++ const int trafo_size_split = 1 << (log2_trafo_size - 1); ++ const int x1 = x0 + trafo_size_split; ++ const int y1 = y0 + trafo_size_split; ++ ++#define SUBDIVIDE(x, y, idx) \ ++do { \ ++ ret = hls_transform_tree(s, lc, x, y, \ ++ log2_trafo_size - 1, trafo_depth + 1, idx, \ ++ cbf_c1); \ ++ if (ret < 0) \ ++ return ret; \ ++} while (0) ++ ++ SUBDIVIDE(x0, y0, 0); ++ SUBDIVIDE(x1, y0, 1); ++ SUBDIVIDE(x0, y1, 2); ++ SUBDIVIDE(x1, y1, 3); ++ ++#undef SUBDIVIDE ++ } else { ++ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have ++ // trafo_size == 2 with depth == 0 the issue is moot ++ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) || ++ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth)); ++ ++ ret = 
hls_transform_unit(s, lc, x0, y0,
++                                 log2_trafo_size + trafo_depth, log2_trafo_size,
++                                 blk_idx, cbf_luma, cbf_c1);
++        if (ret < 0)
++            return ret;
++
++        if (!s->sh.disable_deblocking_filter_flag) {
++            ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma);
++        }
++    }
++    return 0;
++}
++
++
++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
++{
++    GetBitContext gb;
++    int ret;
++
++    ret = init_get_bits(&gb, pcm, length);
++    if (ret < 0)
++        return ret;
++
++    s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
++                       frame_stride1(s->frame, 0),
++                       cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
++
++    s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)),
++                         s->frame->linesize[1],
++                         cb_size >> ctx_hshift(s, 1),
++                         cb_size >> ctx_vshift(s, 1),
++                         &gb, s->ps.sps->pcm.bit_depth_chroma);
++
++    return 0;
++}
++
++
++// x * 2^(y*2)
++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
++{
++    return x << (y * 2);
++}
++
++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
++{
++    // Length in bits
++    const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
++        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
++        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
++
++    const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3);
++
++    if (!s->sh.disable_deblocking_filter_flag)
++        ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++
++    // Copy coeffs
++    {
++        const int blen = (length + 7) >> 3;
++        // Round allocated bytes up to nearest 32 to avoid alignment confusion
++        // Allocation is in int16_t units
++        // As we are only using 1 byte per sample and the coeff buffer allows 2 per
++        // sample this rounding doesn't affect the total size we need to allocate for
++        // the coeff buffer
++        int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
++        memcpy(coeffs, pcm, blen);
++
++        // Our coeff stash assumes that any partially allocated 64-byte lump
++        // is zeroed so make that true.
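++        // As an illustrative size check (numbers not from any particular
++        // stream): an 8x8 8-bit 4:2:0 block gives length = 512 + 2*128 = 768
++        // bits, so blen = 96 and ((96 + 31) & ~31) >> 1 = 48 int16_t - the
++        // same 96 bytes, as 96 is already a multiple of 32.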
++ { ++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; ++ if ((-(intptr_t)eopcm & 63) != 0) ++ memset(eopcm, 0, -(intptr_t)eopcm & 63); ++ } ++ ++ // Add command ++ { ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); ++ cmd->type = RPI_PRED_I_PCM; ++ cmd->size = log2_cb_size; ++ cmd->i_pcm.src = coeffs; ++ cmd->i_pcm.x = x0; ++ cmd->i_pcm.y = y0; ++ cmd->i_pcm.src_len = length; ++ } ++ return 0; ++ } ++} ++ ++ ++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, ++ const MvXY xy, const int y0, const int height) ++{ ++ if (s->threads_type != 0) { ++ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); ++ ++ // Progress has to be attached to current job as the actual wait ++ // is in worker_core which can't use lc ++ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++} ++ ++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, const int nPbW, ++ const int nPbH, ++ HEVCRpiMvField * const mv) ++{ ++ enum InterPredIdc inter_pred_idc = PRED_L0; ++ int mvp_flag; ++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); ++ ++ mv->pred_flag = 0; ++ if (s->sh.slice_type == HEVC_SLICE_B) ++ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); ++ ++ if (inter_pred_idc != PRED_L1) { ++ MvXY mvd; ++ ++ if (s->sh.nb_refs[L0]) ++ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); ++ ++ mv->pred_flag = PF_L0; ++ mvd = ff_hevc_rpi_hls_mvd_coding(lc); ++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, ++ mv, mvp_flag, 0); ++ mv->xy[0] = mvxy_add(mv->xy[0], mvd); ++ } ++ ++ if (inter_pred_idc != PRED_L0) { ++ MvXY mvd = 0; ++ ++ if (s->sh.nb_refs[L1]) ++ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); ++ ++ if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) ++ mvd = ff_hevc_rpi_hls_mvd_coding(lc); ++ ++ mv->pred_flag += PF_L1; ++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, ++ mv, mvp_flag, 1); ++ mv->xy[1] = mvxy_add(mv->xy[1], mvd); ++ } ++} ++ ++ ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) ++{ ++ HEVCRpiInterPredQ * yp = NULL; ++ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr; ++ const unsigned int max_fill = ipe->max_fill; ++ unsigned int load = UINT_MAX; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) { ++ // We will always have enough room between the Qs but if we are ++ // running critically low due to poor scheduling then use fill size ++ // rather than load to determine QPU. This has obvious dire ++ // performance implications but (a) it is better than crashing ++ // and (b) it should (almost) never happen ++ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; ++ const unsigned int tload = tfill > max_fill ? 
tfill + 0x1000000 : ypt->load;
++
++        if (tload < load)
++        {
++            yp = ypt;
++            load = tload;
++        }
++    }
++
++    yp->load += load_val;
++    ipe->used_grp = 1;
++    qpu_mc_link_set(yp->qpu_mc_curr, fn);
++
++    return yp;
++}
++
++
++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
++{
++    for (unsigned int i = 0; i != ipe->n; ++i) {
++        HEVCRpiInterPredQ * const q = ipe->q + i;
++        const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
++
++        qpu_mc_link_set(q->qpu_mc_curr, q->code_sync);
++        q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1);
++        q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
++    }
++}
++
++// Returns 0 on success
++// We no longer check for Q fullness as we have emergency code in ctu alloc
++// * However it might be an idea to have some means of spotting that we've used it
++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
++{
++    if (!ipe->used_grp)
++        return 0;
++
++    if ((ipe->curr += ipe->n_grp) >= ipe->n)
++    {
++        ipe->curr = 0;
++        rpi_inter_pred_sync(ipe);
++    }
++    ipe->used = 1;
++    ipe->used_grp = 0;
++
++    return 0;
++}
++
++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
++{
++    unsigned int i;
++
++    ipe->curr = 0;
++    ipe->used = 0;
++    ipe->used_grp = 0;
++    for (i = 0; i != ipe->n; ++i) {
++        HEVCRpiInterPredQ * const q = ipe->q + i;
++        q->qpu_mc_curr = q->qpu_mc_base;
++        q->load = 0;
++        q->last_l0 = NULL;
++        q->last_l1 = NULL;
++    }
++}
++
++static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
++                                const unsigned int n_max, const unsigned int n_grp,
++                                const unsigned int total_size, const unsigned int min_gap)
++{
++    int rv;
++
++    memset(ipe, 0, sizeof(*ipe));
++    if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL)
++        return AVERROR(ENOMEM);
++
++    ipe->n_grp = n_grp;
++    ipe->min_gap = min_gap;
++
++    if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0)
++        av_freep(&ipe->q);
++    return rv;
++}
++
++
++#if RPI_QPU_EMU_Y
++#define get_mc_address_y(f) ((f)->data[0])
++#else
++#define get_mc_address_y(f) get_vc_address_y(f)
++#endif
++#if RPI_QPU_EMU_C
++#define get_mc_address_u(f) ((f)->data[1])
++#else
++#define get_mc_address_u(f) get_vc_address_u(f)
++#endif
++
++static inline uint32_t pack_wo_p(const int off, const int mul)
++{
++    return PACK2(off * 2 + 1, mul);
++}
++
++static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul)
++{
++    return PACK2(off0 + off1 + 1, mul);
++}
++
++
++static void
++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++           const int x0, const int y0,
++           const int nPbW, const int nPbH,
++           const MvXY mv_xy,
++           const int weight_mul,
++           const int weight_offset,
++           AVFrame *const src_frame)
++{
++    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++    const unsigned int mx = MV_X(mv_xy) & 3;
++    const unsigned int my = MV_Y(mv_xy) & 3;
++    const unsigned int my_mx = (my << 8) | mx;
++    const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
++    const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
++    qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
++    const uint32_t wo = pack_wo_p(weight_offset, weight_mul);
++    HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++
++    if (my_mx == 0)
++    {
++        const int x1 = x0 + (MV_X(mv_xy) >> 2);
++        const int y1 = y0 + (MV_Y(mv_xy) >> 2);
++        const int bh = nPbH;
++
++        for (int start_x = 0; start_x < nPbW; start_x += 16)
++        {
++            const int bw = FFMIN(nPbW - start_x, 16);
++            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
++            qpu_mc_src_t *const src1 = yp->last_l0;
++            qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
++
++#if RPI_TSTATS
++            {
++                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++                ++ts->y_pred1_x0y0;
++
++                if (nPbW > 8)
++                    ++ts->y_pred1_wgt8;
++                else
++                    ++ts->y_pred1_wle8;
++
++                if (nPbH > 16)
++                    ++ts->y_pred1_hgt16;
++                else
++                    ++ts->y_pred1_hle16;
++            }
++#endif
++
++            src1->x = x1 + start_x;
++            src1->y = y1;
++            src1->base = src_vc_address_y;
++            cmd_y->w = bw;
++            cmd_y->h = bh;
++            cmd_y->wo1 = wo;
++            cmd_y->dst_addr = dst_addr + (start_x << xshl);
++            yp->last_l0 = &cmd_y->next_src1;
++            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++        }
++    }
++    else
++    {
++        const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3;
++        const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
++        const unsigned int bh = nPbH;
++        int start_x = 0;
++
++#if 1
++        // As Y-pred operates on two independent 8-wide src blocks we can merge
++        // this pred with the previous one if the previous one is 8 pel wide,
++        // the same height as the current block, immediately to the left of our
++        // current dest block and mono-pred.
++
++        qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
++        if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
++        {
++            const int bw = FFMIN(nPbW, 8);
++            qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
++
++            last_y8_src2->x = x1_m3;
++            last_y8_src2->y = y1_m3;
++            last_y8_src2->base = src_vc_address_y;
++            last_y8_p->w += bw;
++            last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
++            last_y8_p->wo2 = wo;
++
++            jb->last_y8_p = NULL;
++            jb->last_y8_l1 = NULL;
++            start_x = bw;
++#if RPI_TSTATS
++            ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge;
++#endif
++        }
++#endif
++
++        for (; start_x < nPbW; start_x += 16)
++        {
++            const int bw = FFMIN(nPbW - start_x, 16);
++            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
++            qpu_mc_src_t *const src1 = yp->last_l0;
++            qpu_mc_src_t *const src2 = yp->last_l1;
++            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++            {
++                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++                if (mx == 0 && my == 0)
++                    ++ts->y_pred1_x0y0;
++                else if (mx == 0)
++                    ++ts->y_pred1_x0;
++                else if (my == 0)
++                    ++ts->y_pred1_y0;
++                else
++                    ++ts->y_pred1_xy;
++
++                if (nPbW > 8)
++                    ++ts->y_pred1_wgt8;
++                else
++                    ++ts->y_pred1_wle8;
++
++                if (nPbH > 16)
++                    ++ts->y_pred1_hgt16;
++                else
++                    ++ts->y_pred1_hle16;
++            }
++#endif
++            src1->x = x1_m3 + start_x;
++            src1->y = y1_m3;
++            src1->base = src_vc_address_y;
++            if (bw <= 8)
++            {
++                src2->x = MC_DUMMY_X;
++                src2->y = MC_DUMMY_Y;
++#if RPI_QPU_EMU_Y
++                src2->base = s->qpu_dummy_frame_emu;
++#else
++                src2->base = s->qpu_dummy_frame_qpu;
++#endif
++            }
++            else
++            {
++                src2->x = x1_m3 + start_x + 8;
++                src2->y = y1_m3;
++                src2->base = src_vc_address_y;
++            }
++            cmd_y->w = bw;
++            cmd_y->h = bh;
++            cmd_y->mymx21 = my2_mx2_my_mx;
++            cmd_y->wo1 = wo;
++            cmd_y->wo2 = wo;
++            cmd_y->dst_addr = dst_addr + (start_x << xshl);
++            yp->last_l0 = &cmd_y->next_src1;
++            yp->last_l1 = &cmd_y->next_src2;
++            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++
++            if (bw == 8) {
++                jb->last_y8_l1 = src2;
++                jb->last_y8_p = cmd_y;
++            }
++        }
++    }
++}
++
++static void
++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++             const int x0, const int y0,
++             const int nPbW, const int nPbH,
++             const struct HEVCRpiMvField *const mv_field,
++             const AVFrame *const src_frame,
++             const
AVFrame *const src_frame2) ++{ ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const MvXY mv = mv_field->xy[0]; ++ const MvXY mv2 = mv_field->xy[1]; ++ ++ const unsigned int mx = MV_X(mv) & 3; ++ const unsigned int my = MV_Y(mv) & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = MV_X(mv2) & 3; ++ const unsigned int my2 = MV_Y(mv2) & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]); ++ ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; ++ ++ if (my2_mx2_my_mx == 0) ++ { ++ const int x1 = x0 + (MV_X(mv) >> 2); ++ const int y1 = y0 + (MV_Y(mv) >> 2); ++ const int x2 = x0 + (MV_X(mv2) >> 2); ++ const int y2 = y0 + (MV_Y(mv2) >> 2); ++ const int bh = nPbH; ++ ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ // Filter requires a run-up of 3 ++ const int x1 = x0 + (MV_X(mv) >> 2) - 3; ++ const int y1 = y0 + (MV_Y(mv) >> 2) - 3; ++ const int x2 = x0 + (MV_X(mv2) >> 2) - 3; ++ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3; ++ const int bh = nPbH; ++ ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + 
start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++} ++ ++// h/v shifts fixed at one as that is all the qasm copes with ++static void ++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const unsigned int lx, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const MvXY mv, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ AVFrame * const src_frame) ++{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; ++ ++ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]); ++ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; ++ ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) ++ { ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? 
&cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); ++ } ++ return; ++} ++ ++// h/v shifts fixed at one as that is all the qasm copes with ++static void ++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const struct HEVCRpiMvField * const mv_field, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ const int16_t * const c_weights2, ++ const int16_t * const c_offsets2, ++ AVFrame * const src_frame, ++ AVFrame * const src_frame2) ++{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; ++ const MvXY mv = mv_field->xy[0]; ++ const MvXY mv2 = mv_field->xy[1]; ++ ++ const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift); ++ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; ++ ++ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); ++ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]); ++ ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) ++ { ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; ++ ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; ++ ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << 
xshl); ++ ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ } ++} ++ ++ ++static inline void ++col_stash(const HEVCRpiContext * const s, ++ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0, ++ const HEVCRpiMvField * const mvf) ++{ ++ ColMvField * const col_mvf = s->ref->col_mvf; ++ const unsigned int x = (x0 + 15) >> 4; ++ const unsigned int y = (y0 + 15) >> 4; ++ const unsigned int w = ((x0 + 15 + w0) >> 4) - x; ++ const unsigned int h = ((y0 + 15 + h0) >> 4) - y; ++ ++ if (col_mvf != NULL && w != 0 && h != 0) ++ { ++ // Only record MV from the top left of the 16x16 block ++ ++ const RefPicList * const rpl = s->refPicList; ++ const ColMvField cmv = { ++ .L = { ++ { ++ .poc = (mvf->pred_flag & PF_L0) == 0 ? ++ COL_POC_INTRA : ++ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]), ++ .xy = mvf->xy[0] ++ }, ++ { ++ .poc = (mvf->pred_flag & PF_L1) == 0 ? ++ COL_POC_INTRA : ++ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]), ++ .xy = mvf->xy[1] ++ } ++ } ++ }; ++ ++ ColMvField * p = col_mvf + y * s->col_mvf_stride + x; ++ const unsigned int stride = s->col_mvf_stride - w; ++ unsigned int j = h; ++ ++ do ++ { ++ unsigned int k = w; ++ do ++ { ++ *p++ = cmv; ++ } while (--k != 0); ++ p += stride; ++ } while (--j != 0); ++ } ++} ++ ++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) ++{ ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ struct HEVCRpiMvField current_mv = {{0}}; ++ const RefPicList *const refPicList = s->refPicList; ++ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL; ++ ++ if (lc->cu.pred_mode != MODE_SKIP) ++ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc); ++ ++ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) { ++ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 
0 : ++ ff_hevc_rpi_merge_idx_decode(s, lc); ++ ++ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ partIdx, merge_idx, ¤t_mv); ++ } else { ++ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, ¤t_mv); ++ } ++ ++ { ++ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); ++ unsigned int i, j; ++ ++ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++) ++ { ++ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++) ++ p[i] = current_mv; ++ p += MVF_STASH_WIDTH_PU; ++ } ++ } ++ ++ col_stash(s, x0, y0, nPbW, nPbH, ¤t_mv); ++ ++ if (current_mv.pred_flag & PF_L0) { ++ ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; ++ if (!ref0) ++ return; ++ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); ++ } ++ if (current_mv.pred_flag & PF_L1) { ++ ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; ++ if (!ref1) ++ return; ++ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH); ++ } ++ ++ if (current_mv.pred_flag == PF_L0) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0], ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0], ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ return; ++ } ++ } else if (current_mv.pred_flag == PF_L1) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1], ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ return; ++ } ++ } else if (current_mv.pred_flag == PF_BI) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, ++ ¤t_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref0->frame, ++ ref1->frame); ++ return; ++ } ++ } ++} ++ ++static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size, ++ const unsigned int ipm) ++{ ++ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE; ++ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE; ++ ++ { ++ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); ++ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm); ++ } ++ ++ // If IRAP then everything is Intra & we avoid ever looking at these ++ // stashes so don't bother setting them ++ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA) ++ { ++ if (s->is_intra 
!= NULL)
++        {
++            set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE);
++        }
++
++        {
++            HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
++            const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE;  // min_pu <= log2_cb so >= 1
++            unsigned int n = size_in_pus;
++
++            do
++            {
++                memset(p, 0, size_in_pus * sizeof(*p));
++                p += MVF_STASH_WIDTH_PU;
++            } while (--n != 0);
++        }
++
++
++        if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0)
++        {
++            // Only record top left stuff
++            // Blocks should always be aligned on size boundaries
++            // so cannot have overflow from a small block
++
++            ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
++            const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4));
++            const unsigned int stride = s->col_mvf_stride - size_in_col;
++            unsigned int j = size_in_col;
++
++            do
++            {
++                unsigned int k = size_in_col;
++                do
++                {
++                    p->L[0].poc = COL_POC_INTRA;
++                    p->L[0].xy = 0;
++                    p->L[1].poc = COL_POC_INTRA;
++                    p->L[1].xy = 0;
++                    ++p;
++                } while (--k != 0);
++                p += stride;
++            } while (--j != 0);
++        }
++    }
++}
++
++static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                                       const unsigned int x0, const unsigned int y0,
++                                                       const unsigned int log2_cb_size)
++{
++    set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC);
++}
++
++
++/**
++ * 8.4.1
++ */
++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                int x0, int y0, int log2_pu_size,
++                                int prev_intra_luma_pred_flag,
++                                const unsigned int idx)
++{
++    const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
++    const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++    const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++
++    // Up does not cross boundaries so as we always scan 1 slice-tile-line in an
++    // lc we can just keep 1 CTB of L & U stashes
++    // Left is reset to DC @ Start of Line/Tile/Slice in fill_job
++    const unsigned int cand_up = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];
++    const unsigned int cand_left = lc->ipm_left[yb_pu];
++
++    unsigned int intra_pred_mode;
++    unsigned int a, b, c;
++
++    if (cand_left == cand_up) {
++        if (cand_left < 2) {
++            a = INTRA_PLANAR;
++            b = INTRA_DC;
++            c = INTRA_ANGULAR_26;
++        } else {
++            a = cand_left;
++            b = 2 + ((cand_left - 2 - 1 + 32) & 31);
++            c = 2 + ((cand_left - 2 + 1) & 31);
++        }
++    } else {
++        a = cand_left;
++        b = cand_up;
++        c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ?
++                INTRA_PLANAR :
++            (cand_left != INTRA_DC && cand_up != INTRA_DC) ?
++                INTRA_DC :
++                INTRA_ANGULAR_26;
++    }
++
++    if (prev_intra_luma_pred_flag) {
++        intra_pred_mode = idx == 0 ? a : idx == 1 ?
b : c; ++ } else { ++ // Sort lowest 1st ++ if (a > b) ++ FFSWAP(int, a, b); ++ if (a > c) ++ FFSWAP(int, a, c); ++ if (b > c) ++ FFSWAP(int, b, c); ++ ++ intra_pred_mode = idx; ++ if (intra_pred_mode >= a) ++ intra_pred_mode++; ++ if (intra_pred_mode >= b) ++ intra_pred_mode++; ++ if (intra_pred_mode >= c) ++ intra_pred_mode++; ++ } ++ ++ /* write the intra prediction units into the mv array */ ++ set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode); ++ return intra_pred_mode; ++} ++ ++static const uint8_t tab_mode_idx[] = { ++ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, ++ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; ++ ++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size) ++{ ++ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; ++ uint8_t prev_intra_luma_pred_flag[4]; ++ int split = lc->cu.part_mode == PART_NxN; ++ const unsigned int split_size = (1 << (log2_cb_size - 1)); ++ int chroma_mode; ++ const unsigned int n = split ? 4 : 1; ++ unsigned int i; ++ ++ for (i = 0; i != n; i++) ++ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc); ++ ++ for (i = 0; i < n; i++) { ++ // depending on mode idx is mpm or luma_pred_mode ++ const unsigned int idx = prev_intra_luma_pred_flag[i] ? ++ ff_hevc_rpi_mpm_idx_decode(lc) : ++ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc); ++ ++ lc->pu.intra_pred_mode[i] = ++ luma_intra_pred_mode(s, lc, ++ x0 + ((i & 1) == 0 ? 0 : split_size), ++ y0 + ((i & 2) == 0 ? 0 : split_size), ++ log2_cb_size - split, ++ prev_intra_luma_pred_flag[i], idx); ++ } ++ ++ if (ctx_cfmt(s) == 3) { ++ for (i = 0; i < n; i++) { ++ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode]) ++ lc->pu.intra_pred_mode_c[i] = 34; ++ else ++ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode]; ++ } else { ++ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i]; ++ } ++ } ++ } else if (ctx_cfmt(s) == 2) { ++ int mode_idx; ++ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) ++ mode_idx = 34; ++ else ++ mode_idx = intra_chroma_table[chroma_mode]; ++ } else { ++ mode_idx = lc->pu.intra_pred_mode[0]; ++ } ++ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; ++ } else if (ctx_cfmt(s) != 0) { ++ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) ++ lc->pu.intra_pred_mode_c[0] = 34; ++ else ++ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode]; ++ } else { ++ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0]; ++ } ++ } ++} ++ ++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size) ++{ ++ const unsigned int cb_size = 1 << log2_cb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int min_cb_width = s->ps.sps->min_cb_width; ++ const unsigned int x_cb = x0 >> log2_min_cb_size; ++ const unsigned int y_cb = y0 >> log2_min_cb_size; ++ const unsigned int idx = log2_cb_size - 2; ++ const unsigned int qp_block_mask = (1 << 
s->ps.pps->log2_min_cu_qp_delta_size) - 1; ++ int skip_flag = 0; ++ ++ lc->cu.x = x0; ++ lc->cu.y = y0; ++ lc->cu.x_split = x0; ++ lc->cu.y_split = y0; ++ ++ lc->cu.pred_mode = MODE_INTRA; ++ lc->cu.part_mode = PART_2Nx2N; ++ lc->cu.intra_split_flag = 0; ++ lc->cu.cu_transquant_bypass_flag = 0; ++ lc->pu.intra_pred_mode[0] = 1; ++ lc->pu.intra_pred_mode[1] = 1; ++ lc->pu.intra_pred_mode[2] = 1; ++ lc->pu.intra_pred_mode[3] = 1; ++ ++ if (s->ps.pps->transquant_bypass_enable_flag) { ++ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc); ++ if (lc->cu.cu_transquant_bypass_flag) ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } ++ ++ if (s->sh.slice_type != HEVC_SLICE_I) { ++ lc->cu.pred_mode = MODE_INTER; ++ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); ++ } ++ ++ if (skip_flag) { ++ lc->cu.pred_mode = MODE_SKIP; ++ ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); ++ } else { ++ int pcm_flag = 0; ++ ++ if (s->sh.slice_type != HEVC_SLICE_I) ++ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc); ++ if (lc->cu.pred_mode != MODE_INTRA || ++ log2_cb_size == s->ps.sps->log2_min_cb_size) { ++ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size); ++ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && ++ lc->cu.pred_mode == MODE_INTRA; ++ } ++ ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ if (lc->cu.part_mode == PART_2Nx2N && ++ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled ++ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && ++ ff_hevc_rpi_pcm_flag_decode(lc) != 0) ++ { ++ int ret; ++ pcm_flag = 1; ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0) ++ return ret; ++ ++ if (s->ps.sps->pcm.loop_filter_disable_flag) ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } else { ++ intra_prediction_unit(s, lc, x0, y0, log2_cb_size); ++ } ++ } else { ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ switch (lc->cu.part_mode) { ++ case PART_2Nx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); ++ break; ++ case PART_2NxN: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); ++ break; ++ case PART_Nx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); ++ lc->cu.x_split = x0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); ++ break; ++ case PART_2NxnU: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 4; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx); ++ break; ++ case PART_2NxnD: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 4 * 3; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx); ++ break; ++ case PART_nLx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); ++ lc->cu.x_split = x0 + cb_size / 
4;
++            hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
++            break;
++        case PART_nRx2N:
++            hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2);
++            lc->cu.x_split = x0 + cb_size / 4 * 3;
++            hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2);
++            break;
++        case PART_NxN:
++            hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
++            lc->cu.x_split = x0 + cb_size / 2;
++            hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
++            lc->cu.y_split = y0 + cb_size / 2;
++            hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
++            hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
++            break;
++        }
++    }
++
++    if (!pcm_flag) {
++        int rqt_root_cbf = 1;
++
++        if (lc->cu.pred_mode != MODE_INTRA &&
++            !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
++            rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
++        }
++        if (rqt_root_cbf) {
++            const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0);
++            int ret;
++
++            lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
++                s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
++                s->ps.sps->max_transform_hierarchy_depth_inter;
++            // transform_tree does deblock_boundary_strengths
++            ret = hls_transform_tree(s, lc, x0, y0,
++                                     log2_cb_size, 0, 0, cbf_c);
++            if (ret < 0)
++                return ret;
++        } else {
++            if (!s->sh.disable_deblocking_filter_flag)
++                ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++        }
++    }
++    }
++
++    // If the delta is still wanted then we haven't read the delta & therefore need to set qp here
++    if (lc->tu.is_cu_qp_delta_wanted)
++        ff_hevc_rpi_set_qPy(s, lc, x0, y0);
++
++    if (((x0 + (1 << log2_cb_size)) & qp_block_mask) == 0 &&
++        ((y0 + (1 << log2_cb_size)) & qp_block_mask) == 0)
++    {
++        lc->qPy_pred = lc->qp_y;
++    }
++
++    set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff);
++
++    set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);
++
++    return 0;
++}
++
++// Returns:
++//  < 0   Error
++//  0     More data wanted
++//  1     EoSlice / EoPicture
++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++                               const int log2_cb_size, const unsigned int cb_depth)
++{
++    const int cb_size = 1 << log2_cb_size;
++    int ret;
++    int split_cu;
++
++    lc->ct_depth = cb_depth;
++    split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
++    if (x0 + cb_size <= s->ps.sps->width &&
++        y0 + cb_size <= s->ps.sps->height &&
++        split_cu)
++    {
++        split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
++    }
++
++    // Qp delta (and offset) need to remain wanted if cb_size < min until
++    // a coded block is found so we still set the initial state at depth 0
++    // (outside this fn) and only reset here
++    if (s->ps.pps->cu_qp_delta_enabled_flag &&
++        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
++    {
++        lc->tu.is_cu_qp_delta_wanted = 1;
++        lc->tu.cu_qp_delta = 0;
++    }
++    if (s->sh.cu_chroma_qp_offset_enabled_flag &&
++        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
++    {
++        lc->tu.cu_chroma_qp_offset_wanted = 1;
++    }
++
++    lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0];
++    lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset;
++    lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset;
++
++    if (split_cu) {
++        int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
++        const int cb_size_split = cb_size >> 1;
++        const int x1 = x0 + cb_size_split;
++        const int y1 = y0 + cb_size_split;
++
++        int more_data = 0;
++
++        more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
++        if (more_data < 0)
++            return more_data;
++
++        if (more_data && x1 < s->ps.sps->width) {
++            more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++        if (more_data && y1 < s->ps.sps->height) {
++            more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++        if (more_data && x1 < s->ps.sps->width &&
++            y1 < s->ps.sps->height) {
++            more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++
++        if (((x0 + (1 << log2_cb_size)) & qp_block_mask) == 0 &&
++            ((y0 + (1 << log2_cb_size)) & qp_block_mask) == 0)
++            lc->qPy_pred = lc->qp_y;
++
++        if (more_data)
++            return ((x1 + cb_size_split) < s->ps.sps->width ||
++                    (y1 + cb_size_split) < s->ps.sps->height);
++        else
++            return 0;
++    } else {
++        ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
++        if (ret < 0)
++            return ret;
++        if ((!((x0 + cb_size) %
++               (1 << (s->ps.sps->log2_ctb_size))) ||
++             (x0 + cb_size >= s->ps.sps->width)) &&
++            (!((y0 + cb_size) %
++               (1 << (s->ps.sps->log2_ctb_size))) ||
++             (y0 + cb_size >= s->ps.sps->height))) {
++            int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc);
++            return !end_of_slice_flag;
++        } else {
++            return 1;
++        }
++    }
++
++    return 0;  // NEVER
++}
++
++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                 const int x_ctb, const int y_ctb, const int ctb_addr_ts)
++{
++    const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
++    const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++    const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr;  // slice_addr = RS addr of start of slice
++    const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
++    const unsigned int line_w = s->ps.sps->ctb_width;
++
++    s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
++
++    lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);
++    lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
++
++    lc->boundary_flags = 0;
++
++    if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0)
++        lc->boundary_flags |= BOUNDARY_LEFT_TILE;
++    if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
++        lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
++    if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0)
++        lc->boundary_flags |= BOUNDARY_UPPER_TILE;
++    if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w])
++        lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
++
++    // Use line width rather than tile width for addr_in_slice test as
++    // addr_in_slice is in raster units
++
++    lc->ctb_avail =
++        ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) |
++        ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) |
++        ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
++            (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) |
++        ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
++            (ctb_addr_rs_in_slice + 1 >= line_w) ? AVAIL_UR : 0);
++    // Down-left never avail at CTB level
++}
++
++
++static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds,
++            (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0);
++
++    // Signal
++    if (y > 0) {
++        // Cast away const as progress is held in s, but this really shouldn't confuse anything
++        ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1);
++    }
++
++    // Job done now
++    // ? Move outside this fn
++    job_free(s->jbc, jb);
++}
++
++// I-pred, transform_and_add for all block types done here
++// All ARM
++static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    unsigned int i;
++    HEVCRpiIntraPredEnv * const iap = &jb->intra;
++    const HEVCPredCmd *cmd = iap->cmds;
++
++#if !RPI_WORKER_WAIT_PASS_0
++    rpi_sem_wait(&jb->sem);
++    rpi_cache_flush_execute(jb->rfe);  // Invalidate data set up in pass1
++#endif
++
++    for (i = iap->n; i > 0; i--, cmd++)
++    {
++        switch (cmd->type)
++        {
++        case RPI_PRED_INTRA:
++            s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
++            break;
++        case RPI_PRED_INTRA_C:
++            s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
++            break;
++        case RPI_PRED_ADD_RESIDUAL:
++            s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++            break;
++        case RPI_PRED_ADD_DC:
++            s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++            break;
++        case RPI_PRED_ADD_RESIDUAL_U:
++            s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++            break;
++        case RPI_PRED_ADD_RESIDUAL_V:
++            s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++            break;
++        case RPI_PRED_ADD_RESIDUAL_C:
++            s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++            break;
++        case RPI_PRED_ADD_DC_U:
++        case RPI_PRED_ADD_DC_V:
++            s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++            break;
++
++        case RPI_PRED_I_PCM:
++            pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
++            break;
++
++        default:
++            av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++            abort();
++        }
++    }
++
++    // Mark done
++    iap->n = 0;
++}
++
++
++// Set initial uniform job values & zero ctu_count
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
++{
++    unsigned int i;
++    HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
++    HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
++    const HEVCRpiSPS * const sps = s->ps.sps;
++
++    const uint16_t pic_width_y = sps->width;
++    const uint16_t pic_height_y = sps->height;
++
++    const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1);
++    const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1);
++
++    // We expect the pointer to change if we use another sps
++    if (sps != jb->sps)
++    {
++        worker_pic_free_one(jb);
++
++        set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);
++        set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
++
++        {
++            const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
++            // e.g. for 4:2:0 (hshift = vshift = 1) this works out to
++            // coefs_per_luma / 2: two chroma planes at a quarter luma size each
++            const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
++            worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
++
} ++ ++ jb->sps = sps; ++ } ++ ++ jb->waited = 0; ++ jb->ctu_ts_first = ctu_ts_first; ++ jb->ctu_ts_last = -1; ++ ++ rpi_inter_pred_reset(cipe); ++ for (i = 0; i < cipe->n; i++) { ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; ++ ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ cp->last_l0 = &u->next_src1; ++ ++ u->next_fn = 0; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; ++ ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ } ++ ++ rpi_inter_pred_reset(yipe); ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; ++ ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->next_fn = 0; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; ++ ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); ++ } ++ ++ jb->last_y8_p = NULL; ++ jb->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { ++ jb->progress_req[i] = -1; ++ } ++ ++ worker_pic_reset(&jb->coeffs); ++} ++ ++ ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; ++ ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; ++ ++ if (block_size > max_block) ++ max_block = block_size; ++ ++ qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit); ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_qpu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_qpu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); ++ ++ return 1; ++} ++#endif ++ ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C 
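++// ARM-emulation variant of the QPU terminate above: the Qs are closed out
++// and the final L0/L1 srcs are pointed at the dummy frame in the same way,
++// but there are no mailbox entries to build and no cache flushing to
++// schedule as the emulated shader runs on the ARM side.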
++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 1; ++} ++#endif ++ ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif ++ ++ ++static void flush_frame(HEVCRpiContext *s,AVFrame *frame) ++{ ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ rpi_cache_flush_finish(rfe); ++} ++ ++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; ++ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; ++ const unsigned int ctb_width = s->ps.sps->ctb_width; ++ RpiBlk *const bounds = &jb->bounds; ++ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); ++ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; ++ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; ++ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; ++ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; ++ ++ bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x); ++ bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y); ++} ++ ++#if RPI_PASSES == 2 ++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, jb); ++ ++ // Perform deblocking for CTBs in this row ++ rpi_execute_dblk_cmds(s, jb); ++} ++#endif ++ ++// Core execution tasks ++static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ int pred_y, pred_c; ++ vpu_qpu_job_env_t qvbuf; ++ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); ++#if RPI_WORKER_WAIT_PASS_0 ++ int do_wait; ++#endif ++ ++ { ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if (cf->s[3].n + cf->s[2].n != 0) ++ { ++ const unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ unsigned int n16 = (cf->s[2].n >> 8); ++ unsigned int n32 = (cf->s[3].n >> 10); ++#if RPI_COMPRESS_COEFFS ++ if (cf->s[2].packed) { ++ n16 = n16 | (n16<<16); ++ } else { ++ const unsigned int npack16 = (cf->s[2].packed_n>>8); ++ n16 = n16 | (npack16<<16); ++ } ++ if (cf->s[3].packed) { ++ n32 = n32 | (n32<<16); ++ } else { ++ const unsigned int npack32 = (cf->s[3].packed_n>>10); ++ n32 = n32 | (npack32<<16); ++ } ++#endif ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ 
vpu_get_constants(),
++                                cf->gptr.vc,
++                                n16,
++                                cf->gptr.vc + offset32,
++                                n32,
++                                0);
++
++            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
++            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
++        }
++    }
++
++    pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip);
++
++// We could take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++
++    pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip);
++
++    // Returns 0 if nothing to do, 1 if sync added
++#if RPI_WORKER_WAIT_PASS_0
++    do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
++#else
++    if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
++        sem_post(&jb->sem);
++#endif
++
++    rpi_cache_flush_execute(jb->rfe);
++
++    // Await progress as required
++    // jb->waited will only be clear if we have already tested the progress values
++    // (in worker_submit_job) and found we don't have to wait
++    if (jb->waited)
++    {
++        unsigned int i;
++        for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++            if (jb->progress_req[i] >= 0) {
++                ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
++            }
++        }
++    }
++
++    vpu_qpu_job_finish(vqj);
++
++    // We always work on a rectangular block
++    if (pred_y || pred_c)
++    {
++        rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++                                        jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
++                                        ctx_vshift(s, 1), pred_y, pred_c);
++    }
++
++    // If we have emulated VPU ops - do it here
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++    if (av_rpi_is_sand8_frame(s->frame))
++    {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
++#else
++        ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
++#endif
++    }
++    else
++    {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
++#else
++        ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
++#endif
++    }
++#endif
++
++#if RPI_WORKER_WAIT_PASS_0
++    if (do_wait)
++        rpi_sem_wait(&jb->sem);
++    rpi_cache_flush_execute(jb->rfe);
++#endif
++}
++
++
++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
++{
++    av_freep(&ipe->q);
++    gpu_free(&ipe->gptr);
++}
++
++static HEVCRpiJob * job_new(void)
++{
++    HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
++
++    if (jb == NULL)
++        return NULL;
++
++    sem_init(&jb->sem, 0, 0);
++    jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
++    ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
++
++    jb->intra.n = 0;
++    if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
++        goto fail1;
++
++    // * Sizeof the union structure might be overkill but at the moment it
++    //   is correct (it certainly isn't going to be too small)
++    // Set max fill to slack/2 from the end of the Q
++    // If we exceed this in any Q then we will schedule by size (which should
++    // mean that we never use that Q again apart from syncs)
++    // * Given how aggressive the overflow response is we could maybe put the
++    //   threshold even nearer the end, but I don't expect us to ever hit
++    //   it on any real stream anyway.
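++    // A note on the sizes passed below: total_size covers the command
++    // blocks plus one uint32_t link word per sync, and the min_gap argument
++    // is half of the per-Q command slack in bytes - the "slack/2" rule
++    // described above.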
++ ++ if (rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t), ++ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0) ++ goto fail2; ++ if (rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t), ++ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0) ++ goto fail3; ++ ++ return jb; ++ ++fail3: ++ rpi_free_inter_pred(&jb->luma_ip); ++fail2: ++ av_freep(&jb->intra.cmds); ++fail1: ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ rpi_cache_flush_finish(jb->rfe); ++ sem_destroy(&jb->sem); ++ return NULL; ++} ++ ++static void job_delete(HEVCRpiJob * const jb) ++{ ++ worker_pic_free_one(jb); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++ av_freep(&jb->intra.cmds); ++ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing ++ sem_destroy(&jb->sem); ++ av_free(jb); ++} ++ ++static void jbg_delete(HEVCRpiJobGlobal * const jbg) ++{ ++ HEVCRpiJob * jb; ++ ++ if (jbg == NULL) ++ return; ++ ++ jb = jbg->free1; ++ while (jb != NULL) ++ { ++ HEVCRpiJob * const jb2 = jb; ++ jb = jb2->next; ++ job_delete(jb2); ++ } ++ ++ pthread_mutex_destroy(&jbg->lock); ++ av_free(jbg); ++} ++ ++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) ++{ ++ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); ++ if (jbg == NULL) ++ return NULL; ++ ++ pthread_mutex_init(&jbg->lock, NULL); ++ ++ while (job_count-- != 0) ++ { ++ HEVCRpiJob * const jb = job_new(); ++ if (jb == NULL) ++ goto fail; ++ ++ jb->next = jbg->free1; ++ jbg->free1 = jb; ++ } ++ ++ return jbg; ++ ++fail: ++ jbg_delete(jbg); ++ return NULL; ++} ++ ++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) ++{ ++ HEVCRpiJobGlobal * jbg; ++ ++ if (jbc == NULL) ++ return; ++ ++ jbg = jbc->jbg; ++ ++ if (jbc->jb1 != NULL) ++ job_delete(jbc->jb1); ++ ++ pthread_mutex_destroy(&jbc->in_lock); ++ sem_destroy(&jbc->sem_out); ++ av_free(jbc); ++ ++ // Deref the global job context ++ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) ++ jbg_delete(jbg); ++} ++ ++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg) ++{ ++ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl)); ++ ++ if (jbc == NULL) ++ return NULL; ++ ++ jbc->jbg = jbg; ++ atomic_fetch_add(&jbg->ref_count, 1); ++ ++ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS); ++ pthread_mutex_init(&jbc->in_lock, NULL); ++ ++ if ((jbc->jb1 = job_new()) == NULL) ++ goto fail; ++ jbc->jb1->jbc_local = jbc; ++ ++ return jbc; ++ ++fail: ++ rpi_job_ctl_delete(jbc); ++ return NULL; ++} ++ ++ ++ ++static av_cold void hevc_init_worker(HEVCRpiContext * const s) ++{ ++#if RPI_PASSES == 2 ++ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1); ++#elif RPI_PASSES == 3 ++ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2); ++ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1); ++#else ++#error Passes confused ++#endif ++ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0); ++ ++ pass_queues_start_all(s); ++} ++ ++static av_cold void hevc_exit_worker(HEVCRpiContext *s) ++{ ++ pass_queues_term_all(s); ++ ++ pass_queues_kill_all(s); ++ ++ rpi_job_ctl_delete(s->jbc); ++ s->jbc = NULL; ++} ++ ++ ++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc) ++{ ++ const int 
ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
++    const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
++
++    // Check for obvious disasters
++    if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
++        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    // If dependent then ctb_addr_ts != 0 from previous check
++    if (s->sh.dependent_slice_segment_flag) {
++        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
++        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
++            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
++        tile_id + s->sh.num_entry_point_offsets >= tiles)
++    {
++        av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    // Tiled stuff must start at start of tile if it has multiple entry points
++    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
++        s->sh.num_entry_point_offsets != 0 &&
++        ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
++    {
++        av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    ff_hevc_rpi_cabac_init_decoder(lc);
++
++    // Setup any required decode vars
++    lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
++
++//    printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot);
++    lc->qp_y = s->sh.slice_qp;
++
++    // General setup
++    lc->bt_line_no = 0;
++    lc->ts = ctb_addr_ts;
++    return 0;
++}
++
++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++    const GetBitContext * const gb = &s->HEVClc->gb;
++    RpiSliceHeader * const sh = &s->sh;
++    int i, j;
++
++    const unsigned int length = nal->size;
++    unsigned int offset = ((gb->index) >> 3) + 1;  // We have a bit & align still to come = +1 byte
++    unsigned int cmpt;
++    unsigned int startheader;
++
++    if (sh->num_entry_point_offsets == 0) {
++        s->data = NULL;
++        return 0;
++    }
++
++    // offset in slice header includes emulation prevention bytes.
++    // Unfortunately those have been removed by the time we get here so we
++    // have to compensate. The nal layer keeps track of where they were.
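++    // Worked example (illustrative numbers): if entry_point_offset[1] says
++    // segment 0 occupied 100 bytes of raw NAL but 2 emulation prevention
++    // bytes inside that span were stripped, cmpt ends up as 2 and
++    // size[0] = 100 - 2 = 98 bytes of actual payload.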
++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) { ++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { ++ startheader--; ++ cmpt++; ++ } ++ } ++ ++ for (i = 1; i < sh->num_entry_point_offsets; i++) { ++ offset += (sh->entry_point_offset[i - 1] - cmpt); ++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) { ++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { ++ startheader--; ++ cmpt++; ++ } ++ } ++ if (sh->entry_point_offset[i] <= cmpt) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt; ++ sh->offset[i - 1] = offset; ++ } ++ ++ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt; ++ if (length < offset) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->size[sh->num_entry_point_offsets - 1] = length - offset; ++ sh->offset[sh->num_entry_point_offsets - 1] = offset; ++ ++ // Remember data start pointer as we won't have nal later ++ s->data = nal->data; ++ return 0; ++} ++ ++ ++// Return ++// < 0 Error ++// 0 OK ++// ++// jb->ctu_ts_last < 0 Job still filling ++// jb->ctu_ts_last >= 0 Job ready ++ ++static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ HEVCRpiJob * const jb = lc->jb0; ++ int more_data = 1; ++ unsigned int ctb_addr_ts = lc->ts; ++ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size; ++ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size; ++ ++ lc->unit_done = 0; ++ ++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) ++ { ++ int q_full; ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; ++ ++ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); ++ ++ ff_hevc_rpi_cabac_init(s, lc, ctb_flags); ++ ++ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size); ++ ++ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; ++ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; ++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; ++ ++ // Zap stashes if navail ++ if ((lc->ctb_avail & AVAIL_U) == 0) ++ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3); ++ if ((lc->ctb_avail & AVAIL_L) == 0) ++ { ++ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE); ++ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3); ++ } ++#if MVF_STASH_WIDTH > 64 ++ // Restore left mvf stash at start of tile if not at start of line ++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) ++ { ++ unsigned int i; ++ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0); ++ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); ++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) ++ { ++ *dst = *src++; ++ dst += MVF_STASH_WIDTH_PU; ++ } ++ } ++#endif ++ ++ // Set initial tu states ++ lc->tu.cu_qp_delta = 0; ++ lc->tu.is_cu_qp_delta_wanted = 0; ++ lc->tu.cu_chroma_qp_offset_wanted = 0; ++ ++ // Decode ++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0); ++ ++ if 
(ff_hevc_rpi_cabac_overflow(lc))
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n");
++            more_data = AVERROR_INVALIDDATA;
++        }
++
++        if (more_data < 0) {
++            s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN;  // Mark slice as broken
++            return more_data;
++        }
++
++        if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
++             (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
++        {
++            if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
++                ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
++            {
++                av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate element\n");
++                return -1;
++            }
++        }
++
++        // --- Post CTB processing
++
++        // Stash rpl top/left for deblock that needs to remember such things cross-slice
++        s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
++        s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
++
++        if (!s->is_irap)
++        {
++            // Copy MVF up to up-left & stash to up
++            {
++                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
++                HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
++
++                // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
++
++                lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
++                memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
++            }
++            // Stash sideways if end of tile line but not end of line (no point)
++            // ** Could/should do this @ end of fn
++#if MVF_STASH_WIDTH > 64
++            if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
++#endif
++            {
++                unsigned int i;
++                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
++                HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++                for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++                {
++                    *dst++ = *src;
++                    src += MVF_STASH_WIDTH_PU;
++                }
++            }
++        }
++
++        if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
++            ff_hevc_rpi_save_states(s, lc);
++
++        // Report progress so we can use our MVs in other frames
++        if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
++            ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
++
++        // End of line || End of tile line || End of tile
++        // (EoL covers end of frame for our purposes here)
++        q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
++
++        // Allocate QPU chunks on fixed size 64 pel boundaries rather than
++        // whatever ctb_size is today.
++        // * We might quite like to continue to 64 pel vertical too but that
++        //   currently confuses WPP
++        if (((x_ctb + ctb_size) & 63) == 0 || q_full)
++        {
++            int overflow = 0;
++            if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
++                overflow = 1;
++            if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
++                overflow = 1;
++            if (overflow)
++            {
++                // * This is very annoying (and slow) to cope with in WPP so
++                //   we treat it as an error there (no known stream triggers this
++                //   with the current buffer sizes). Non-wpp should cope fine.
++                av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
++                q_full = 1;
++            }
++        }
++
++        // Inc TS to next.
++        ctb_addr_ts++;
++        ctb_addr_rs++;
++        x_ctb += ctb_size;
++
++        if (q_full)
++        {
++            // Do job
++            // Prep for submission
++            jb->ctu_ts_last = ctb_addr_ts - 1;  // Was pre-inced
++            job_gen_bounds(s, jb);
++            break;
++        }
++
++        // If max_blocks started as 0 then this will never be true
++        if (--max_blocks == 0)
++            break;
++    }
++
++    lc->unit_done = (more_data <= 0);
++    lc->ts = ctb_addr_ts;
++    return 0;
++}
++
++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
++{
++    lc->context = s;
++    lc->jb0 = NULL;
++    lc->lc_n = n;
++    lc->bt_terminate = 0;
++    lc->bt_psem_out = NULL;
++    sem_init(&lc->bt_sem_in, 0, 0);
++}
++
++#define TRACE_WPP 0
++#if RPI_EXTRA_BIT_THREADS > 0
++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
++{
++    unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
++    return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
++}
++
++// Move local context parameters from an aux bit thread back to the main
++// thread at the end of a slice as processing is going to continue there.
++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
++{
++    if (src_lc == dst_lc) {
++        return;
++    }
++
++    // Move the job
++    // We will still have an active job if the final line terminates early
++    // Dest should always be null by now
++    av_assert1(dst_lc->jb0 == NULL);
++    dst_lc->jb0 = src_lc->jb0;
++    src_lc->jb0 = NULL;
++
++    // Always need to store where we are in the bitstream
++    dst_lc->ts = src_lc->ts;
++    dst_lc->gb = src_lc->gb;
++    // Cabac init request will be built at start of next slice
++
++    // Need to store context if we might have a dependent seg
++    if (is_dep)
++    {
++        dst_lc->qPy_pred = src_lc->qPy_pred;
++        memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
++        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
++        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
++    }
++}
++
++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
++{
++    rpi_sem_wait(&lc->bt_sem_in);
++    return lc->bt_terminate;
++}
++
++// Do one WPP line
++// Will not work correctly over horizontal tile boundaries - vertical should be OK
++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
++{
++    const int is_tile = lc->bt_is_tile;
++    const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
++    const unsigned int line = lc->bt_line_no;
++    const unsigned int line_inc = lc->bt_line_inc;
++    const int is_last = (line >= lc->bt_last_line);
++
++    const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
++    const unsigned int ts_next =
++        line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
++            INT_MAX :
++        is_tile ?
++            s->ps.pps->tile_pos_ts[tile_id + line_inc] :
++            lc->ts + lc->bt_line_width * line_inc;
++    // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
++    const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
++    unsigned int ts_prev;
++    int loop_n = 0;
++    int err = 0;
++
++    av_assert1(line <= s->sh.num_entry_point_offsets);
++
++#if TRACE_WPP
++    printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
++           lc->lc_n, is_tile ?
"Tile" : "WPP", tile_id, ++ line, lc->bt_last_line, s->sh.num_entry_point_offsets, ++ lc->ts, ts_eol, ts_next, partial_size, lc->jb0); ++#endif ++ if (line != 0) ++ { ++ const uint8_t * const data = s->data + s->sh.offset[line - 1]; ++ const unsigned int len = s->sh.size[line - 1]; ++ if ((err = init_get_bits8(&lc->gb, data, len)) < 0) ++ return err; ++ ++ ff_init_cabac_decoder(&lc->cc, data, len); ++ } ++ ++ // We should never be processing a dependent slice here so reset is good ++ // ?? These probably shouldn't be needed (as they should be set by later ++ // logic) but do seem to be required ++ lc->qp_y = s->sh.slice_qp; ++ ++ do ++ { ++ if (!is_last && loop_n > 1) { ++#if TRACE_WPP ++ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ } ++ // The wait for loop_n == 0 has been done in bit_thread ++ if (!is_first && loop_n != 0) ++ { ++#if TRACE_WPP ++ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in); ++#endif ++ if (wait_bt_sem_in(lc) != 0) ++ return AVERROR_EXIT; ++ } ++ ++#if TRACE_WPP ++ { ++ int n; ++ sem_getvalue(&lc->bt_sem_in, &n); ++ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in); ++ } ++#endif ++ ++ ts_prev = lc->ts; ++ ++ // If we have had an error - do no further decode but do continue ++ // moving signals around so the other threads continue to operate ++ // correctly (or at least as correctly as they can with this line missing) ++ // ++ // Errors in WPP/Tile are less fatal than normal as we have a good idea ++ // of how to restart on the next line so there is no need to give up totally ++ if (err != 0) ++ { ++ lc->unit_done = 0; ++ lc->ts += partial_size; ++ } ++ else ++ { ++ worker_pass0_ready(s, lc); ++ ++ if ((err = fill_job(s, lc, partial_size)) < 0 || ++ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done))) ++ { ++ if (err == 0) { ++ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); ++ err = AVERROR_INVALIDDATA; ++ } ++ worker_free(s, lc); ++ lc->ts = ts_prev + partial_size; // Pretend we did all that ++ lc->unit_done = 0; ++ } ++ else if (is_tile) ++ { ++ worker_submit_job(s, lc); ++ } ++ } ++ ++ ++loop_n; ++ } while (lc->ts < ts_eol && !lc->unit_done); ++ ++ // If we are on the last line & we didn't get a whole line we must wait for ++ // and sink the sem_posts from the line above / tile to the left. 
++    while ((ts_prev += partial_size) < ts_eol)
++    {
++#if TRACE_WPP
++        printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
++#endif
++        if (wait_bt_sem_in(lc) != 0)
++            return AVERROR_EXIT;
++    }
++
++    lc->bt_line_no += line_inc;
++
++    if (!is_tile && err == 0)
++        worker_submit_job(s, lc);
++
++    if (!is_last) {
++        lc->ts = ts_next;
++
++#if TRACE_WPP
++        printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++        sem_post(lc->bt_psem_out);
++        if (loop_n > 1) {
++#if TRACE_WPP
++            printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++            sem_post(lc->bt_psem_out);
++        }
++    }
++    else
++    {
++        movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);  // * & not EoT
++#if MVF_STASH_WIDTH > 64
++        // Horrid calculations to work out what we want but luckily this should almost never execute
++        // **** Move to movlc
++        if (!s->is_irap)
++        {
++            const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
++            if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0)  // If EOTL then we have already stashed mvf
++            {
++                const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
++                unsigned int i;
++                const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++                HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++
++                for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
++                {
++                    *d_mvf = *s_mvf;
++                    d_mvf += MVF_STASH_WIDTH_PU;
++                    s_mvf += MVF_STASH_WIDTH_PU;
++                }
++
++            }
++        }
++#endif
++        // When all done poke the thread 0 sem_in one final time
++#if TRACE_WPP
++        printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
++#endif
++        sem_post(&s->HEVClcList[0]->bt_sem_in);
++    }
++
++#if TRACE_WPP
++    printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
++#endif
++    return err;
++}
++
++static void wpp_setup_lcs(HEVCRpiContext * const s)
++{
++    unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const unsigned int line_width = line_ts_width(s, ts);
++
++    for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
++    {
++        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++        lc->ts = ts;
++        lc->bt_is_tile = 0;
++        lc->bt_line_no = i;
++        lc->bt_line_width = line_width;
++        lc->bt_last_line = s->sh.num_entry_point_offsets;
++        lc->bt_line_inc = RPI_BIT_THREADS;
++        ts += line_width;
++    }
++}
++
++
++// Can only process a single tile row at once
++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
++{
++    const HEVCRpiPPS * const pps = s->ps.pps;
++    const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const unsigned int tile0 = pps->tile_id[ts0];
++    const unsigned int col0 = tile0 % pps->num_tile_columns;
++
++    const unsigned int col = (slice_row == 0) ? col0 : 0;
++    unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
++    const unsigned int last_line = FFMIN(
++        line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
++
++    const unsigned int par =
++        FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
++#if TRACE_WPP
++    printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
++           pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
++#endif
++    for (unsigned int i = 0; i != par; ++i, ++line)
++    {
++        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++        const unsigned int tile = tile0 + line;
++
++        lc->ts = pps->tile_pos_ts[tile];
++        lc->bt_line_no = line;
++        lc->bt_is_tile = 1;
++        lc->bt_line_width = line_ts_width(s, lc->ts);
++        lc->bt_last_line = last_line;
++        lc->bt_line_inc = par;
++    }
++}
++
++
++static void * bit_thread(void * v)
++{
++    HEVCRpiLocalContext * const lc = v;
++    HEVCRpiContext *const s = lc->context;
++
++    while (wait_bt_sem_in(lc) == 0)
++    {
++        int err;
++
++        if ((err = rpi_run_one_line(s, lc, 0)) < 0) {  // Never first tile/wpp
++            if (lc->bt_terminate) {
++                av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
++                break;
++            }
++            av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
++        }
++    }
++
++    return NULL;
++}
++
++static int bit_threads_start(HEVCRpiContext * const s)
++{
++    if (s->bt_started)
++        return 0;
++
++    for (int i = 1; i < RPI_BIT_THREADS; ++i)
++    {
++        // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS]
++        if (s->HEVClcList[i] == NULL) {
++            if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
++                return -1;
++        }
++
++        bt_lc_init(s, s->HEVClcList[i], i);
++        job_lc_init(s->HEVClcList[i]);
++    }
++
++    // Link the sems in a circle
++    for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
++        s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
++    s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
++
++    // Init all lc before starting any threads
++    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++    {
++        if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0)
++            return -1;
++    }
++
++    s->bt_started = 1;
++    return 0;
++}
++
++static int bit_threads_kill(HEVCRpiContext * const s)
++{
++    if (!s->bt_started)
++        return 0;
++    s->bt_started = 0;
++
++    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++    {
++        HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
++        if (lc == NULL)
++            break;
++
++        lc->bt_terminate = 1;
++        sem_post(&lc->bt_sem_in);
++        pthread_join(s->bit_threads[i], NULL);
++
++        sem_destroy(&lc->bt_sem_in);
++        job_lc_kill(lc);
++    }
++    return 0;
++}
++#endif
++
++
++// If we are at EoT and the row is shorter than the number of jobs
++// we can Q, we have to wait for it to finish, otherwise we risk cache/QPU
++// disasters
++static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
++{
++    return
++        s->ps.pps->tile_wpp_inter_disable >= 2 &&
++        s->sh.slice_type != HEVC_SLICE_I &&
++        n >= 0 &&
++        (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT;
++}
++
++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++{
++    HEVCRpiContext * const s = avctxt->priv_data;
++    HEVCRpiLocalContext * const lc = s->HEVClc;
++    int err;
++
++    // Start of slice
++    if ((err = slice_start(s, lc)) != 0)
++        return err;
++
++#if RPI_EXTRA_BIT_THREADS > 0
++
++    if (s->sh.offload_tiles)
++    {
++        unsigned int slice_row = 0;
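++        // Each pass of the loop below decodes one row of tiles: the main
++        // thread takes the first tile in the row and the extra bit threads,
++        // poked via the sem ring, pick up the rest as assigned by
++        // tile_one_row_setup_lcs().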
++
++#if TRACE_WPP
++        printf("%s: Do Tiles\n", __func__);
++#endif
++        // Generate & start extra bit threads if they aren't already running
++        bit_threads_start(s);
++
++        do
++        {
++            // Reset lc lines etc.
++            tile_one_row_setup_lcs(s, slice_row);
++
++#if TRACE_WPP
++            printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
++                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++            rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
++#if TRACE_WPP
++            printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
++                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++            while (lc->bt_line_no <= lc->bt_last_line) {
++                rpi_sem_wait(&lc->bt_sem_in);
++                rpi_run_one_line(s, lc, 0);
++            }
++#if TRACE_WPP
++            printf("%s: Done body\n", __func__);
++#endif
++
++            // Wait for everything else to finish
++            rpi_sem_wait(&lc->bt_sem_in);
++
++            ++slice_row;
++        } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
++
++
++#if TRACE_WPP
++        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++    else if (s->sh.offload_wpp)
++    {
++#if TRACE_WPP
++        printf("%s: Do WPP\n", __func__);
++#endif
++        // Generate & start extra bit threads if they aren't already running
++        bit_threads_start(s);
++
++        // Reset lc lines etc.
++        wpp_setup_lcs(s);
++
++        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
++#if TRACE_WPP
++        printf("%s: Done 1st\n", __func__);
++#endif
++
++        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
++            rpi_sem_wait(&lc->bt_sem_in);
++            rpi_run_one_line(s, lc, 0);
++        }
++#if TRACE_WPP
++        printf("%s: Done body\n", __func__);
++#endif
++
++        // Wait for everything else to finish
++        rpi_sem_wait(&lc->bt_sem_in);
++
++#if TRACE_WPP
++        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++    else
++#endif
++    {
++#if TRACE_WPP
++        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
++#endif
++        // Single bit thread
++        do {
++            // Make sure we have space to prepare the next job
++            worker_pass0_ready(s, lc);
++
++            if ((err = fill_job(s, lc, 0)) < 0)
++                goto fail;
++
++            worker_submit_job(s, lc);
++
++            if (tile_needs_wait(s, lc->ts - 1))
++                worker_wait(s, lc);
++
++        } while (!lc->unit_done);
++
++#if TRACE_WPP
++        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++
++    // If we have reached the end of the frame then wait for the worker
++    // to finish all its jobs
++    if (lc->ts >= s->ps.sps->ctb_size)
++        worker_wait(s, lc);
++
++#if RPI_TSTATS
++    {
++        HEVCRpiStats *const ts = &s->tstats;
++
++        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
++               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
++               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
++               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
++               ts->y_pred2_hgt16, ts->y_pred2_hle16);
++        memset(ts, 0, sizeof(*ts));
++    }
++#endif
++
++    return lc->ts;
++
++fail:
++    // Cleanup
++    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
++    // Free our job & wait for termination
++    worker_free(s, lc);
++    worker_wait(s, lc);
++    return err;
++}
++
++
++static void set_no_backward_pred(HEVCRpiContext * const s)
++{
++    int i, j;
++    const RefPicList *const refPicList = s->refPicList;
++
++    s->no_backward_pred_flag = 0;
++    if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
++        return;
++
++    for (j = 0; j < 2; j++) {
++        for (i = 0; i < refPicList[j].nb_refs;
i++) { ++ if (refPicList[j].list[i] > s->poc) { ++ s->no_backward_pred_flag = 1; ++ return; ++ } ++ } ++ } ++} ++ ++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal) ++{ ++ int err; ++ if ((err = gen_entry_points(s, nal)) < 0) ++ return err; ++ ++ set_no_backward_pred(s); ++ ++ return rpi_decode_entry(s->avctx, NULL); ++} ++ ++static int set_side_data(HEVCRpiContext *s) ++{ ++ AVFrame *out = s->ref->frame; ++ ++ if (s->sei.frame_packing.present && ++ s->sei.frame_packing.arrangement_type >= 3 && ++ s->sei.frame_packing.arrangement_type <= 5 && ++ s->sei.frame_packing.content_interpretation_type > 0 && ++ s->sei.frame_packing.content_interpretation_type < 3) { ++ AVStereo3D *stereo = av_stereo3d_create_side_data(out); ++ if (!stereo) ++ return AVERROR(ENOMEM); ++ ++ switch (s->sei.frame_packing.arrangement_type) { ++ case 3: ++ if (s->sei.frame_packing.quincunx_subsampling) ++ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX; ++ else ++ stereo->type = AV_STEREO3D_SIDEBYSIDE; ++ break; ++ case 4: ++ stereo->type = AV_STEREO3D_TOPBOTTOM; ++ break; ++ case 5: ++ stereo->type = AV_STEREO3D_FRAMESEQUENCE; ++ break; ++ } ++ ++ if (s->sei.frame_packing.content_interpretation_type == 2) ++ stereo->flags = AV_STEREO3D_FLAG_INVERT; ++ ++ if (s->sei.frame_packing.arrangement_type == 5) { ++ if (s->sei.frame_packing.current_frame_is_frame0_flag) ++ stereo->view = AV_STEREO3D_VIEW_LEFT; ++ else ++ stereo->view = AV_STEREO3D_VIEW_RIGHT; ++ } ++ } ++ ++ if (s->sei.display_orientation.present && ++ (s->sei.display_orientation.anticlockwise_rotation || ++ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) { ++ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16); ++ AVFrameSideData *rotation = av_frame_new_side_data(out, ++ AV_FRAME_DATA_DISPLAYMATRIX, ++ sizeof(int32_t) * 9); ++ if (!rotation) ++ return AVERROR(ENOMEM); ++ ++ av_display_rotation_set((int32_t *)rotation->data, angle); ++ av_display_matrix_flip((int32_t *)rotation->data, ++ s->sei.display_orientation.hflip, ++ s->sei.display_orientation.vflip); ++ } ++ ++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 ++ // so the side data persists for the entire coded video sequence. 
++ if (s->sei.mastering_display.present > 0 && ++ IS_IRAP(s) && s->no_rasl_output_flag) { ++ s->sei.mastering_display.present--; ++ } ++ if (s->sei.mastering_display.present) { ++ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b ++ const int mapping[3] = {2, 0, 1}; ++ const int chroma_den = 50000; ++ const int luma_den = 10000; ++ int i; ++ AVMasteringDisplayMetadata *metadata = ++ av_mastering_display_metadata_create_side_data(out); ++ if (!metadata) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < 3; i++) { ++ const int j = mapping[i]; ++ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0]; ++ metadata->display_primaries[i][0].den = chroma_den; ++ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1]; ++ metadata->display_primaries[i][1].den = chroma_den; ++ } ++ metadata->white_point[0].num = s->sei.mastering_display.white_point[0]; ++ metadata->white_point[0].den = chroma_den; ++ metadata->white_point[1].num = s->sei.mastering_display.white_point[1]; ++ metadata->white_point[1].den = chroma_den; ++ ++ metadata->max_luminance.num = s->sei.mastering_display.max_luminance; ++ metadata->max_luminance.den = luma_den; ++ metadata->min_luminance.num = s->sei.mastering_display.min_luminance; ++ metadata->min_luminance.den = luma_den; ++ metadata->has_luminance = 1; ++ metadata->has_primaries = 1; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n"); ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n", ++ av_q2d(metadata->display_primaries[0][0]), ++ av_q2d(metadata->display_primaries[0][1]), ++ av_q2d(metadata->display_primaries[1][0]), ++ av_q2d(metadata->display_primaries[1][1]), ++ av_q2d(metadata->display_primaries[2][0]), ++ av_q2d(metadata->display_primaries[2][1]), ++ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1])); ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "min_luminance=%f, max_luminance=%f\n", ++ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance)); ++ } ++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 ++ // so the side data persists for the entire coded video sequence. 
++ if (s->sei.content_light.present > 0 && ++ IS_IRAP(s) && s->no_rasl_output_flag) { ++ s->sei.content_light.present--; ++ } ++ if (s->sei.content_light.present) { ++ AVContentLightMetadata *metadata = ++ av_content_light_metadata_create_side_data(out); ++ if (!metadata) ++ return AVERROR(ENOMEM); ++ metadata->MaxCLL = s->sei.content_light.max_content_light_level; ++ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n"); ++ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n", ++ metadata->MaxCLL, metadata->MaxFALL); ++ } ++ ++ if (s->sei.a53_caption.a53_caption) { ++ AVFrameSideData* sd = av_frame_new_side_data(out, ++ AV_FRAME_DATA_A53_CC, ++ s->sei.a53_caption.a53_caption_size); ++ if (sd) ++ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size); ++ av_freep(&s->sei.a53_caption.a53_caption); ++ s->sei.a53_caption.a53_caption_size = 0; ++ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS; ++ } ++ ++ if (s->sei.alternative_transfer.present && ++ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) && ++ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) { ++ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics; ++ } ++ ++ return 0; ++} ++ ++static int hevc_frame_start(HEVCRpiContext * const s) ++{ ++ int ret; ++ ++ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too ++ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); ++ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address)); ++ ++ // Only need to remember intra for CIP ++ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap) ++ s->is_intra = NULL; ++ else ++ { ++ s->is_intra = s->is_intra_store; ++ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); ++ } ++ ++ s->is_decoded = 0; ++ s->first_nal_type = s->nal_unit_type; ++ ++ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); ++ ++ if (s->pkt.nb_nals > s->rpl_tab_size) ++ { ++ // In most cases it will be faster to free & realloc as that doesn't ++ // require (an unwanted) copy ++ av_freep(&s->rpl_tab); ++ s->rpl_tab_size = 0; ++ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) ++ goto fail; ++ s->rpl_tab_size = s->pkt.nb_nals; ++ } ++ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab)); ++ ++ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc); ++ if (ret < 0) ++ goto fail; ++ ++ // Resize rpl_tab to max that we might want ++ ret = ff_hevc_rpi_frame_rps(s); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n"); ++ goto fail; ++ } ++ ++ s->ref->frame->key_frame = IS_IRAP(s); ++ ++ ret = set_side_data(s); ++ if (ret < 0) ++ goto fail; ++ ++ s->frame->pict_type = 3 - s->sh.slice_type; ++ ++ if (!IS_IRAP(s)) ++ ff_hevc_rpi_bump_frame(s); ++ ++ av_frame_unref(s->output_frame); ++ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0); ++ if (ret < 0) ++ goto fail; ++ ++ ff_thread_finish_setup(s->avctx); ++ ++ return 0; ++ ++fail: ++ if (s->ref) ++ ff_hevc_rpi_unref_frame(s, s->ref, ~0); ++ s->ref = NULL; ++ return ret; ++} ++ ++static inline int is_non_ref_unit_type(const unsigned int nal_unit_type) ++{ ++ // From Table 7-1 ++ return (nal_unit_type & ~0xe) == 0; // True for 0, 2, 4, 6, 8, 10, 12, 14 ++} ++ ++static int 
decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal) ++{ ++ GetBitContext * const gb = &s->HEVClc->gb; ++ int ctb_addr_ts, ret; ++ ++ *gb = nal->gb; ++ s->nal_unit_type = nal->type; ++ s->temporal_id = nal->temporal_id; ++ ++ switch (s->nal_unit_type) { ++ case HEVC_NAL_VPS: ++ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_SPS: ++ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps, ++ s->apply_defdispwin); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_PPS: ++ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_SEI_PREFIX: ++ case HEVC_NAL_SEI_SUFFIX: ++ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_TRAIL_R: ++ case HEVC_NAL_TRAIL_N: ++ case HEVC_NAL_TSA_N: ++ case HEVC_NAL_TSA_R: ++ case HEVC_NAL_STSA_N: ++ case HEVC_NAL_STSA_R: ++ case HEVC_NAL_BLA_W_LP: ++ case HEVC_NAL_BLA_W_RADL: ++ case HEVC_NAL_BLA_N_LP: ++ case HEVC_NAL_IDR_W_RADL: ++ case HEVC_NAL_IDR_N_LP: ++ case HEVC_NAL_CRA_NUT: ++ case HEVC_NAL_RADL_N: ++ case HEVC_NAL_RADL_R: ++ case HEVC_NAL_RASL_N: ++ case HEVC_NAL_RASL_R: ++ ret = hls_slice_header(s); ++ if (ret < 0) ++ return ret; ++ ++ // The definition of _N unit types is "non-reference for other frames ++ // with the same temporal_id" so they may/will be ref frames for pics ++ // with a higher temporal_id. ++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || ++ !is_non_ref_unit_type(s->nal_unit_type); ++ s->offload_recon = s->threads_type != 0 && s->used_for_ref; ++ s->is_irap = IS_IRAP(s); ++ ++#if DEBUG_DECODE_N ++ { ++ static int z = 0; ++ if (IS_IDR(s)) { ++ z = 1; ++ } ++ if (z != 0 && z++ > DEBUG_DECODE_N) { ++ s->is_decoded = 0; ++ break; ++ } ++ } ++#endif ++ if ( ++ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || ++ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) || ++ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || ++ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s))) ++ { ++ s->is_decoded = 0; ++ break; ++ } ++ ++ if (s->sh.first_slice_in_pic_flag) { ++ if (s->max_ra == INT_MAX) { ++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { ++ s->max_ra = s->poc; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++ } ++ } ++ ++ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) && ++ s->poc <= s->max_ra) { ++ s->is_decoded = 0; ++ break; ++ } else { ++ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra) ++ s->max_ra = INT_MIN; ++ } ++ ++ ret = hevc_frame_start(s); ++ if (ret < 0) ++ return ret; ++ } else if (!s->ref) { ++ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n"); ++ goto fail; ++ } ++ ++ if (s->nal_unit_type != s->first_nal_type) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Non-matching NAL types of the VCL NALUs: %d %d\n", ++ s->first_nal_type, s->nal_unit_type); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (!s->sh.dependent_slice_segment_flag && ++ s->sh.slice_type != HEVC_SLICE_I) { ++ ret = ff_hevc_rpi_slice_rpl(s); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Error constructing the reference lists for the current slice.\n"); ++ goto fail; ++ } ++ } ++ ++ ctb_addr_ts = hls_slice_data(s, nal); ++ if (ctb_addr_ts >= s->ps.sps->ctb_size) { ++ s->is_decoded = 1; ++ } ++ ++ if (ctb_addr_ts < 0) { ++ ret = ctb_addr_ts; ++ goto fail; ++ } ++ 
break; ++ case HEVC_NAL_EOS_NUT: ++ case HEVC_NAL_EOB_NUT: ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ break; ++ case HEVC_NAL_AUD: ++ case HEVC_NAL_FD_NUT: ++ break; ++ default: ++ av_log(s->avctx, AV_LOG_INFO, ++ "Skipping NAL unit %d\n", s->nal_unit_type); ++ } ++ ++ return 0; ++fail: ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return ret; ++ return 0; ++} ++ ++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length) ++{ ++ int i, ret = 0; ++ int eos_at_start = 1; ++ ++ s->ref = NULL; ++ s->last_eos = s->eos; ++ s->eos = 0; ++ ++ /* split the input packet into NAL units, so we know the upper bound on the ++ * number of slices in the frame */ ++ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, ++ s->nal_length_size, s->avctx->codec_id, 0, 0); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Error splitting the input into NAL units.\n"); ++ return ret; ++ } ++ ++ for (i = 0; i < s->pkt.nb_nals; i++) { ++ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT || ++ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) { ++ if (eos_at_start) { ++ s->last_eos = 1; ++ } else { ++ s->eos = 1; ++ } ++ } else { ++ eos_at_start = 0; ++ } ++ } ++ ++ /* decode the NAL units */ ++ for (i = 0; i < s->pkt.nb_nals; i++) { ++ ret = decode_nal_unit(s, &s->pkt.nals[i]); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Error parsing NAL unit #%d.\n", i); ++ goto fail; ++ } ++ } ++ ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type != 0) { ++ ff_hevc_rpi_progress_signal_all_done(s); ++ } ++ else { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++ } ++ return ret; ++} ++ ++static void print_md5(void *log_ctx, int level, uint8_t md5[16]) ++{ ++ int i; ++ for (i = 0; i < 16; i++) ++ av_log(log_ctx, level, "%02"PRIx8, md5[i]); ++} ++ ++static int verify_md5(HEVCRpiContext *s, AVFrame *frame) ++{ ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); ++ int pixel_shift; ++ int i, j; ++ ++ if (!desc) ++ return AVERROR(EINVAL); ++ ++ pixel_shift = desc->comp[0].depth > 8; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ", ++ s->poc); ++ ++ /* the checksums are LE, so we have to byteswap for >8bpp formats ++ * on BE arches */ ++#if HAVE_BIGENDIAN ++ if (pixel_shift && !s->checksum_buf) { ++ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size, ++ FFMAX3(frame->linesize[0], frame->linesize[1], ++ frame->linesize[2])); ++ if (!s->checksum_buf) ++ return AVERROR(ENOMEM); ++ } ++#endif ++ ++ for (i = 0; frame->data[i]; i++) { ++ int width = s->avctx->coded_width; ++ int height = s->avctx->coded_height; ++ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width; ++ int h = (i == 1 || i == 2) ? 
(height >> desc->log2_chroma_h) : height; ++ uint8_t md5[16]; ++ ++ av_md5_init(s->md5_ctx); ++ for (j = 0; j < h; j++) { ++ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); ++#if HAVE_BIGENDIAN ++ if (pixel_shift) { ++ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf, ++ (const uint16_t *) src, w); ++ src = s->checksum_buf; ++ } ++#endif ++ av_md5_update(s->md5_ctx, src, w << pixel_shift); ++ } ++ av_md5_final(s->md5_ctx, md5); ++ ++ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { ++ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); ++ print_md5(s->avctx, AV_LOG_DEBUG, md5); ++ av_log (s->avctx, AV_LOG_DEBUG, "; "); ++ } else { ++ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i); ++ print_md5(s->avctx, AV_LOG_ERROR, md5); ++ av_log (s->avctx, AV_LOG_ERROR, " != "); ++ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]); ++ av_log (s->avctx, AV_LOG_ERROR, "\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "\n"); ++ ++ return 0; ++} ++ ++static int all_sps_supported(const HEVCRpiContext * const s) ++{ ++ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ if (s->ps.sps_list[i] != NULL) ++ { ++ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; ++ if (!is_sps_supported(sps)) ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first) ++{ ++ int ret, i; ++ ++ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff, ++ &s->nal_length_size, s->avctx->err_recognition, ++ s->apply_defdispwin, s->avctx); ++ if (ret < 0) ++ return ret; ++ ++ /* export stream parameters from the first SPS */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ if (first && s->ps.sps_list[i]) { ++ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; ++ export_stream_params(s->avctx, &s->ps, sps); ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output, ++ AVPacket *avpkt) ++{ ++ int ret; ++ int new_extradata_size; ++ uint8_t *new_extradata; ++ HEVCRpiContext *s = avctx->priv_data; ++ ++ if (!avpkt->size) { ++ ret = ff_hevc_rpi_output_frame(s, data, 1); ++ if (ret < 0) ++ return ret; ++ ++ *got_output = ret; ++ return 0; ++ } ++ ++ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, ++ &new_extradata_size); ++ if (new_extradata && new_extradata_size > 0) { ++ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0); ++ if (ret < 0) ++ return ret; ++ } ++ ++ s->ref = NULL; ++ ret = decode_nal_units(s, avpkt->data, avpkt->size); ++ if (ret < 0) ++ return ret; ++ ++ /* verify the SEI checksum */ ++ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded && ++ s->sei.picture_hash.is_md5) { ++ ret = verify_md5(s, s->ref->frame); ++ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) { ++ ff_hevc_rpi_unref_frame(s, s->ref, ~0); ++ return ret; ++ } ++ } ++ s->sei.picture_hash.is_md5 = 0; ++ ++ if (s->is_decoded) { ++ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc); ++ s->is_decoded = 0; ++ } ++ ++ if (s->output_frame->buf[0]) { ++ av_frame_move_ref(data, s->output_frame); ++ *got_output = 1; ++ } ++ ++ return avpkt->size; ++} ++ ++static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src) ++{ ++ int ret; ++ ++ ret = ff_thread_ref_frame(&dst->tf, &src->tf); ++ if (ret < 0) ++ return ret; ++ ++ if 
(src->col_mvf_buf != NULL)
++    {
++        dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf);
++        if (!dst->col_mvf_buf)
++            goto fail;
++    }
++    dst->col_mvf = src->col_mvf;
++
++    dst->poc = src->poc;
++    dst->flags = src->flags;
++    dst->sequence = src->sequence;
++    return 0;
++
++fail:
++    ff_hevc_rpi_unref_frame(s, dst, ~0);
++    return AVERROR(ENOMEM);
++}
++
++
++static av_cold int hevc_decode_free(AVCodecContext *avctx)
++{
++    HEVCRpiContext * const s = avctx->priv_data;
++    int i;
++
++    pic_arrays_free(s);
++
++    av_freep(&s->md5_ctx);
++
++    av_freep(&s->cabac_save);
++
++#if RPI_EXTRA_BIT_THREADS
++    bit_threads_kill(s);
++#endif
++
++    hevc_exit_worker(s);
++    for (i = 0; i != 2; ++i) {
++        ff_hevc_rpi_progress_kill_state(s->progress_states + i);
++    }
++    job_lc_kill(s->HEVClc);
++
++    av_freep(&s->sao_pixel_buffer_h[0]);  // [1] & [2] allocated with [0]
++    av_freep(&s->sao_pixel_buffer_v[0]);
++    av_frame_free(&s->output_frame);
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++        av_frame_free(&s->DPB[i].frame);
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
++        av_buffer_unref(&s->ps.vps_list[i]);
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
++        av_buffer_unref(&s->ps.sps_list[i]);
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
++        av_buffer_unref(&s->ps.pps_list[i]);
++    s->ps.sps = NULL;
++    s->ps.pps = NULL;
++    s->ps.vps = NULL;
++
++    // Free separately from sLists as used that way by RPI WPP
++    for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
++        av_freep(s->HEVClcList + i);
++    }
++    s->HEVClc = NULL;  // Allocated as part of HEVClcList
++
++    ff_h2645_packet_uninit(&s->pkt);
++
++    if (s->qpu_init_ok)
++        vpu_qpu_term();
++    s->qpu_init_ok = 0;
++
++    return 0;
++}
++
++
++static av_cold int hevc_init_context(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    int i;
++
++    s->avctx = avctx;
++
++    s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
++    if (!s->HEVClc)
++        goto fail;
++    s->HEVClcList[0] = s->HEVClc;
++
++    // Whilst FFmpeg's init fn is only called once, the close fn is called
++    // as many times as we have threads (init_thread_copy is called for the
++    // threads). So to match init & term, put the init here where it will
++    // be called by both init & copy.
++
++    if (vpu_qpu_init() != 0)
++        goto fail;
++    s->qpu_init_ok = 1;
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++    {
++        static const uint32_t dframe[1] = {0x80808080};
++        s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
++    }
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++    s->qpu_dummy_frame_qpu = qpu_dummy();
++#endif
++
++    bt_lc_init(s, s->HEVClc, 0);
++    job_lc_init(s->HEVClc);
++
++    for (i = 0; i != 2; ++i) {
++        ff_hevc_rpi_progress_init_state(s->progress_states + i);
++    }
++
++    if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL)
++        goto fail;
++
++    if ((s->output_frame = av_frame_alloc()) == NULL)
++        goto fail;
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        s->DPB[i].frame = av_frame_alloc();
++        if (!s->DPB[i].frame)
++            goto fail;
++        s->DPB[i].tf.f = s->DPB[i].frame;
++        s->DPB[i].dpb_no = i;
++    }
++
++    s->max_ra = INT_MAX;
++
++    if ((s->md5_ctx = av_md5_alloc()) == NULL)
++        goto fail;
++
++    s->context_initialized = 1;
++    s->eos = 0;
++
++    ff_hevc_rpi_reset_sei(&s->sei);
++
++    return 0;
++
++fail:
++    av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
++    hevc_decode_free(avctx);
++    return AVERROR(ENOMEM);
++}
++
++#if HAVE_THREADS
++static int hevc_update_thread_context(AVCodecContext *dst,
++                                      const AVCodecContext *src)
++{
++    HEVCRpiContext *s = dst->priv_data;
++    HEVCRpiContext *s0 = src->priv_data;
++    int i, ret;
++
++    if (!s->context_initialized) {
++        ret = hevc_init_context(dst);
++        if (ret < 0)
++            return ret;
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++        if (s0->DPB[i].frame->buf[0]) {
++            ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
++            if (ret < 0)
++                return ret;
++        }
++    }
++
++    if (s->ps.sps != s0->ps.sps)
++        s->ps.sps = NULL;
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
++        av_buffer_unref(&s->ps.vps_list[i]);
++        if (s0->ps.vps_list[i]) {
++            s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
++            if (!s->ps.vps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++        av_buffer_unref(&s->ps.sps_list[i]);
++        if (s0->ps.sps_list[i]) {
++            s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
++            if (!s->ps.sps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
++        av_buffer_unref(&s->ps.pps_list[i]);
++        if (s0->ps.pps_list[i]) {
++            s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
++            if (!s->ps.pps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    if (s->ps.sps != s0->ps.sps)
++        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
++            return ret;
++
++    s->seq_decode = s0->seq_decode;
++    s->seq_output = s0->seq_output;
++    s->pocTid0 = s0->pocTid0;
++    s->max_ra = s0->max_ra;
++    s->eos = s0->eos;
++    s->no_rasl_output_flag = s0->no_rasl_output_flag;
++
++    s->is_nalff = s0->is_nalff;
++    s->nal_length_size = s0->nal_length_size;
++
++    s->threads_type = s0->threads_type;
++
++    if (s0->eos) {
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++    }
++
++    s->sei.frame_packing = s0->sei.frame_packing;
++    s->sei.display_orientation = s0->sei.display_orientation;
++    s->sei.mastering_display = s0->sei.mastering_display;
++    s->sei.content_light = s0->sei.content_light;
++    s->sei.alternative_transfer = s0->sei.alternative_transfer;
++
++    // * We do this here as it allows us to easily locate our parent's
++    //   global job pool, but there really should be a less nasty way
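++    //   (rpi_job_ctl_new refs the shared HEVCRpiJobGlobal, so the pool
++    //   outlives whichever context created it)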
++    if (s->jbc == NULL)
++    {
++        av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
++        hevc_init_worker(s);
++    }
++
++    return 0;
++}
++#endif
++
++#include <sys/stat.h>
++
++static int qpu_ok(void)
++{
++    static int is_pi3 = -1;
++    if (is_pi3 == -1)
++    {
++        struct stat sb;
++        is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0);
++    }
++    return is_pi3;
++}
++
++static av_cold int hevc_decode_init(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    int ret;
++
++    if (!qpu_ok())
++        return -1;
++
++    if ((ret = hevc_init_context(avctx)) < 0)
++        return ret;
++
++    // Job allocation requires VCSM alloc to work so ensure that we have it
++    // initialised by this point
++    {
++        HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
++        if (jbg == NULL)
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
++            return -1;
++        }
++
++        if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL)
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
++            return -1;
++        }
++    }
++
++    hevc_init_worker(s);
++
++    s->sei.picture_timing.picture_struct = 0;
++    s->eos = 1;
++
++    atomic_init(&s->wpp_err, 0);
++
++    if (avctx->extradata_size > 0 && avctx->extradata) {
++        ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
++
++        if (ret == 0 && !all_sps_supported(s))
++            ret = AVERROR_DECODER_NOT_FOUND;
++
++        if (ret < 0)
++        {
++            hevc_decode_free(avctx);
++            return ret;
++        }
++    }
++
++    if ((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
++        s->threads_type = FF_THREAD_FRAME;
++    else
++        s->threads_type = 0;
++
++    return 0;
++}
++
++static void hevc_decode_flush(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    ff_hevc_rpi_flush_dpb(s);
++    s->max_ra = INT_MAX;
++    s->eos = 1;
++}
++
++typedef struct hwaccel_rpi3_qpu_env_s {
++    const AVClass *av_class;
++    AVZcEnvPtr zc;
++} hwaccel_rpi3_qpu_env_t;
++
++static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame)
++{
++    hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data;
++    int rv;
++
++    if (av_rpi_zc_in_use(s))
++    {
++        rv = s->get_buffer2(s, frame, 0);
++    }
++    else
++    {
++        rv = av_rpi_zc_get_buffer(r3->zc, frame);
++        if (rv == 0)
++            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);  // actually do the alloc
++    }
++
++    if (rv == 0 &&
++        (rv = ff_attach_decode_data(frame)) < 0)
++    {
++        av_frame_unref(frame);
++    }
++
++    return rv;
++}
++
++static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx)
++{
++    hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
++    av_rpi_zc_int_env_freep(&r3->zc);
++    return 0;
++}
++
++static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx)
++{
++    hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data;
++
++    if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL)
++        goto fail;
++
++    return 0;
++
++fail:
++    av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n");
++    hwaccel_rpi3_qpu_free(avctx);
++    return AVERROR(ENOMEM);
++}
++
++
++#define OFFSET(x) offsetof(HEVCRpiContext, x)
++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
++
++
++static const AVOption options[] = {
++    { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
++        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++    { "strict-displaywin", "strictly apply default display window size", OFFSET(apply_defdispwin),
++        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++    { NULL },
++};
++
++static const AVClass hevc_rpi_decoder_class = {
++    .class_name = "HEVC RPI 
decoder", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++}; ++ ++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = { ++ AV_PIX_FMT_SAND128, ++ AV_PIX_FMT_SAND64_10, ++ AV_PIX_FMT_NONE ++}; ++ ++ ++static const AVHWAccel hwaccel_rpi3_qpu = { ++ .name = "Pi3 QPU Hwaccel", ++ .alloc_frame = hwaccel_alloc_frame, ++ .init = hwaccel_rpi3_qpu_init, ++ .uninit = hwaccel_rpi3_qpu_free, ++ .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ ++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 = ++{ ++ .public = { ++ .pix_fmt = AV_PIX_FMT_SAND128, ++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, ++ .device_type = AV_HWDEVICE_TYPE_NONE, ++ }, ++ .hwaccel = &hwaccel_rpi3_qpu ++}; ++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 = ++{ ++ .public = { ++ .pix_fmt = AV_PIX_FMT_SAND64_10, ++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, ++ .device_type = AV_HWDEVICE_TYPE_NONE, ++ }, ++ .hwaccel = &hwaccel_rpi3_qpu ++}; ++ ++ ++static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = { ++ &hevc_rpi_hw_config_sand128, ++ &hevc_rpi_hw_config_sand64_10, ++ NULL ++}; ++ ++ ++AVCodec ff_hevc_rpi_decoder = { ++ .name = "hevc_rpi", ++ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"), ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .priv_data_size = sizeof(HEVCRpiContext), ++ .priv_class = &hevc_rpi_decoder_class, ++ .init = hevc_decode_init, ++ .close = hevc_decode_free, ++ .decode = hevc_rpi_decode_frame, ++ .flush = hevc_decode_flush, ++ .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context), ++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | ++ AV_CODEC_CAP_HARDWARE | ++ AV_CODEC_CAP_AVOID_PROBING | ++#if 0 ++ // Debugging is often easier without threads getting in the way ++ 0, ++#warning H265 threading turned off ++#else ++ // We only have decent optimisation for frame - so only admit to that ++ AV_CODEC_CAP_FRAME_THREADS, ++#endif ++ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | ++ FF_CODEC_CAP_EXPORTS_CROPPING | ++ FF_CODEC_CAP_ALLOCATE_PROGRESS, ++ .pix_fmts = hevc_rpi_pix_fmts, ++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), ++ .hw_configs = hevc_rpi_hw_configs, ++// .wrapper_name = "hevc_rpi", ++}; ++ +diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h +new file mode 100644 +index 0000000000..5001a3853b +--- /dev/null ++++ b/libavcodec/rpi_hevcdec.h +@@ -0,0 +1,1093 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDEC_H
++#define AVCODEC_RPI_HEVCDEC_H
++
++#include "config.h"
++
++#include <stdatomic.h>
++
++#include "libavutil/buffer.h"
++
++#include "avcodec.h"
++#include "bswapdsp.h"
++#include "cabac.h"
++#include "get_bits.h"
++#include "rpi_hevcpred.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_mv.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++#include "rpi_hevcdsp.h"
++#include "internal.h"
++#include "thread.h"
++#include "videodsp.h"
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_misc_neon.h"
++#endif
++
++#define MAX_NB_THREADS 16
++#define SHIFT_CTB_WPP 2
++
++//TODO: check if this is really the maximum
++#define MAX_TRANSFORM_DEPTH 5
++
++#define MAX_TB_SIZE 32
++#define MAX_QP 51
++#define DEFAULT_INTRA_TC_OFFSET 2
++
++#define HEVC_CONTEXTS 199
++
++#define MRG_MAX_NUM_CANDS 5
++
++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE)  // 64
++
++// Size of DPB array
++#define HEVC_DPB_ELS 32
++
++#define L0 0
++#define L1 1
++
++#define EPEL_EXTRA_BEFORE 1
++#define EPEL_EXTRA_AFTER  2
++#define EPEL_EXTRA        3
++#define QPEL_EXTRA_BEFORE 3
++#define QPEL_EXTRA_AFTER  4
++#define QPEL_EXTRA        7
++
++#define EDGE_EMU_BUFFER_STRIDE 80
++
++#include <semaphore.h>
++#include "rpi_qpu.h"
++
++// Max jobs per frame thread. Actual usage will be limited by the size
++// of the global job pool
++// ?? Limits
++#define RPI_MAX_JOBS 8
++
++// This is the number of _extra_ bit threads - we will have
++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
++//
++// 0 is legitimate and will disable our WPP processing
++//#define RPI_EXTRA_BIT_THREADS 0
++#define RPI_EXTRA_BIT_THREADS 2
++
++// Number of separate threads/passes in worker
++// 2 and 3 are the currently valid numbers
++// At the moment 3 seems fractionally faster
++//#define RPI_PASSES 2
++#define RPI_PASSES 3
++
++// Print out various usage stats
++#define RPI_TSTATS 0
++
++// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
++#define RPI_COMPRESS_COEFFS 1
++
++// Wait for VPU/QPU to finish in worker pass 0
++// If 0 then the wait is in pass 1
++//
++// One might expect the better place to wait would be in pass 1 however
++// testing shows that pass 0 produces overall faster decode.
++// Interestingly it is QPU/VPU limited streams that seem to suffer
++// from pass 1 waits, CPU limited ones tend to show a very mild gain.
++// This define exists so it is easy to test this.
++#define RPI_WORKER_WAIT_PASS_0 1
++
++// Use ARM emulation of QPU pred
++// These are for debug only as the emulation makes only limited
++// effort to be fast
++#define RPI_QPU_EMU_Y 0
++#define RPI_QPU_EMU_C 0
++
++// Max width & height we are prepared to consider
++// Sand frame shape calc becomes confused with large frames
++// Some buffer alloc also depends on this
++#define HEVC_RPI_MAX_WIDTH 2048
++#define HEVC_RPI_MAX_HEIGHT 1088
++
++
++// Min CTB size is 16
++#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16)
++
++/**
++ * Value of the luma sample at position (x, y) in the 2D array tab.
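++ * (the array stride is the full picture width, sps->width samples)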
++ */ ++#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)]) ++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)]) ++ ++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP) ++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \ ++ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP) ++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23) ++ ++enum RPSType { ++ ST_CURR_BEF = 0, ++ ST_CURR_AFT, ++ ST_FOLL, ++ LT_CURR, ++ LT_FOLL, ++ NB_RPS_TYPE, ++}; ++ ++enum SyntaxElement { ++ SAO_MERGE_FLAG = 0, ++ SAO_TYPE_IDX, ++ SAO_EO_CLASS, ++ SAO_BAND_POSITION, ++ SAO_OFFSET_ABS, ++ SAO_OFFSET_SIGN, ++ END_OF_SLICE_FLAG, ++ SPLIT_CODING_UNIT_FLAG, ++ CU_TRANSQUANT_BYPASS_FLAG, ++ SKIP_FLAG, ++ CU_QP_DELTA, ++ PRED_MODE_FLAG, ++ PART_MODE, ++ PCM_FLAG, ++ PREV_INTRA_LUMA_PRED_FLAG, ++ MPM_IDX, ++ REM_INTRA_LUMA_PRED_MODE, ++ INTRA_CHROMA_PRED_MODE, ++ MERGE_FLAG, ++ MERGE_IDX, ++ INTER_PRED_IDC, ++ REF_IDX_L0, ++ REF_IDX_L1, ++ ABS_MVD_GREATER0_FLAG, ++ ABS_MVD_GREATER1_FLAG, ++ ABS_MVD_MINUS2, ++ MVD_SIGN_FLAG, ++ MVP_LX_FLAG, ++ NO_RESIDUAL_DATA_FLAG, ++ SPLIT_TRANSFORM_FLAG, ++ CBF_LUMA, ++ CBF_CB_CR, ++ TRANSFORM_SKIP_FLAG, ++ EXPLICIT_RDPCM_FLAG, ++ EXPLICIT_RDPCM_DIR_FLAG, ++ LAST_SIGNIFICANT_COEFF_X_PREFIX, ++ LAST_SIGNIFICANT_COEFF_Y_PREFIX, ++ LAST_SIGNIFICANT_COEFF_X_SUFFIX, ++ LAST_SIGNIFICANT_COEFF_Y_SUFFIX, ++ SIGNIFICANT_COEFF_GROUP_FLAG, ++ SIGNIFICANT_COEFF_FLAG, ++ COEFF_ABS_LEVEL_GREATER1_FLAG, ++ COEFF_ABS_LEVEL_GREATER2_FLAG, ++ COEFF_ABS_LEVEL_REMAINING, ++ COEFF_SIGN_FLAG, ++ LOG2_RES_SCALE_ABS, ++ RES_SCALE_SIGN_FLAG, ++ CU_CHROMA_QP_OFFSET_FLAG, ++ CU_CHROMA_QP_OFFSET_IDX, ++}; ++ ++enum PartMode { ++ PART_2Nx2N = 0, ++ PART_2NxN = 1, ++ PART_Nx2N = 2, ++ PART_NxN = 3, ++ PART_2NxnU = 4, ++ PART_2NxnD = 5, ++ PART_nLx2N = 6, ++ PART_nRx2N = 7, ++}; ++ ++enum PredMode { ++ MODE_INTER = 0, ++ MODE_INTRA, ++ MODE_SKIP, ++}; ++ ++enum InterPredIdc { ++ PRED_L0 = 0, ++ PRED_L1, ++ PRED_BI, ++}; ++ ++enum PredFlag { ++ PF_INTRA = 0, ++ PF_L0, ++ PF_L1, ++ PF_BI, ++}; ++ ++enum SAOType { ++ SAO_NOT_APPLIED = 0, ++ SAO_BAND, ++ SAO_EDGE, ++ SAO_APPLIED ++}; ++ ++enum SAOEOClass { ++ SAO_EO_HORIZ = 0, ++ SAO_EO_VERT, ++ SAO_EO_135D, ++ SAO_EO_45D, ++}; ++ ++enum ScanType { ++ SCAN_DIAG = 0, ++ SCAN_HORIZ, ++ SCAN_VERT, ++}; ++ ++typedef struct RefPicList { ++ struct HEVCRpiFrame *ref[HEVC_MAX_REFS]; ++ int list[HEVC_MAX_REFS]; ++ uint8_t isLongTerm[HEVC_MAX_REFS]; ++ int nb_refs; ++} RefPicList; ++ ++typedef struct RefPicListTab { ++ RefPicList refPicList[2]; ++} RefPicListTab; ++ ++typedef struct RpiCodingUnit { ++ unsigned int x; // Passed to deblock ++ unsigned int y; ++ unsigned int x_split; ++ unsigned int y_split; ++ ++ enum PredMode pred_mode; ///< PredMode ++ enum PartMode part_mode; ///< PartMode ++ ++ // Inferred parameters ++ uint8_t intra_split_flag; ///< IntraSplitFlag ++ uint8_t max_trafo_depth; ///< MaxTrafoDepth ++ uint8_t cu_transquant_bypass_flag; ++} RpiCodingUnit; ++ ++typedef struct RpiPredictionUnit { ++ uint8_t intra_pred_mode[4]; ++ uint8_t intra_pred_mode_c[4]; ++ uint8_t chroma_mode_c[4]; ++ uint8_t merge_flag; ++} RpiPredictionUnit; ++ ++typedef struct HEVCRpiTransformUnit { ++ int8_t cu_qp_delta; ++ ++ // Inferred parameters; ++ uint8_t intra_pred_mode; ++ uint8_t intra_pred_mode_c; ++ uint8_t chroma_mode_c; ++ uint8_t is_cu_qp_delta_wanted; ++ uint8_t cu_chroma_qp_offset_wanted; ++ const int8_t * qp_divmod6[3]; ++} 
HEVCRpiTransformUnit; ++ ++typedef struct DBParams { ++ int8_t beta_offset; // -12 to +12 ++ int8_t tc_offset; // -12 to +12 ++} DBParams; ++ ++#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) ++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1) ++#define HEVC_FRAME_FLAG_LONG_REF (1 << 2) ++#define HEVC_FRAME_FLAG_BUMPING (1 << 3) ++ ++struct HEVCRpiJob; ++ ++typedef struct HEVCRpiFrame { ++ AVFrame *frame; ++ ThreadFrame tf; ++ ColMvField *col_mvf; ++ int poc; ++ struct HEVCRpiFrame *collocated_ref; ++ ++ AVBufferRef *col_mvf_buf; ++ ++ /** ++ * A sequence counter, so that old frames are output first ++ * after a POC reset ++ */ ++ uint16_t sequence; ++ ++ /** ++ * A combination of HEVC_FRAME_FLAG_* ++ */ ++ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; ++} HEVCRpiFrame; ++ ++typedef struct HEVCRpiLocalContext { ++ HEVCRpiTransformUnit tu; ++ ++ CABACContext cc; ++ ++ // Vars that allow us to locate everything from just an lc ++ struct HEVCRpiContext * context; // ??? make const ??? ++ unsigned int lc_n; // lc list el no ++ ++ // Job wait links ++ struct HEVCRpiLocalContext * jw_next; ++ struct HEVCRpiLocalContext * jw_prev; ++ struct HEVCRpiLocalContext * ljw_next; ++ struct HEVCRpiLocalContext * ljw_prev; ++ struct HEVCRpiJob * volatile jw_job; ++ sem_t jw_sem; ++ ++ // ?? Wrap in structure ?? ++ sem_t bt_sem_in; ++ sem_t * bt_psem_out; ++ volatile int bt_terminate; ++ unsigned int ts; ++ unsigned int bt_last_line; // Last line in this bit_thread chunk ++ unsigned int bt_line_no; ++ unsigned int bt_line_width; ++ unsigned int bt_line_inc; ++ ++ struct HEVCRpiJob * jb0; ++ char unit_done; // Set once we have dealt with this slice ++ char bt_is_tile; ++ char last_progress_good; ++ char cabac_init_req; ++ ++ uint8_t cabac_state[HEVC_CONTEXTS]; ++ uint8_t stat_coeff[4]; ++ GetBitContext gb; ++ ++ uint8_t ct_depth; ++ int8_t qp_y; ++ int8_t curr_qp_y; ++ int8_t qPy_pred; ++ ++// N.B. 
Used by asm (neon) - do not change ++#define AVAIL_S_UR 0 ++#define AVAIL_S_U 1 ++#define AVAIL_S_UL 2 ++#define AVAIL_S_L 3 ++#define AVAIL_S_DL 4 ++ ++#define AVAIL_U (1 << AVAIL_S_U) ++#define AVAIL_L (1 << AVAIL_S_L) ++#define AVAIL_UL (1 << AVAIL_S_UL) ++#define AVAIL_UR (1 << AVAIL_S_UR) ++#define AVAIL_DL (1 << AVAIL_S_DL) ++ ++// Intra filters - same number space as avail ++#define FILTER_LIGHT 0x40 ++#define FILTER_STRONG 0x80 ++#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) ++ ++ uint8_t ctb_avail; ++ int end_of_ctb_x; ++ int end_of_ctb_y; ++ ++ RpiCodingUnit cu; ++ RpiPredictionUnit pu; ++ ++#define BOUNDARY_LEFT_SLICE (1 << 0) ++#define BOUNDARY_LEFT_TILE (1 << 1) ++#define BOUNDARY_UPPER_SLICE (1 << 2) ++#define BOUNDARY_UPPER_TILE (1 << 3) ++ /* properties of the boundary of the current CTB for the purposes ++ * of the deblocking filter */ ++ unsigned int boundary_flags; ++ ++#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE) ++ uint8_t ipm_left[IPM_TAB_SIZE]; ++ uint8_t ipm_up[IPM_TAB_SIZE]; ++ ++//#define MVF_STASH_WIDTH 128 ++#define MVF_STASH_WIDTH 64 ++#define MVF_STASH_HEIGHT 64 ++#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE) ++#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE) ++ HEVCRpiMvField mvf_ul[1]; ++ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; ++ ++ /* +7 is for subpixel interpolation, *2 for high bit depths */ ++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++ /* The extended size between the new edge emu buffer is abused by SAO */ ++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); ++ ++} HEVCRpiLocalContext; ++ ++// Each block can have an intra prediction and an add_residual command ++// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH ++ ++// Sand only has 2 planes (Y/C) ++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4)) ++ ++// Command for intra prediction and transform_add of predictions to coefficients ++enum rpi_pred_cmd_e ++{ ++ RPI_PRED_ADD_RESIDUAL, ++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, ++ RPI_PRED_INTRA, ++ RPI_PRED_INTRA_C, ++ RPI_PRED_I_PCM, ++ RPI_PRED_CMD_MAX ++}; ++ ++typedef struct HEVCPredCmd { ++ uint8_t type; ++ uint8_t size; // log2 "size" used by all variants ++ uint8_t avail; // i_pred - but left here as they pack well ++ uint8_t dummy; ++ union { ++ struct { // TRANSFORM_ADD ++ uint8_t * dst; ++ const int16_t * buf; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; ++ } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; ++ struct { // INTRA ++ uint16_t x; ++ uint16_t y; ++ enum IntraPredMode mode; ++ } i_pred; ++ struct { // I_PCM ++ uint16_t x; ++ uint16_t y; ++ const void * src; ++ uint32_t src_len; ++ } i_pcm; ++ }; ++} HEVCPredCmd; ++ ++union qpu_mc_pred_cmd_s; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; ++ ++typedef struct HEVCRpiInterPredQ ++{ ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; ++ unsigned int load; ++ uint32_t code_setup; ++ uint32_t 
code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; ++ ++typedef struct HEVCRpiInterPredEnv ++{ ++ HEVCRpiInterPredQ * q; ++ uint8_t n; // Number of Qs ++ uint8_t n_grp; // Number of Q in a group ++ uint8_t curr; // Current Q number (0..n-1) ++ uint8_t used; // 0 if nothing in any Q, 1 otherwise ++ uint8_t used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ unsigned int min_gap; ++ GPU_MEM_PTR_T gptr; ++} HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCoeffEnv { ++ unsigned int n; ++#if RPI_COMPRESS_COEFFS ++ unsigned int packed; // Equal to 1 if coefficients should be being packed ++ unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0 ++#endif ++ int16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCoeffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiFrameProgressWait { ++ int req; ++ struct HEVCRpiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRpiFrameProgressWait; ++ ++typedef struct HEVCRpiFrameProgressState { ++ struct HEVCRpiFrameProgressWait * first; ++ struct HEVCRpiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRpiFrameProgressState; ++ ++typedef struct RpiBlk ++{ ++ unsigned int x; ++ unsigned int y; ++ unsigned int w; ++ unsigned int h; ++} RpiBlk; ++ ++typedef struct HEVCRpiJob { ++ struct HEVCRpiJob * next; // Free chain ++ struct HEVCRpiJobCtl * jbc_local; ++ const HEVCRpiSPS * sps; // sps used to set up this job ++ ++ int waited; ++ int ctu_ts_first; ++ int ctu_ts_last; ++ RpiBlk bounds; // Bounding box of job ++ ++ struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; ++ rpi_cache_flush_env_t * rfe; ++ ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiFrameProgressWait progress_wait; ++ sem_t sem; ++ rpi_cache_buf_t flush_buf; ++} HEVCRpiJob; ++ ++struct HEVCRpiContext; ++ ++typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb); ++ ++typedef struct HEVCRpiPassQueue ++{ ++// int pending; ++ volatile int terminate; ++ sem_t sem_in; ++ sem_t * psem_out; ++ unsigned int job_n; ++ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread ++ HEVCRpiWorkerFn * worker; ++ pthread_t thread; ++ uint8_t pass_n; // Pass number - debug ++ uint8_t started; ++} HEVCRpiPassQueue; ++ ++ ++struct HEVCRpiJobGlobal; ++ ++typedef struct HEVCRpiJobCtl ++{ ++ sem_t sem_out; ++ ++ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated ++ struct HEVCRpiJobGlobal * jbg; ++ ++ HEVCRpiLocalContext * lcw_head; ++ HEVCRpiLocalContext * lcw_tail; ++ ++ pthread_mutex_t in_lock; ++ int offload_in; ++ ++ HEVCRpiJob *offloadq[RPI_MAX_JOBS]; ++} HEVCRpiJobCtl; ++ ++ ++typedef struct HEVCRpiJobGlobal ++{ ++ intptr_t ref_count; ++ pthread_mutex_t lock; ++ HEVCRpiJob * free1; // Singly linked list of free jobs ++ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job ++ HEVCRpiLocalContext * wait_good; // Last good tail ++ HEVCRpiLocalContext * wait_tail; ++ ++} HEVCRpiJobGlobal; ++ ++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) ++ ++#if RPI_TSTATS 
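++// Counters of QPU luma prediction block shapes seen, kept for tuning -
++// only compiled in when RPI_TSTATS is set above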
++typedef struct HEVCRpiStats { ++ int y_pred1_y8_merge; ++ int y_pred1_xy; ++ int y_pred1_x0; ++ int y_pred1_y0; ++ int y_pred1_x0y0; ++ int y_pred1_wle8; ++ int y_pred1_wgt8; ++ int y_pred1_hle16; ++ int y_pred1_hgt16; ++ int y_pred2_xy; ++ int y_pred2_x0; ++ int y_pred2_y0; ++ int y_pred2_x0y0; ++ int y_pred2_hle16; ++ int y_pred2_hgt16; ++} HEVCRpiStats; ++#endif ++ ++typedef struct HEVCRpiCabacState ++{ ++ uint8_t rice[4]; ++ uint8_t state[HEVC_CONTEXTS]; ++} HEVCRpiCabacState; ++ ++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels ++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) ++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte ++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el ++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row ++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ ++typedef struct HEVCRpiContext { ++ const AVClass *c; // needed by private avoptions ++ AVCodecContext *avctx; ++ ++ uint8_t threads_type; ++ char qpu_init_ok; ++ ++ /** 1 if the independent slice segment header was successfully parsed */ ++ uint8_t slice_initialized; ++ char used_for_ref; // rpi ++ char is_irap; ++ char offload_recon; ++ uint8_t eos; ///< current packet contains an EOS/EOB NAL ++ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL ++ uint8_t no_backward_pred_flag; ++ uint8_t is_decoded; ++ uint8_t no_rasl_output_flag; ++ ++ ++ /** ++ * Sequence counters for decoded and output frames, so that old ++ * frames are output first after a POC reset ++ */ ++ uint16_t seq_decode; ++ uint16_t seq_output; ++ ++ int width; ++ int height; ++ ++ HEVCRpiJobCtl * jbc; ++ // cabac stash ++ // b0 skip flag ++ // b1+ ct_depth ++ uint8_t * cabac_stash_left; ++ uint8_t * cabac_stash_up; ++ ++ // Function pointers ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory ++#endif ++ HEVCRpiQpu qpu; ++ ++ HEVCRpiFrameProgressState progress_states[2]; ++ ++ HEVCRpiCabacState *cabac_save; ++ ++ AVFrame *frame; ++ AVFrame *output_frame; ++ uint8_t *sao_pixel_buffer_h[3]; ++ uint8_t *sao_pixel_buffer_v[3]; ++ ++ unsigned int col_mvf_stride; ++ AVBufferPool *col_mvf_pool; ++ ++ RpiSAOParams *sao; ++ DBParams *deblock; ++ enum HEVCNALUnitType nal_unit_type; ++ int temporal_id; ///< temporal_id_plus1 - 1 ++ HEVCRpiFrame *ref; ++ int poc; ++ int pocTid0; ++ int slice_idx; ///< number of the slice being currently decoded ++ int max_ra; ++ ++ int8_t *qp_y_tab; ++ ++ // Deblocking block strength bitmaps ++ unsigned int bs_stride2; ++ unsigned int bs_size; ++ uint8_t *bs_horizontal; ++ uint8_t *bs_vertical; ++ uint8_t *bsf_stash_up; ++ uint8_t *bsf_stash_left; ++ ++#if HEVC_RPI_MAX_CTBS >= 0xffff ++#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0 ++ uint32_t *tab_slice_address; ++#else ++#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0 ++ uint16_t *tab_slice_address; ++#endif ++ ++ // Bitfield 1 bit per 8 pels (min pcm size) ++ uint8_t *is_pcm; ++ // Bitfield 1 bit per 8 pels (min cb size) ++ // Only needed for CIP as CIP processing is async to the main 
thread ++ uint8_t *is_intra; ++ ++ // PU ++ HEVCRpiMvField *mvf_up; ++ HEVCRpiMvField *mvf_left; ++ ++ const RefPicList **rpl_up; ++ const RefPicList **rpl_left; ++ RefPicList * refPicList; ++ ++ // CTB-level flags affecting loop filter operation ++ uint8_t *filter_slice_edges; ++ ++ /** used on BE to byteswap the lines for checksumming */ ++ uint8_t *checksum_buf; ++ int checksum_buf_size; ++ ++ atomic_int wpp_err; ++ ++ const uint8_t *data; ++ ++ H2645Packet pkt; ++ // type of the first VCL NAL of the current frame ++ enum HEVCNALUnitType first_nal_type; ++ ++ uint8_t context_initialized; ++ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated ++ ///< as a format defined in 14496-15 ++ int apply_defdispwin; ++ ++ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) ++ int nuh_layer_id; ++ ++ struct AVMD5 *md5_ctx; ++ ++ RefPicListTab * rpl_tab; ++ unsigned int rpl_tab_size; ++ ++ uint8_t *is_intra_store; ++ ++ RpiSliceHeader sh; ++ ++ HEVCRpiParamSets ps; ++ ++ HEVCRpiLocalContext *HEVClc; ++ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; ++ ++ HEVCRpiFrame DPB[HEVC_DPB_ELS]; ++ ++ ///< candidate references for the current frame ++ RefPicList rps[5]; ++ ++ HEVCRpiPredContext hpc; ++ HEVCDSPContext hevcdsp; ++ ++ HEVCSEIContext sei; ++ ++ // Put structures that allocate non-trivial storage at the end ++ // These are mostly used indirectly so position in the structure doesn't matter ++ HEVCRpiPassQueue passq[RPI_PASSES]; ++#if RPI_EXTRA_BIT_THREADS > 0 ++ int bt_started; ++ // This simply contains thread descriptors - task setup is held elsewhere ++ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; ++#endif ++#if RPI_TSTATS ++ HEVCRpiStats tstats; ++#endif ++} HEVCRpiContext; ++ ++/** ++ * Mark all frames in DPB as unused for reference. ++ */ ++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s); ++ ++/** ++ * Drop all frames currently in DPB. ++ */ ++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); ++ ++/** ++ * Construct the reference picture sets for the current frame. ++ */ ++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s); ++ ++/** ++ * Construct the reference picture list(s) for the current slice. ++ */ ++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); ++ ++ ++/** ++ * Get the number of candidate references for the current frame. ++ */ ++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s); ++ ++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); ++ ++/** ++ * Find next frame in output order and put a reference to it in frame. 
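++ * Output order is POC order within a sequence; the per-frame sequence
++ * counter keeps ordering correct across a POC reset.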
++ * @return 1 if a frame was output, 0 otherwise ++ */ ++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush); ++ ++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); ++ ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); ++ ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); ++ ++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, ++ int merge_idx, HEVCRpiMvField * const mv); ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ HEVCRpiMvField * const mv, ++ const unsigned int mvp_lx_flag, const unsigned int LX); ++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); ++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_trafo_size, const int is_coded_block); ++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot); ++ ++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; ++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; ++extern const uint8_t ff_hevc_rpi_qpel_extra[4]; ++ ++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); ++ ++// arm/hevc_misc_neon.S ++// Neon coeff zap fn ++#if HAVE_NEON ++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); ++#endif ++ ++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCRpiFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCRpiFrame * const ref, const int y) ++{ ++ if (s->threads_type != 0) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++} ++ ++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) ++{ ++ if (s->used_for_ref && s->threads_type != 0) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCRpiFrame * const ref, const int y) ++{ ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++} ++ ++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) ++{ ++ if (s->used_for_ref && s->threads_type != 0) ++ { ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ } ++} ++ ++static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) ++{ ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++} ++ ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int 
*)ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} ++ ++#define HEVC_RPI_420_ONLY 1 ++#define HEVC_RPI_SAND128_ONLY 1 ++ ++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) ++{ ++#if HEVC_RPI_420_ONLY ++ return cidx == 0 ? 0 : 1; ++#else ++ return s->ps.sps->hshift[cidx]; ++#endif ++} ++ ++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) ++{ ++#if HEVC_RPI_420_ONLY ++ return cidx == 0 ? 0 : 1; ++#else ++ return s->ps.sps->vshift[cidx]; ++#endif ++} ++ ++static inline int ctx_cfmt(const HEVCRpiContext * const s) ++{ ++#if HEVC_RPI_420_ONLY ++ return 1; ++#else ++ return s->ps.sps->chroma_format_idc; ++#endif ++} ++ ++static inline int frame_stride1(const AVFrame * const frame, const int c_idx) ++{ ++#if HEVC_RPI_SAND128_ONLY ++ return 128; ++#else ++ return frame->linesize[c_idx]; ++#endif ++} ++ ++#if HEVC_RPI_SAND128_ONLY ++// Propagate this decision to later zc includes ++#define RPI_ZC_SAND128_ONLY 1 ++#endif ++ ++#ifndef ff_hevc_rpi_copy_vert ++static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int i; ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ } ++} ++#endif ++ ++ ++#if MVF_STASH_WIDTH == 64 ++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)); ++} ++ ++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ const unsigned int y0_ctb = y0 & mask_cs_hi; ++ ++ return (HEVCRpiMvField *)((y < y0_ctb) ? ++ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) : ++ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) : ++ lc->mvf_stash + ++ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ++ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE))); ++} ++ ++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, ++ const unsigned int x0, ++ const unsigned int x) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ return x < x0_ctb ? 
1 : MVF_STASH_WIDTH_PU; ++} ++ ++#else ++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))); ++} ++ ++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ const unsigned int y0_ctb = y0 & mask_cs_hi; ++ ++ // If not in the same CTB for Y assume up ++ if (y < y0_ctb) { ++ // If not in the same CTB for X too assume up-left ++ return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)); ++ } ++ return mvf_stash_ptr(s, lc, x, y); ++} ++ ++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, ++ const unsigned int x0, ++ const unsigned int x) ++{ ++ return MVF_STASH_WIDTH_PU; ++} ++#endif ++ ++#endif /* AVCODEC_RPI_HEVCDEC_H */ +diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c +new file mode 100644 +index 0000000000..87f3cc9d14 +--- /dev/null ++++ b/libavcodec/rpi_hevcdsp.c +@@ -0,0 +1,450 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcdsp.h" ++#include "rpi_hevc_mv.h" ++ ++static const int8_t transform[32][32] = { ++ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, ++ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, ++ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, ++ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, ++ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, ++ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, ++ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, ++ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, ++ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, ++ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, ++ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, ++ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, ++ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, ++ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, ++ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, ++ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, ++ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, ++ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, ++ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, ++ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, ++ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, ++ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, ++ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, ++ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, ++ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, ++ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, ++ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, ++ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, ++ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, ++ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, ++ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, ++ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, ++ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, ++ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, ++ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, ++ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, ++ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, ++ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, ++ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, ++ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, ++ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, ++ { 46, -90, 38, 54, -90, 31, 61, -88, 
22, 67, -85, 13, 73, -82, 4, 78, ++ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, ++ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, ++ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, ++ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, ++ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, ++ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, ++ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, ++ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, ++ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, ++ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, ++ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, ++ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, ++ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, ++ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, ++ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, ++ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, ++ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, ++ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, ++ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, ++ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, ++ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }, ++}; ++ ++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { ++ { -2, 58, 10, -2}, ++ { -4, 54, 16, -2}, ++ { -6, 46, 28, -4}, ++ { -4, 36, 36, -4}, ++ { -4, 28, 46, -6}, ++ { -2, 16, 54, -4}, ++ { -2, 10, 58, -2}, ++}; ++ ++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { ++ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0}, ++ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1}, ++ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1} ++}; ++ ++#define BIT_DEPTH 8 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, ++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ int in_inc0, int in_inc1) ++{ ++ int shift = 32; ++ uint32_t bs = 0; ++ for (; pus > 0; pus--) { ++ int strength, out; ++ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; ++ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; ++ int nr_idx0 = neigh->ref_idx[0]; ++ int nr_idx1 = neigh->ref_idx[1]; ++ int neigh_refL0 = neigh_rpl0[nr_idx0]; ++ int neigh_refL1 = neigh_rpl1[nr_idx1]; ++ ++ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31); ++ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31); ++ ++#if 1 // This more directly matches the original implementation ++ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { ++ // same L0 and L1 ++ if (curr_refL0 == neigh_refL0 && ++ curr_refL0 == curr_refL1 && ++ neigh_refL0 == neigh_refL1) { ++ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || ++ 
FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) && ++ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)) ++ strength = 1; ++ else ++ strength = 0; ++ } else if (neigh_refL0 == curr_refL0 && ++ neigh_refL1 == curr_refL1) { ++ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else if (neigh_refL1 == curr_refL0 && ++ neigh_refL0 == curr_refL1) { ++ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else { ++ strength = 1; ++ } ++ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV ++ MvXY curr_mv0, neigh_mv0; ++ ++ if (curr->pred_flag & 1) { ++ curr_mv0 = curr->xy[0]; ++ } else { ++ curr_mv0 = curr->xy[1]; ++ curr_refL0 = curr_refL1; ++ } ++ ++ if (neigh->pred_flag & 1) { ++ neigh_mv0 = neigh->xy[0]; ++ } else { ++ neigh_mv0 = neigh->xy[1]; ++ neigh_refL0 = neigh_refL1; ++ } ++ ++ if (curr_refL0 == neigh_refL0) { ++ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else ++ strength = 1; ++ } else ++ strength = 1; ++#else // This has exactly the same effect, but is more suitable for vectorisation ++ MvXY curr_mv[2]; ++ MvXY neigh_mv[2]; ++ memcpy(curr_mv, curr->xy, sizeof curr_mv); ++ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv); ++ ++ if (!(curr->pred_flag & 2)) { ++ curr_mv[1] = curr_mv[0]; ++ curr_refL1 = curr_refL0; ++ } ++ if (!(neigh->pred_flag & 2)) { ++ neigh_mv[1] = neigh_mv[0]; ++ neigh_refL1 = neigh_refL0; ++ } ++ if (!(curr->pred_flag & 1)) { ++ curr_mv[0] = curr_mv[1]; ++ curr_refL0 = curr_refL1; ++ } ++ if (!(neigh->pred_flag & 1)) { ++ neigh_mv[0] = neigh_mv[1]; ++ neigh_refL0 = neigh_refL1; ++ } ++ ++ strength = 1; ++ ++ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | ++ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) | ++ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4); ++ ++ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | ++ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) | ++ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4); ++ ++ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); ++#endif ++ ++ curr += in_inc0 / sizeof (HEVCRpiMvField); ++ neigh += in_inc1 / sizeof (HEVCRpiMvField); ++ ++ for (out = dup; out > 0; out--) ++ { ++ bs = (bs >> 2) | (strength << 30); ++ shift -= 2; ++ } ++ } ++ return bs >> shift; ++} ++ ++ ++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) ++{ ++ unsigned int i, j; ++ ++ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; 
j+=8) ++ AV_COPY64U(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } else { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=16) ++ AV_COPY128(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } ++} ++ ++ ++ ++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ++{ ++#undef FUNC ++#define FUNC(a, depth) a ## _ ## depth ++ ++#undef PEL_FUNC ++#define PEL_FUNC(dst1, idx1, idx2, a, depth) \ ++ for(i = 0 ; i < 10 ; i++) \ ++{ \ ++ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \ ++} ++ ++#undef EPEL_FUNCS ++#define EPEL_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \ ++ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \ ++ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth) ++ ++#undef EPEL_UNI_FUNCS ++#define EPEL_UNI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth) ++ ++#undef EPEL_BI_FUNCS ++#define EPEL_BI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth) ++ ++#undef QPEL_FUNCS ++#define QPEL_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \ ++ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \ ++ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth) ++ ++#undef QPEL_UNI_FUNCS ++#define QPEL_UNI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth) ++ ++#undef QPEL_BI_FUNCS ++#define QPEL_BI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ ++ 
PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) ++ ++#define SLICED_ADD_RESIDUAL(depth)\ ++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ ++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ ++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ ++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ ++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ ++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ ++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ ++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#define SLICED_SAO(depth)\ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) ++ ++#define HEVC_DSP(depth) \ ++ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ ++ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ ++ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ ++ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ ++ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ ++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ ++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ ++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ ++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ ++ SLICED_ADD_RESIDUAL(depth); \ ++ hevcdsp->dequant = FUNC(dequant, depth); \ ++ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ ++ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ ++ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ ++ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ ++ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ ++ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \ ++ \ ++ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \ ++ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \ ++ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ ++ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ ++ \ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ ++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ ++ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ ++ SLICED_SAO(depth); \ ++ \ ++ QPEL_FUNCS(depth); 
\ ++ QPEL_UNI_FUNCS(depth); \ ++ QPEL_BI_FUNCS(depth); \ ++ EPEL_FUNCS(depth); \ ++ EPEL_UNI_FUNCS(depth); \ ++ EPEL_BI_FUNCS(depth); \ ++ \ ++ SLICED_LOOP_FILTERS(depth); \ ++ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ ++ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \ ++ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth) ++int i = 0; ++ ++ switch (bit_depth) { ++ case 9: ++ HEVC_DSP(9); ++ break; ++ case 10: ++ HEVC_DSP(10); ++ break; ++ case 12: ++ HEVC_DSP(12); ++ break; ++ default: ++ HEVC_DSP(8); ++ break; ++ } ++ ++ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; ++ hevcdsp->cpy_blk = cpy_blk; ++ ++ if (ARCH_PPC) ++ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); ++ if (ARCH_X86) ++ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth); ++ if (ARCH_ARM) ++ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth); ++ if (ARCH_MIPS) ++ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth); ++} +diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h +new file mode 100644 +index 0000000000..5a7cdeeb66 +--- /dev/null ++++ b/libavcodec/rpi_hevcdsp.h +@@ -0,0 +1,177 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere ++ * ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDSP_H
++#define AVCODEC_RPI_HEVCDSP_H
++
++#include "hevc.h"
++#include "get_bits.h"
++
++struct HEVCRpiMvField;
++
++#define MAX_PB_SIZE 64
++
++#define RPI_HEVC_SAO_BUF_STRIDE 160
++
++
++typedef struct RpiSAOParams {
++    uint8_t band_position[3];   ///< sao_band_position (Y,U,V)
++    uint8_t eo_class[3];        ///< sao_eo_class (Y,U=V)
++    uint8_t type_idx[3];        ///< sao_type_idx (Y,U=V)
++
++    int16_t offset_val[3][5];   ///< SaoOffsetVal

[... the remainder of rpi_hevcdsp.h and the start of libavcodec/rpi_hevcdsp_template.c are missing from the source; the opening of FUNC(add_residual_dc_c) below is reconstructed from its callers and surviving body ...]

++static void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc, unsigned int size)
++{
++    unsigned int x, y;
++    pixel *dst = (pixel *)_dst;
++    const int dc_v = dc >> 16;
++    const int dc_u = (dc << 16) >> 16;
++
++    stride /= sizeof(pixel);
++
++    for (y = 0; y < size; y++) {
++        for (x = 0; x < size * 2; x += 2) {
++            dst[x]     = av_clip_pixel(dst[x]     + dc_u);
++            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
++        }
++        dst += stride;
++    }
++}
++
++
++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 32);
++}
++
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
++// -- U -- (plaited)
++
++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
++}
++
++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
++}
++
++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
++}
++
++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_u)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++// -- V -- (plaited)
++
++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
++}
++
++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
++}
++
++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
++}
++
++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
++                                      ptrdiff_t stride, int dc_v)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
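++    // (with 4:2:0 the largest chroma TB is 16x16, so a 32x32 chroma residual cannot arise)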
++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++ ++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) ++{ ++ int16_t *coeffs = (int16_t *) _coeffs; ++ int x, y; ++ int size = 1 << log2_size; ++ ++ if (mode) { ++ coeffs += size; ++ for (y = 0; y < size - 1; y++) { ++ for (x = 0; x < size; x++) ++ coeffs[x] += coeffs[x - size]; ++ coeffs += size; ++ } ++ } else { ++ for (y = 0; y < size; y++) { ++ for (x = 1; x < size; x++) ++ coeffs[x] += coeffs[x - 1]; ++ coeffs += size; ++ } ++ } ++} ++ ++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) ++{ ++ int shift = 15 - BIT_DEPTH - log2_size; ++ int x, y; ++ int size = 1 << log2_size; ++ ++ if (shift > 0) { ++ int offset = 1 << (shift - 1); ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ *coeffs = (*coeffs + offset) >> shift; ++ coeffs++; ++ } ++ } ++ } else { ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ *coeffs = *coeffs << -shift; ++ coeffs++; ++ } ++ } ++ } ++} ++ ++#define SET(dst, x) (dst) = (x) ++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift) ++ ++#define TR_4x4_LUMA(dst, src, step, assign) \ ++ do { \ ++ int c0 = src[0 * step] + src[2 * step]; \ ++ int c1 = src[2 * step] + src[3 * step]; \ ++ int c2 = src[0 * step] - src[3 * step]; \ ++ int c3 = 74 * src[1 * step]; \ ++ \ ++ assign(dst[2 * step], 74 * (src[0 * step] - \ ++ src[2 * step] + \ ++ src[3 * step])); \ ++ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \ ++ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \ ++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ ++ } while (0) ++ ++static void FUNC(transform_4x4_luma)(int16_t *coeffs) ++{ ++ int i; ++ int shift = 7; ++ int add = 1 << (shift - 1); ++ int16_t *src = coeffs; ++ ++ for (i = 0; i < 4; i++) { ++ TR_4x4_LUMA(src, src, 4, SCALE); ++ src++; ++ } ++ ++ shift = 20 - BIT_DEPTH; ++ add = 1 << (shift - 1); ++ for (i = 0; i < 4; i++) { ++ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); ++ coeffs += 4; ++ } ++} ++ ++#undef TR_4x4_LUMA ++ ++#define TR_4(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ ++ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ ++ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ ++ 
const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ ++ \ ++ assign(dst[0 * dstep], e0 + o0); \ ++ assign(dst[1 * dstep], e1 + o1); \ ++ assign(dst[2 * dstep], e1 - o1); \ ++ assign(dst[3 * dstep], e0 - o0); \ ++ } while (0) ++ ++#define TR_8(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_8[4]; \ ++ int o_8[4] = { 0 }; \ ++ for (i = 0; i < 4; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_8[i] += transform[4 * j][i] * src[j * sstep]; \ ++ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ ++ \ ++ for (i = 0; i < 4; i++) { \ ++ assign(dst[i * dstep], e_8[i] + o_8[i]); \ ++ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \ ++ } \ ++ } while (0) ++ ++#define TR_16(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_16[8]; \ ++ int o_16[8] = { 0 }; \ ++ for (i = 0; i < 8; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_16[i] += transform[2 * j][i] * src[j * sstep]; \ ++ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ ++ \ ++ for (i = 0; i < 8; i++) { \ ++ assign(dst[i * dstep], e_16[i] + o_16[i]); \ ++ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ ++ } \ ++ } while (0) ++ ++#define TR_32(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_32[16]; \ ++ int o_32[16] = { 0 }; \ ++ for (i = 0; i < 16; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_32[i] += transform[j][i] * src[j * sstep]; \ ++ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ ++ \ ++ for (i = 0; i < 16; i++) { \ ++ assign(dst[i * dstep], e_32[i] + o_32[i]); \ ++ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ ++ } \ ++ } while (0) ++ ++#define IDCT_VAR4(H) \ ++ int limit2 = FFMIN(col_limit + 4, H) ++#define IDCT_VAR8(H) \ ++ int limit = FFMIN(col_limit, H); \ ++ int limit2 = FFMIN(col_limit + 4, H) ++#define IDCT_VAR16(H) IDCT_VAR8(H) ++#define IDCT_VAR32(H) IDCT_VAR8(H) ++ ++#define IDCT(H) \ ++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ ++ int col_limit) \ ++{ \ ++ int i; \ ++ int shift = 7; \ ++ int add = 1 << (shift - 1); \ ++ int16_t *src = coeffs; \ ++ IDCT_VAR ## H(H); \ ++ \ ++ for (i = 0; i < H; i++) { \ ++ TR_ ## H(src, src, H, H, SCALE, limit2); \ ++ if (limit2 < H && i%4 == 0 && !!i) \ ++ limit2 -= 4; \ ++ src++; \ ++ } \ ++ \ ++ shift = 20 - BIT_DEPTH; \ ++ add = 1 << (shift - 1); \ ++ for (i = 0; i < H; i++) { \ ++ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ ++ coeffs += H; \ ++ } \ ++} ++ ++#define IDCT_DC(H) \ ++static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \ ++{ \ ++ int i, j; \ ++ int shift = 14 - BIT_DEPTH; \ ++ int add = 1 << (shift - 1); \ ++ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \ ++ \ ++ for (j = 0; j < H; j++) { \ ++ for (i = 0; i < H; i++) { \ ++ coeffs[i + j * H] = coeff; \ ++ } \ ++ } \ ++} ++ ++IDCT( 4) ++IDCT( 8) ++IDCT(16) ++IDCT(32) ++ ++IDCT_DC( 4) ++IDCT_DC( 8) ++IDCT_DC(16) ++IDCT_DC(32) ++ ++#undef TR_4 ++#undef TR_8 ++#undef TR_16 ++#undef TR_32 ++ ++#undef SET ++#undef SCALE ++ ++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ for (k = 0; k < 4; k++) ++ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] 
>> shift]); ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++ ++#define CMP(a, b) (((a) > (b)) - ((a) < (b))) ++ ++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ ++ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ int diff0 = CMP(src[x], src[x + a_stride]); ++ int diff1 = CMP(src[x], src[x + b_stride]); ++ int offset_val = edge_idx[2 + diff0 + diff1]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++ ++ ++#if BIT_DEPTH == 10 ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 ++#endif ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 ++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, width = _width, height = _height; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++ for (y = 0; y < height; y++) { ++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++ for (x = init_x; x < width; x++) ++ dst[x] = src[x]; ++ } ++ if (borders[3]) { ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++} ++ ++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, 
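++                                  /* borders[0..3] flag the left, top, right
++                                     and bottom edges (matching the column and
++                                     row restores in the body below) */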
++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, init_y = 0, width = _width, height = _height; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++ for (y = 0; y < height; y++) { ++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++ for (x = init_x; x < width; x++) ++ dst[x] = src[x]; ++ init_y = 1; ++ } ++ if (borders[3]) { ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++ ++ { ++ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1]; ++ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2]; ++ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3]; ++ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3]; ++ ++ // Restore pixels that can't be modified ++ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) { ++ for(y = init_y+save_upper_left; y< height-save_lower_left; y++) ++ dst[y*stride_dst] = src[y*stride_src]; ++ } ++ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) { ++ for(y = init_y+save_upper_right; y< height-save_lower_right; y++) ++ dst[y*stride_dst+width-1] = src[y*stride_src+width-1]; ++ } ++ ++ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) { ++ for(x = init_x+save_upper_left; x < width-save_upper_right; x++) ++ dst[x] = src[x]; ++ } ++ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) { ++ for(x = init_x+save_lower_left; x < width-save_lower_right; x++) ++ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x]; ++ } ++ if(diag_edge[0] && sao_eo_class == SAO_EO_135D) ++ dst[0] = src[0]; ++ if(diag_edge[1] && sao_eo_class == SAO_EO_45D) ++ dst[width-1] = src[width-1]; ++ if(diag_edge[2] && sao_eo_class == SAO_EO_135D) ++ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1]; ++ if(diag_edge[3] && sao_eo_class == SAO_EO_45D) ++ dst[stride_dst*(height-1)] = src[stride_src*(height-1)]; ++ ++ } ++} ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif ++ ++// --- Plaited chroma versions ++ ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table_u[32] = { 0 }; ++ int offset_table_v[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ width *= 2; ++ ++ for (k = 0; k < 4; k++) ++ { ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ } ++ for (y = 0; y < height; y++) 
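++    // the plaited chroma plane interleaves U and V horizontally (hence
++    // width *= 2 above): even columns take the U offset table, odd the V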
{ ++ for (x = 0; x < width; x += 2) ++ { ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++ ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); ++ ++ stride_dst /= sizeof(pixel); ++ width *= 2; ++ ++ av_assert0(width <= 64); ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int offset_valu = edge_idx[2 + diff0u + diff1u]; ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ int offset_valv = edge_idx[2 + diff0v + diff1v]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++ ++// Do once ++#if BIT_DEPTH == 8 ++// Any old 2 byte 'normal' restore will work for these ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 ++#endif ++ ++#undef CMP ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++static void FUNC(put_hevc_pel_pixels)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ 
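++        // promote samples to the 14-bit intermediate precision that all the
++        // HEVC inter-prediction helpers below work in, whatever BIT_DEPTH is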
for (x = 0; x < width; x++) ++ dst[x] = src[x] << (14 - BIT_DEPTH); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ memcpy(dst, src, width * sizeof(pixel)); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); ++ } ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++#define QPEL_FILTER(src, stride) \ ++ (filter[0] * src[x - 3 * stride] + \ ++ filter[1] * src[x - 2 * stride] + \ ++ filter[2] * src[x - stride] + \ ++ filter[3] * src[x ] + \ ++ filter[4] * src[x + stride] + \ ++ filter[5] * src[x + 2 * stride] + \ ++ filter[6] * src[x + 3 * stride] + \ ++ filter[7] * src[x + 4 * stride]) ++ ++static void FUNC(put_hevc_qpel_h)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t 
_srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_v)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_hv)(int16_t *dst, ++ uint8_t *_src, ++ ptrdiff_t _srcstride, ++ int height, intptr_t mx, ++ intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; ++ tmp += MAX_PB_SIZE; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = 
(pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++ ++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ for (y = 0; y < 
height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = 
av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++#define EPEL_FILTER(src, stride) \ ++ (filter[0] * src[x - stride] + \ ++ filter[1] * src[x] + \ ++ filter[2] * src[x + stride] + \ ++ filter[3] * src[x + 2 * stride]) ++ ++static void FUNC(put_hevc_epel_h)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = 
EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_v)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_hv)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; ++ tmp += MAX_PB_SIZE; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ } ++ dst += dststride; ++ src += srcstride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 
0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ dst += dststride; ++ src += srcstride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, 
int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ } ++ dst += dststride; ++ src += srcstride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ } ++ dst += dststride; ++ src += srcstride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, 
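++                                  /* weighted prediction: wx scales the
++                                     filtered sample and ox is added after the
++                                     shift by denom + 14 - BIT_DEPTH (see the
++                                     shift/offset computation below) */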
intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++// line zero ++#define P3 pix[-4 * xstride] ++#define P2 pix[-3 * xstride] ++#define P1 pix[-2 * xstride] ++#define P0 pix[-1 * xstride] ++#define Q0 pix[0 * xstride] ++#define Q1 pix[1 * xstride] ++#define Q2 pix[2 * xstride] ++#define Q3 pix[3 * xstride] ++ ++// line three. 
used only for deblocking decision ++#define TP3 pix[-4 * xstride + 3 * ystride] ++#define TP2 pix[-3 * xstride + 3 * ystride] ++#define TP1 pix[-2 * xstride + 3 * ystride] ++#define TP0 pix[-1 * xstride + 3 * ystride] ++#define TQ0 pix[0 * xstride + 3 * ystride] ++#define TQ1 pix[1 * xstride + 3 * ystride] ++#define TQ2 pix[2 * xstride + 3 * ystride] ++#define TQ3 pix[3 * xstride + 3 * ystride] ++ ++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, ++ ptrdiff_t _xstride, ptrdiff_t _ystride, ++ int beta, int *_tc, ++ uint8_t *_no_p, uint8_t *_no_q) ++{ ++ int d, j; ++ pixel *pix = (pixel *)_pix; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; ++ ++ if (d0 + d3 >= beta) { ++ pix += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix += ystride; ++ } ++ } ++ } ++ } ++} ++ ++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, int *_tc, ++ uint8_t *_no_p, uint8_t *_no_q) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix = (pixel *)_pix; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / 
sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ int beta, int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), ++ beta, tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ int beta, int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, ++ beta, tc, no_p, no_q); ++} ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++// line zero ++#define P3 pix_l[0 * xstride] ++#define P2 pix_l[1 * xstride] ++#define P1 pix_l[2 * xstride] ++#define P0 pix_l[3 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++#define Q2 pix_r[2 * xstride] ++#define Q3 pix_r[3 * xstride] ++ ++// line three. 
used only for deblocking decision ++#define TP3 pix_l[0 * xstride + 3 * ystride] ++#define TP2 pix_l[1 * xstride + 3 * ystride] ++#define TP1 pix_l[2 * xstride + 3 * ystride] ++#define TP0 pix_l[3 * xstride + 3 * ystride] ++#define TQ0 pix_r[0 * xstride + 3 * ystride] ++#define TQ1 pix_r[1 * xstride + 3 * ystride] ++#define TQ2 pix_r[2 * xstride + 3 * ystride] ++#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ ++// This is identical to hevc_loop_filter_luma except that the P/Q ++// components are on separate pointers ++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, ++ uint8_t * _pix_l) ++{ ++ int d, j; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ const ptrdiff_t xstride = 1; ++ const ptrdiff_t ystride = _stride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); ++ const int no_p = no_f & 1; ++ const int no_q = no_f & 2; ++ ++ if (d0 + d3 >= beta) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++ } ++ } ++} ++ ++static void 
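++// As in the _2 function above: tc2 packs the two 16-bit tc values (low half
++// first) and no_f carries the no_p flag in bit 0 and no_q in bit 1.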
FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f) ++{ ++ // Just call the non-2 function having massaged the parameters ++ int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16}; ++ uint8_t no_p[2] = {no_f & 1, no_f & 1}; ++ uint8_t no_q[2] = {no_f & 2, no_f & 2}; ++ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q); ++} ++ ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#define P1 pix_l[0 * xstride] ++#define P0 pix_l[1 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++ ++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, const int32_t *_tc, ++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); ++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); ++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); ++} ++ ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++ +diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c +new file mode 100644 +index 0000000000..0aa8809a4b +--- /dev/null ++++ b/libavcodec/rpi_hevcpred.c +@@ -0,0 +1,161 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcdec.h" ++ ++#include "rpi_hevcpred.h" ++#if (ARCH_ARM) ++#include "arm/rpi_hevcpred_arm.h" ++#endif ++ ++#define PRED_C 0 ++#define BIT_DEPTH 8 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++ ++#define PRED_C 1 ++#define BIT_DEPTH 8 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++ ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) ++{ ++#undef FUNC ++#define FUNC(a, depth) a ## _ ## depth ++ ++#undef FUNCC ++#define FUNCC(a, depth) a ## _ ## depth ## _c ++ ++#define HEVC_PRED_Y(depth) \ ++ hpc->intra_pred = FUNC(intra_pred, depth); \ ++ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \ ++ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \ ++ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \ ++ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \ ++ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ ++ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ ++ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ ++ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ ++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \ ++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \ ++ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \ ++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \ ++ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \ ++ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \ ++ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \ ++ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth); ++ ++#define HEVC_PRED_C(depth) \ ++ hpc->intra_pred_c = FUNCC(intra_pred, depth); \ ++ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \ ++ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \ ++ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \ ++ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \ ++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ ++ hpc->pred_planar_c[1] = 
FUNCC(pred_planar_1, depth); \ ++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ ++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ ++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \ ++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \ ++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \ ++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \ ++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \ ++ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \ ++ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \ ++ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth); ++ ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); \ ++ HEVC_PRED_C(depth); ++ ++ switch (bit_depth) { ++ case 9: ++ HEVC_PRED(9); ++ break; ++ case 10: ++ HEVC_PRED(10); ++ break; ++ case 12: ++ HEVC_PRED(12); ++ break; ++ default: ++ HEVC_PRED(8); ++ break; ++ } ++ ++#if (ARCH_ARM) ++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); ++#elif (ARCH_MIPS) ++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); ++#endif ++} +diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h +new file mode 100644 +index 0000000000..9f0edb8798 +--- /dev/null ++++ b/libavcodec/rpi_hevcpred.h +@@ -0,0 +1,123 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVCPRED_H ++#define AVCODEC_RPI_HEVCPRED_H ++ ++#include ++#include ++#include "config.h" ++ ++struct HEVCRpiContext; ++struct HEVCRpiLocalContext; ++ ++enum IntraPredMode { ++ INTRA_PLANAR = 0, ++ INTRA_DC, ++ INTRA_ANGULAR_2, ++ INTRA_ANGULAR_3, ++ INTRA_ANGULAR_4, ++ INTRA_ANGULAR_5, ++ INTRA_ANGULAR_6, ++ INTRA_ANGULAR_7, ++ INTRA_ANGULAR_8, ++ INTRA_ANGULAR_9, ++ INTRA_ANGULAR_10, ++ INTRA_ANGULAR_11, ++ INTRA_ANGULAR_12, ++ INTRA_ANGULAR_13, ++ INTRA_ANGULAR_14, ++ INTRA_ANGULAR_15, ++ INTRA_ANGULAR_16, ++ INTRA_ANGULAR_17, ++ INTRA_ANGULAR_18, ++ INTRA_ANGULAR_19, ++ INTRA_ANGULAR_20, ++ INTRA_ANGULAR_21, ++ INTRA_ANGULAR_22, ++ INTRA_ANGULAR_23, ++ INTRA_ANGULAR_24, ++ INTRA_ANGULAR_25, ++ INTRA_ANGULAR_26, ++ INTRA_ANGULAR_27, ++ INTRA_ANGULAR_28, ++ INTRA_ANGULAR_29, ++ INTRA_ANGULAR_30, ++ INTRA_ANGULAR_31, ++ INTRA_ANGULAR_32, ++ INTRA_ANGULAR_33, ++ INTRA_ANGULAR_34, ++}; ++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 ++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 ++ ++typedef void intra_filter_fn_t( ++ uint8_t * const left, uint8_t * const top, ++ const unsigned int req, const unsigned int avail, ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size); ++ ++typedef struct HEVCRpiPredContext { ++ void (*intra_pred)(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, ++ const unsigned int avail, const unsigned int log2_size); ++ ++ intra_filter_fn_t *intra_filter[4]; ++ void (*pred_planar[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); ++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); ++ ++ void (*intra_pred_c)(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, ++ const unsigned int avail, const unsigned int log2_size); ++ intra_filter_fn_t *intra_filter_c[4]; ++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); ++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride); ++} HEVCRpiPredContext; ++ ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); ++ ++#endif /* AVCODEC_RPI_HEVCPRED_H */ +diff --git a/libavcodec/rpi_hevcpred_template.c 
b/libavcodec/rpi_hevcpred_template.c +new file mode 100644 +index 0000000000..f2ebcad332 +--- /dev/null ++++ b/libavcodec/rpi_hevcpred_template.c +@@ -0,0 +1,1407 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "config.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "bit_depth_template.c" ++ ++#include "rpi_hevcdec.h" ++#include "rpi_hevcpred.h" ++ ++#define DUMP_PRED 0 ++ ++#define POS(x, y) src[(x) + stride * (y)] ++ ++// INCLUDED_ONCE defined at EOF ++#ifndef INCLUDED_ONCE ++typedef uint8_t (* c8_dst_ptr_t)[2]; ++typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* c16_dst_ptr_t)[2]; ++typedef const uint16_t (* c16_src_ptr_t)[2]; ++ ++// *** On ARM make these NEON registers ++typedef struct pixel4_16 { ++ uint16_t x[4]; ++} pixel4_16; ++typedef struct pixel4_32 { ++ uint32_t x[4]; ++} pixel4_32; ++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) ++{ ++ pixel4_16 t = {{x, x, x, x}}; ++ return t; ++} ++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) ++{ ++ pixel4_32 t = {{x, x, x, x}}; ++ return t; ++} ++#endif ++ ++#if PRED_C ++// For chroma we double pixel size so we copy pairs ++#undef pixel ++#undef pixel2 ++#undef pixel4 ++#undef dctcoef ++#undef INIT_CLIP ++#undef no_rnd_avg_pixel4 ++#undef rnd_avg_pixel4 ++#undef AV_RN2P ++#undef AV_RN4P ++#undef AV_RN4PA ++#undef AV_WN2P ++#undef AV_WN4P ++#undef AV_WN4PA ++#undef CLIP ++#undef FUNC ++#undef FUNCC ++#undef av_clip_pixel ++#undef PIXEL_SPLAT_X4 ++ ++#if BIT_DEPTH == 8 ++#define pixel uint16_t ++#define pixel4 pixel4_16 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 ++#define cpel uint8_t ++#define c_src_ptr_t c8_src_ptr_t ++#define c_dst_ptr_t c8_dst_ptr_t ++#else ++#define pixel uint32_t ++#define pixel4 pixel4_32 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 ++#define cpel uint16_t ++#define c_src_ptr_t c16_dst_ptr_t ++#define c_dst_ptr_t c16_dst_ptr_t ++#endif ++#define AV_RN4P(p) (*(pixel4*)(p)) ++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) ++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) ++#endif ++ ++ ++// Get PW prior to horrid PRED_C trickery ++#if BIT_DEPTH == 8 ++#define PW 1 ++#else ++#define PW 2 ++#endif ++ ++ ++#if DUMP_PRED && !defined(INCLUDED_ONCE) ++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) ++{ ++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { ++ for (unsigned int x = 0; x != size; x++) { ++ printf("%4d", data[x * 2]); ++ } ++ printf("\n"); ++ } ++ printf("\n"); ++} ++#endif ++ ++#ifndef INCLUDED_ONCE ++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t v4 = v | (v << 8); ++ uint32_t * p 
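/*
 * Why PRED_C redefines "pixel" to a type twice the component width: an
 * interleaved Cb/Cr pair is moved as one unit, so the luma-shaped code
 * paths replicate whole pairs for free. A small sketch of that idea
 * (memcpy keeps it alignment- and aliasing-safe; values are arbitrary):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    const uint8_t line[8] = {1, 2, 1, 2, 1, 2, 1, 2};  // Cb,Cr,Cb,Cr,...

    uint16_t pair;                 // one 8-bit Cb/Cr pair as one "pixel"
    memcpy(&pair, line, sizeof(pair));

    uint16_t splat[4];
    for (int i = 0; i < 4; i++)
        splat[i] = pair;           // pair-wise splat, order preserved

    uint8_t out[8];
    memcpy(out, splat, sizeof(out));
    assert(memcmp(out, line, sizeof(out)) == 0);
    return 0;
}
*/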
= (uint32_t *)ptr; ++ v4 = v4 | (v4 << 16); ++ do { ++ *p++ = v4; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t v2 = v | (v << 16); ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v2; ++ *p++ = v2; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ } while (--n != 0); ++ } ++} ++ ++// Beware that this inverts the avail ordering ++// For CIP it seems easier this way round ++static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask, ++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, ++ unsigned int s0, unsigned int odd_s) ++{ ++ const unsigned int n = 1 << log2_intra_bits; ++ unsigned int fa = 0; ++ unsigned int i; ++ ++ size >>= 2; // Now in 4-pel units ++ s0 >>= 2; ++ ++ if ((avail & AVAIL_DL) != 0) ++ fa |= ((1 << s0) - 1) << (size - s0); ++ if ((avail & AVAIL_L) != 0) ++ fa |= ((1 << size) - 1) << size; ++ if ((avail & AVAIL_UL) != 0) ++ fa |= 1 << (size << 1); ++ ++ if (odd_s) { ++ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) ++ fa &= ~1; ++ is_intra += i_stride; ++ } ++ ++ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { ++ const unsigned int m = ((1 << n) - 1) << i; ++ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) ++ fa &= ~m; ++ } ++ ++ return fa; ++} ++ ++static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, ++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, ++ unsigned int s1, unsigned int odd_s) ++{ ++ if ((avail & (AVAIL_U | AVAIL_UR)) == 0) ++ { ++ return 0; ++ } ++ else ++ { ++ const unsigned int n = 1 << log2_intra_bits; ++ unsigned int fa = 0; ++ unsigned int i; ++ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; ++ ++ size >>= 2; // Now in 4-pel units ++ s1 >>= 2; ++ ++ if ((avail & AVAIL_U) != 0) ++ fa |= ((1 << size) - 1); ++ if ((avail & AVAIL_UR) != 0) ++ fa |= ((1 << s1) - 1) << size; ++ ++ if (odd_s) { ++ fa &= im | ~1; ++ im >>= 1; ++ } ++ ++ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { ++ const unsigned int m = ((1 << n) - 1) << i; ++ if ((im & 1) == 0) ++ fa &= ~m; ++ } ++ return fa; ++ } ++} ++ ++ ++ ++static inline unsigned int rmbd(unsigned int x) ++{ ++#if 1 ++ return __builtin_ctz(x); ++#else ++ unsigned int n = 0; ++ if ((x & 0xffff) == 0) { ++ x >>= 16; ++ n += 16; ++ } ++ if ((x & 0xff) == 0) { ++ x >>= 8; ++ n += 8; ++ } ++ if ((x & 0xf) == 0) { ++ x >>= 4; ++ n += 4; ++ } ++ if ((x & 0x3) == 0) { ++ x >>= 2; ++ n += 2; ++ } ++ ++ return (x & 1) == 0 ? n + 1 : n; ++#endif ++} ++#endif ++ ++ ++static void FUNC(cip_fill)(pixel * const left, pixel * const top, ++ const unsigned int avail_l, const unsigned int avail_u, ++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int size) ++{ ++ pixel a; ++ unsigned int i; ++ ++ // 1st find DL value ++ if ((avail_l & 1) == 0) { ++ if (avail_l != 0) ++ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; ++ else ++ { ++ // (avail_l | avail_u) != 0 so this must be good ++ const unsigned int n = rmbd(avail_u)*4; ++ a = (n >= size) ? 
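/*
 * The cip_avail_* helpers build a bitmask with one bit per 4-pel border
 * section (low bits nearest the down-left end), and rmbd() - effectively
 * __builtin_ctz() - then locates the nearest available section. A sketch
 * of that contract with a portable ctz fallback (illustrative only):

#include <assert.h>

static unsigned int rmbd_fallback(unsigned int x)
{
    unsigned int n = 0;
    while ((x & 1) == 0) {
        x >>= 1;
        n++;
    }
    return n;
}

int main(void)
{
    const unsigned int avail_l = 0x6;      // sections 1, 2 available; 0 not
    assert(rmbd_fallback(avail_l) == 1);   // nearest usable section is #1
    assert(rmbd_fallback(avail_l) == (unsigned int)__builtin_ctz(avail_l));
    return 0;
}
*/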
src_ur[n - size] : src_u[n]; ++ } ++ } ++ ++ // L ++ { ++ pixel * d = left + size * 2 - 1; ++ const pixel * s = src_l + (size * 2 - 1) * stride; ++ unsigned int x = avail_l; ++ for (i = 0; i < size * 2; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = a = *s; ++ s -= stride; ++ } ++ else ++ { ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ s -= stride * 4; ++ } ++ } ++ // UL ++ *d = a = (x & 1) != 0 ? *s : a; ++ } ++ ++ // U ++ { ++ pixel * d = top; ++ const pixel * s = src_u; ++ unsigned int x = avail_u; ++ ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ ++ // UR ++ s = src_ur; ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ } ++} ++ ++ ++#if !PRED_C && PW == 1 ++#define EXTEND(ptr, val, len) extend_8(ptr, val, len) ++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) ++#define EXTEND(ptr, val, len) extend_16(ptr, val, len) ++#else ++#define EXTEND(ptr, val, len) extend_32(ptr, val, len) ++#endif ++ ++// Reqs: ++// ++// Planar: DL[0], L, ul, U, UR[0] ++// DC: dl, L, ul, U, ur ++// A2-9: DL, L, ul, u, ur ++// A10: dl, L, ul, u, ur ++// A11-17 dl, L, UL, U, ur ++// A18-25 dl, L, Ul, U, ur ++// A26 dl, l, ul, U, ur ++// A27-34 dl, l, ul, U, UR ++ ++#ifndef INCLUDED_ONCE ++ ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++ ++static const uint8_t req_avail_c[35] = ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, // 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}; ++ ++static const uint8_t req_avail[4][35] = { ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, // 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) 
++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}, ++{ // 3 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | 0, // 3 ++ AVAIL_DL | AVAIL_L | 0, // 4 ++ AVAIL_DL | AVAIL_L | 0, // 5 ++ AVAIL_DL | AVAIL_L | 0, // 6 ++ AVAIL_DL | AVAIL_L | 0, // 7 ++ AVAIL_DL | AVAIL_L | 0, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | 0, // 28 ++ AVAIL_U | AVAIL_UR | 0, // 29 ++ AVAIL_U | AVAIL_UR | 0, // 30 ++ AVAIL_U | AVAIL_UR | 0, // 31 ++ AVAIL_U | AVAIL_UR | 0, // 32 ++ AVAIL_U | AVAIL_UR | 0, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 4 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 
FILTER_LIGHT, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 5 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9 ++ AVAIL_L | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25 ++ AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34 ++} ++}; ++ ++ ++#endif ++ ++#define filter_light1 FUNC(filter_light1) ++static inline pixel filter_light1(pixel a, pixel b, pixel c) ++{ ++ return (a + b*2 + c + 2) >> 2; ++} ++ ++#define filter_light FUNC(filter_light) ++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) ++{ ++ pixel p0; ++ pixel p2 = *src; ++ // Allow for final pel - it is just clearer to to have the call take the actual number of output pels ++ unsigned int n_minus_1 = n - 1; ++ ++ do ++ { ++ src += sstride; ++ p0 = p1; ++ p1 = p2; ++ p2 = *src; ++ *dst++ = filter_light1(p0, p1, p2); ++ } while (--n_minus_1 != 0); ++ *dst = filter_light1(p1, p2, pn); ++} ++ ++#define filter_strong FUNC(filter_strong) ++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) ++{ ++ unsigned int a = 64 * p0 + 32; ++ const int v = p1 - p0; ++ ++ do ++ { ++ *dst++ = (a += v) >> 6; ++ } while (--n != 0); ++} ++ ++#define intra_filter FUNC(intra_filter) ++static av_always_inline void intra_filter( ++ pixel * const left, pixel * const top, ++ const unsigned int req, const unsigned int avail, ++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int 
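/*
 * Worked numbers for the two reference smoothers defined just above:
 * filter_light1() is the [1 2 1]/4 kernel with rounding, and
 * filter_strong() is a Q6 fixed-point linear ramp across the whole edge.
 * A standalone check (constants are arbitrary):

#include <assert.h>

static unsigned int light1(unsigned int a, unsigned int b, unsigned int c)
{
    return (a + b * 2 + c + 2) >> 2;
}

int main(void)
{
    assert(light1(10, 20, 30) == 20);   // (10 + 40 + 30 + 2) >> 2
    assert(light1(0, 0, 3) == 1);       // the +2 rounds to nearest

    // filter_strong: acc starts at 64*p0 + 32 and steps by (p1 - p0) per
    // pel, so 64 outputs interpolate p0 -> p1 exactly.
    unsigned int acc = 64 * 100 + 32;
    const int v = 164 - 100;            // +64, i.e. +1 per output pel
    assert(((acc += v) >> 6) == 101);
    return 0;
}
*/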
down_left_size, ++ const unsigned int log2_size) ++{ ++ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); ++ const unsigned int size = 1 << log2_size; ++ ++ // a_ is the first pel in a section working round dl -> ur ++ // b_ is the last ++ // Beware that top & left work out from UL so usage of a_ & b_ may ++ // swap between them. It is a bad naming scheme but I have found no ++ // better ++ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; ++ const pixel * b_dl = src_l + size * stride; ++ const pixel * a_l = src_l + (size - 1) * stride; ++ const pixel * b_l = src_l; ++ const pixel * ab_ul = src_l - stride; ++ const pixel * a_u = src_u; ++ const pixel * b_u = src_u + size - 1; ++ const pixel * a_ur = src_ur; ++ const pixel * b_ur = src_ur + top_right_size - 1; ++ ++ const unsigned int want = req & ~avail; ++ const unsigned int have = req & avail; ++ unsigned int i; ++ ++ if ((avail & AVAIL_DL) == 0) ++ { ++ a_dl = a_ur; ++ if ((avail & AVAIL_U) != 0) ++ a_dl = a_u; ++ if ((avail & AVAIL_UL) != 0) ++ a_dl = ab_ul; ++ if ((avail & AVAIL_L) != 0) ++ a_dl = a_l; ++ b_dl = a_dl; ++ } ++ ++ if ((avail & AVAIL_L) == 0) ++ { ++ a_l = b_dl; ++ b_l = b_dl; ++ } ++ if ((avail & AVAIL_UL) == 0) ++ { ++ ab_ul = b_l; ++ } ++ if ((avail & AVAIL_U) == 0) ++ { ++ a_u = ab_ul; ++ b_u = ab_ul; ++ } ++ if ((avail & AVAIL_UR) == 0) ++ { ++ a_ur = b_u; ++ b_ur = b_u; ++ } ++ ++ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints ++ { ++ if ((req & AVAIL_UL) != 0) ++ left[-1] = *ab_ul; ++ ++ if ((want & AVAIL_L) != 0) ++ EXTEND(left, *a_l, size); ++ if ((want & AVAIL_DL) != 0) ++ EXTEND(left + size, *a_dl, size); ++ if ((want & AVAIL_U) != 0) ++ EXTEND(top, *a_u, size); ++ if ((want & AVAIL_UR) != 0) ++ EXTEND(top + size, *a_ur, size); ++ ++ if ((have & AVAIL_U) != 0) ++ // Always good - even with sand ++ memcpy(top, a_u, size * sizeof(pixel)); ++ if ((have & AVAIL_UR) != 0) ++ { ++ memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + top_right_size, *b_ur, ++ size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 0) ++ { ++ for (i = 0; i < size; i++) ++ left[i] = b_l[stride * i]; ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ for (i = 0; i < down_left_size; i++) ++ left[i + size] = b_dl[stride * i]; ++ EXTEND(left + size + down_left_size, *a_dl, ++ size - down_left_size); ++ } ++ } ++ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint ++ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && ++ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) ++ { ++ if ((req & (AVAIL_U | AVAIL_UR)) != 0) ++ filter_strong(top, *ab_ul, *b_ur, size * 2); ++ left[-1] = *ab_ul; ++ if ((req & (AVAIL_L | AVAIL_DL)) != 0) ++ filter_strong(left, *ab_ul, *a_dl, size*2); ++ } ++ else ++ { ++ // Same code for both have & want for UL ++ if ((req & AVAIL_UL) != 0) ++ { ++ left[-1] = filter_light1(*b_l, *ab_ul, *a_u); ++ } ++ ++ if ((want & AVAIL_L) != 0) ++ { ++ EXTEND(left, *a_l, size); ++ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; ++ } ++ if ((want & AVAIL_DL) != 0) ++ { ++ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding ++ EXTEND(left + size, *a_l, size); ++ } ++ if ((want & AVAIL_U) != 0) ++ { ++ EXTEND(top, *a_u, size); ++ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; ++ } ++ if ((want & AVAIL_UR) != 0) ++ { ++ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding ++ EXTEND(top + size, *a_ur, size); ++ } ++ ++ if ((have & AVAIL_U) != 0) ++ { 
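/*
 * The availability fixup at the top of intra_filter() floods every missing
 * border segment from its nearest available neighbour, working round
 * DL -> L -> UL -> U -> UR (DL itself falls back forwards when nothing
 * precedes it). A 1-D sketch of that rule (segment values are arbitrary):

#include <assert.h>

int main(void)
{
    enum { DL, L, UL, U, UR, N };
    const int avail[N] = {0, 1, 0, 1, 0};   // only L and U present
    int edge[N]        = {-1, 50, -1, 80, -1};

    int seed = -1;                           // seed DL from the first
    for (int i = 0; i < N && seed < 0; i++)  // segment that does exist
        if (avail[i]) seed = edge[i];
    for (int i = 0; i < N; i++) {
        if (avail[i]) seed = edge[i];
        else          edge[i] = seed;
    }
    assert(edge[DL] == 50 && edge[UL] == 50 && edge[UR] == 80);
    return 0;
}
*/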
++ filter_light(top, *ab_ul, a_u, *a_ur, 1, size); ++ } ++ if ((have & AVAIL_UR) != 0) { ++ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size); ++ top[size*2 - 1] = *b_ur; ++ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 0) ++ { ++ filter_light(left, *ab_ul, b_l, *b_dl, stride, size); ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size); ++ left[size*2 - 1] = *a_dl; ++ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size); ++ } ++ } ++} ++ ++#define INTRA_FILTER(log2_size) \ ++static void FUNC(intra_filter_ ## log2_size)( \ ++ uint8_t * const left, uint8_t * const top, \ ++ const unsigned int req, const unsigned int avail, \ ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \ ++ const unsigned int stride, \ ++ const unsigned int top_right_size, const unsigned int down_left_size) \ ++{ \ ++ intra_filter((pixel *)left, (pixel *)top, req, avail, \ ++ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \ ++} ++ ++INTRA_FILTER(2) ++INTRA_FILTER(3) ++INTRA_FILTER(4) ++INTRA_FILTER(5) ++ ++#undef intra_filter ++#undef INTRA_FILTER ++ ++static void FUNC(intra_pred)(const HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, ++ const unsigned int log2_size) ++{ ++ // c_idx will alaways be 1 for _c versions and 0 for y ++ const unsigned int c_idx = PRED_C; ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ const unsigned int size = (1 << log2_size); ++ const unsigned int x = x0 >> hshift; ++ const unsigned int y = y0 >> vshift; ++ ++ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); ++ pixel *const src = c_idx == 0 ? ++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); ++ ++ // Align so we can do multiple loads in the asm ++ // Padded to 16 byte boundary so as not to confuse anything ++ DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]); ++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ ++ pixel * const left = left_array + 16 / sizeof(pixel); ++ const pixel * top_pred = top; ++ ++ const pixel * src_l = src - 1; ++ const pixel * src_u = src - stride; ++ const pixel * src_ur = src_u + size; ++#if !PRED_C ++ const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable; ++#else ++ const unsigned int req = req_avail_c[mode]; ++#endif ++ ++ // If we have nothing to pred from then fill with grey ++ // This isn't a common case but dealing with it here means we don't have to ++ // test for it later ++ if (avail == 0) ++ { ++dc_only: ++#if !PRED_C ++ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride); ++#else ++ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride); ++#endif ++ return; ++ } ++ ++ { ++ // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs ++ const AVFrame * const frame = s->frame; ++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; ++ if ((x & mask) == 0) ++ src_l -= stripe_adj; ++ if (((x + size) & mask) == 0) ++ src_ur += stripe_adj; ++ } ++ ++ // Can deal with I-slices in 'normal' code even if CIP ++ // This also means that we don't need to generate (elsewhere) is_intra ++ // for IRAP frames ++ if (s->ps.pps->constrained_intra_pred_flag == 1 && ++ s->sh.slice_type != HEVC_SLICE_I) ++ { ++ // * If we ever actually care about CIP performance then we should ++ // special case out size 4 stuff (can be done by 'normal') and ++ // have 8-pel avail masks ++ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)), ++ -(int)(s->ps.sps->pcm_width), ++ 1 << (((x - 1) >> (3 - hshift)) & 7), ++ 1 - hshift, ++ avail, ++ size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), ++ vshift != 0 ? 0 : (y >> 2) & 1); ++ ++ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)), ++ (x >> (3 - hshift)) & 7, ++ 1 - hshift, ++ avail, ++ size, ++ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size), ++ hshift != 0 ? 0 : (x >> 2) & 1); ++ ++ // Anything left? ++ if ((avail_l | avail_u) == 0) ++ goto dc_only; ++ ++ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); ++ ++#if !PRED_C ++ if ((req & FILTER_LIGHT) != 0) ++ { ++ const unsigned threshold = 1 << (BIT_DEPTH - 5); ++ if ((req & FILTER_STRONG) != 0 && ++ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && ++ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) ++ { ++ filter_strong(top, left[-1], top[63], 64); ++ filter_strong(left, left[-1], left[63], 64); ++ } else ++ { ++ // LHS writes UL too so copy for top ++ const pixel p_ul = left[-1]; ++ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); ++ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); ++ } ++ } ++#endif ++ } ++ else ++ { ++ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); ++ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && ++ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) ++ { ++ top_pred = src_u; ++ } ++ else ++ { ++#if !PRED_C ++ s->hpc.intra_filter[log2_size - 2] ++#else ++ s->hpc.intra_filter_c[log2_size - 2] ++#endif ++ ((uint8_t *)left, (uint8_t *)top, req, avail, ++ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), ++ ur_size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); ++ } ++ } ++ ++ ++#if !PRED_C ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ default: ++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ 
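/*
 * The stripe_adj correction above only makes sense against the sand frame
 * layout: the image is stored as vertical stripes of stride1 pels, each
 * stripe stride2 rows deep, stripes back to back. This is my reading of
 * that addressing; sand_off is an illustrative stand-in for the
 * av_rpi_sand_frame_pos_* helpers:

#include <assert.h>
#include <stddef.h>

static size_t sand_off(unsigned x, unsigned y, unsigned s1, unsigned s2)
{
    return (x % s1) + (size_t)y * s1 + (size_t)(x - x % s1) * s2;
}

int main(void)
{
    const unsigned s1 = 128, s2 = 16;
    // The pel just left of a stripe boundary is a whole stripe away in
    // memory, not at offset-1 - hence src_l -= (stride2 - 1) * stride.
    assert(sand_off(128, 5, s1, s2) - sand_off(127, 5, s1, s2)
           == 1 + (size_t)(s2 - 1) * s1);
    return 0;
}
*/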
(uint8_t *)left, stride, ++ mode); ++ break; ++ } ++#else ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ default: ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ } ++ ++#if DUMP_PRED ++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); ++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); ++#endif ++#endif ++} ++ ++#if !PRED_C ++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ int size = 1 << trafo_size; ++ for (y = 0; y < size; y++) ++ for (x = 0; x < size; x++) ++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + ++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); ++} ++#else ++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, ++ const uint8_t * _left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ int size = 1 << trafo_size; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; ++ ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + ++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); ++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + ++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); ++ } ++ } ++} ++#endif ++ ++#define PRED_PLANAR(size)\ ++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_planar)(src, top, left, stride, size + 2); \ ++} ++ ++PRED_PLANAR(0) ++PRED_PLANAR(1) ++PRED_PLANAR(2) ++PRED_PLANAR(3) ++ ++#undef PRED_PLANAR ++ ++#if !PRED_C ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size) ++{ ++ int i, j, x, y; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ int dc = size; ++ pixel4 a; ++ for (i = 0; i < size; i++) ++ dc += left[i] + top[i]; ++ ++ dc >>= log2_size + 1; ++ ++ a = PIXEL_SPLAT_X4(dc); ++ ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++ ++// if (c_idx == 0 && size < 32) ++// As we now have separate fns for y & c - no need to test that ++ if (size < 32) ++ { ++ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; ++ for (x = 1; x < size; x++) ++ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; ++ for (y = 1; y < size; y++) ++ POS(0, y) = (left[y] + 3 * 
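/*
 * The planar predictor above is a bilinear blend of the left and top
 * borders with the projected far corners. Sanity check: with flat borders
 * every weight combination must reproduce the border value. A standalone
 * sketch (4x4 block, arbitrary value 7):

#include <assert.h>

static int planar1(int x, int y, int size, int log2,
                   const int *left, const int *top)
{
    return ((size - 1 - x) * left[y] + (x + 1) * top[size] +
            (size - 1 - y) * top[x] + (y + 1) * left[size] + size)
           >> (log2 + 1);
}

int main(void)
{
    const int left[5] = {7, 7, 7, 7, 7}, top[5] = {7, 7, 7, 7, 7};
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            assert(planar1(x, y, 4, 2, left, top) == 7);
    return 0;
}
*/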
dc + 2) >> 2; ++ } ++} ++#else ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; ++ unsigned int dc0 = size; ++ unsigned int dc1 = size; ++ ++ for (i = 0; i < size; i++) ++ { ++ dc0 += left[i][0] + top[i][0]; ++ dc1 += left[i][1] + top[i][1]; ++ } ++ ++ dc0 >>= log2_size + 1; ++ dc1 >>= log2_size + 1; ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = dc0; ++ src[j][1] = dc1; ++ ++ } ++ } ++} ++#endif ++ ++#define PRED_DC(size)\ ++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc)(src, top, left, stride, size + 2); \ ++} ++ ++PRED_DC(0) ++PRED_DC(1) ++PRED_DC(2) ++PRED_DC(3) ++ ++#undef PRED_DC ++ ++ ++ ++ ++#if !PRED_C ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ int i, j; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++} ++#else ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const pixel a = (1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = a; ++ src[j][1] = a; ++ } ++ } ++} ++#endif ++ ++#define PRED_DC0(size)\ ++static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc0)(src, stride, size + 2); \ ++} ++ ++PRED_DC0(0) ++PRED_DC0(1) ++PRED_DC0(2) ++PRED_DC0(3) ++ ++#undef PRED_DC0 ++ ++ ++ ++ ++#ifndef ANGLE_CONSTS ++#define ANGLE_CONSTS ++static const int intra_pred_angle[] = { ++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++}; ++static const int inv_angle[] = { ++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++ -630, -910, -1638, -4096 ++}; ++#endif ++ ++#if !PRED_C ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, ++ int mode, int size) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ ++ int angle = intra_pred_angle[mode - 2]; ++ pixel ref_array[3 * MAX_TB_SIZE + 4]; ++ pixel *ref_tmp = ref_array + size; ++ const pixel *ref; ++ int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ ++ if (angle < 0) ++ { ++ memcpy(ref_tmp + 1, top, size * PW); ++ ref_tmp[0] = left[-1]; ++ ++ for (x = last; x <= -1; x++) ++ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; ++ ref = ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++) { ++ int idx = ((y + 1) * angle) >> 5; ++ int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; x += 4) { ++ POS(x , y) = ((32 - fact) * ref[x + idx + 1] + ++ fact * ref[x + idx + 2] + 16) >> 5; ++ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] + ++ fact * ref[x + 1 + idx + 2] + 16) >> 5; ++ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] + ++ fact * ref[x + 2 + idx + 
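/*
 * pred_angular() walks the reference edge in 1/32-pel steps:
 * idx = (pos * angle) >> 5 picks the integer sample, fact = (pos * angle)
 * & 31 the sub-pel phase, and a two-tap blend finishes the job. A minimal
 * numeric check (ref values are arbitrary):

#include <assert.h>

static int angular1(int pos, int angle, const int *ref)
{
    const int idx  = (pos * angle) >> 5;
    const int fact = (pos * angle) & 31;
    return ((32 - fact) * ref[idx + 1] + fact * ref[idx + 2] + 16) >> 5;
}

int main(void)
{
    const int ref[8] = {0, 10, 20, 30, 40, 50, 60, 70};
    assert(angular1(1, 32, ref) == ref[2]);  // angle 32: diagonal, no blend
    assert(angular1(1, 16, ref) == 15);      // halfway: (16*10+16*20+16)>>5
    return 0;
}
*/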
2] + 16) >> 5; ++ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] + ++ fact * ref[x + 3 + idx + 2] + 16) >> 5; ++ } ++ } else { ++ for (x = 0; x < size; x += 4) ++ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); ++ } ++ } ++ if (mode == 26 && size < 32) { ++ for (y = 0; y < size; y++) ++ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); ++ } ++ ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ for (x = 0; x <= size; x += 4) ++ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1])); ++ // Inv angle <= -256 so top offset >= 0 ++ for (x = last; x <= -1; x++) ++ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; ++ ref = ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++) { ++ int idx = ((x + 1) * angle) >> 5; ++ int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ POS(x, y) = ((32 - fact) * ref[y + idx + 1] + ++ fact * ref[y + idx + 2] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ POS(x, y) = ref[y + idx + 1]; ++ } ++ } ++ if (mode == 10 && size < 32) { ++ for (x = 0; x < size; x += 4) { ++ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1)); ++ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1)); ++ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1)); ++ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1)); ++ } ++ } ++ } ++} ++#else ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, ++ int mode, int size) ++{ ++ int x, y; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; ++ ++ const int angle = intra_pred_angle[mode - 2]; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; ++ const int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0) { ++ memcpy(ref_tmp + 1, top, size * 2 * PW); ++ ref_tmp[0][0] = left[-1][0]; ++ ref_tmp[0][1] = left[-1][1]; ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c_src_ptr_t)ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++, src += stride) { ++ const int idx = ((y + 1) * angle) >> 5; ++ const int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; ++x) { ++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + ++ fact * ref[x + idx + 2][0] + 16) >> 5; ++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + ++ fact * ref[x + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ memcpy(src, ref + idx + 1, size * 2 * PW); ++ } ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c_src_ptr_t)ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++, src++) { ++ const int idx = ((x + 1) * angle) >> 5; ++ const int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + ++ fact * ref[y + idx + 2][0] + 16) >> 5; ++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + ++ fact * ref[y + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ { ++ src[y * stride][0] = ref[y + idx + 
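/*
 * The mode 10/26 (pure horizontal/vertical) special case above nudges the
 * first row/column by half the gradient along the perpendicular border,
 * which hides the step a plain copy would leave at the block edge. A
 * one-pel example (8-bit clip, arbitrary values):

#include <assert.h>

static int clip255(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

int main(void)
{
    const int top0 = 100, left_m1 = 90, left_y = 110;
    assert(clip255(top0 + ((left_y - left_m1) >> 1)) == 110);
    return 0;
}
*/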
1][0]; ++ src[y * stride][1] = ref[y + idx + 1][1]; ++ } ++ } ++ } ++ } ++} ++#endif ++ ++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2); ++} ++ ++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3); ++} ++ ++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4); ++} ++ ++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5); ++} ++ ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t ++ ++#undef EXTEND ++#undef POS ++#undef PW ++ ++#undef filter_light1 ++#undef filter_light ++#undef filter_strong ++#undef ref_gen ++ ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif ++ +diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c +new file mode 100644 +index 0000000000..98a0b104b7 +--- /dev/null ++++ b/libavcodec/rpi_mailbox.c +@@ -0,0 +1,155 @@ ++/* ++Copyright (c) 2012, Broadcom Europe Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define MAJOR_NUM 100 ++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *) ++#define DEVICE_FILE_NAME "/dev/vcio" ++ ++#include "rpi_mailbox.h" ++//#include ++ ++/* ++ * use ioctl to send mbox property message ++ */ ++ ++static int mbox_property(int file_desc, void *buf) ++{ ++ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf); ++ ++ if (ret_val < 0) { ++ printf("ioctl_set_msg failed:%d\n", ret_val); ++ } ++ ++#ifdef DEBUG ++ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf; ++ for (i=0; i ++#include ++#include ++#include ++ ++#include "config.h" ++ ++#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#include ++#include ++#include ++#pragma GCC diagnostic pop ++ ++#include "rpi_mem.h" ++#include "rpi_zc_frames.h" ++ ++ ++#define OPT_PREFER_CMA 0 ++ ++struct rpi_cache_flush_env_s { ++ struct vcsm_user_clean_invalid2_s v; ++}; ++ ++ ++// GPU memory alloc fns (internal) ++ ++static void gpu_free_internal(GPU_MEM_PTR_T * const p) ++{ ++ if (p->arm != NULL) ++ vcsm_unlock_ptr(p->arm); ++ if (p->vcsm_handle != 0) ++ vcsm_free(p->vcsm_handle); ++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++} ++ ++ ++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, ++ const int numbytes, const unsigned int cache_type, const char * const name) ++{ ++ memset(p, 0, sizeof(*p)); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ ++ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name); ++ goto fail; ++ } ++ if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to VC handle from VCSM for %s\n", name); ++ goto fail; ++ } ++ if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name); ++ goto fail; ++ } ++ if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ gpu_free_internal(p); ++ return AVERROR(ENOMEM); ++} ++ ++// Public gpu fns ++ ++// Allocate memory on GPU ++// Fills in structure
containing ARM pointer, videocore handle, videocore memory address, numbytes ++// Returns 0 on success. ++// This allocates memory that will not be cached in ARM's data cache. ++// Therefore safe to use without data cache flushing. ++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) ++{ ++ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached"); ++} ++ ++// This allocates data that will be ++// Cached in ARM L2 ++// Uncached in VPU L2 ++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) ++{ ++ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached"); ++} ++ ++void gpu_free(GPU_MEM_PTR_T * const p) { ++ gpu_free_internal(p); ++} ++ ++void rpi_mem_gpu_uninit(void) ++{ ++ vcsm_exit(); ++ bcm_host_deinit(); ++} ++ ++int rpi_mem_gpu_init(const unsigned int flags) ++{ ++ const int wants_cma = bcm_host_is_fkms_active(); ++ int use_cma; ++ ++ (void)flags; ++ ++ if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0) ++ use_cma = 1; ++ else if (vcsm_init_ex(wants_cma ? 0 : 1, -1) == 0) ++ use_cma = 0; ++ else ++ return AVERROR(EINVAL); ++ ++ bcm_host_init(); ++ ++ return use_cma + 1; ++} ++ ++// ---------------------------------------------------------------------------- ++// ++// Cache flush functions ++ ++#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s)) ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf) ++{ ++ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf; ++ *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}}; ++ return rfe; ++} ++ ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) ++{ ++ // Nothing needed ++} ++ ++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = 0; ++ if (rfe->v.op_count != 0) { ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ { ++ const int err = errno; ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err); ++ rc = AVERROR(err); ++ } ++ rfe->v.op_count = 0; ++ } ++ return rc; ++} ++ ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = rpi_cache_flush_execute(rfe);; ++ ++ return rc; ++} ++ ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) ++{ ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ ++ av_assert1(rfe->v.op_count <= CACHE_EL_MAX); ++ ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; ++} ++ ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset, const unsigned int size) ++{ ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; ++ ++ av_assert1(offset <= gm->numbytes); ++ av_assert1(size <= gm->numbytes); ++ av_assert1(offset + size <= gm->numbytes); ++ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); ++} ++ ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} ++ ++ ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const 
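/*
 * Intended use of the flush API above: batch every range into one env and
 * pay for a single vcsm_clean_invalid2() ioctl, as
 * rpi_cache_flush_one_gm_ptr() does for the one-buffer case. A sketch
 * against the patch's own entry points, assuming rpi_mem.h declares the
 * env/buf types used here; flush_two_ranges itself is illustrative:

#include "rpi_mem.h"

static int flush_two_ranges(const GPU_MEM_PTR_T *gm,
                            const rpi_cache_flush_mode_t mode,
                            const unsigned int a_off,
                            const unsigned int b_off,
                            const unsigned int len)
{
    rpi_cache_buf_t cbuf;
    rpi_cache_flush_env_t *const rfe = rpi_cache_flush_init(&cbuf);
    rpi_cache_flush_add_gm_range(rfe, gm, mode, a_off, len);
    rpi_cache_flush_add_gm_range(rfe, gm, mode, b_off, len);
    return rpi_cache_flush_finish(rfe);   // one ioctl covers both blocks
}
*/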
AVFrame * const frame, const unsigned int mode) ++{ ++#if !RPI_ONE_BUF ++#error Fixme! (NIF) ++#endif ++ if (gpu_is_buf1(frame)) { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); ++ } ++ else ++ { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); ++ } ++} ++ ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) ++{ ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; ++ // Round UV up/down to get everything ++ const unsigned int uv_rnd = (1U << uv_shift) >> 1; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; ++ ++#if 0 ++ // *** frame->height is cropped height so not good ++ // As all unsigned they will also reject -ve ++ // Test individually as well as added to reject overflow ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped ++ av_assert0(n <= (unsigned int)frame->height); ++ av_assert0(start_line + n <= (unsigned int)frame->height); ++#endif ++ ++ if (!gpu_is_buf1(frame)) ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++ else if (!av_rpi_is_sand_frame(frame)) ++ { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); ++ } ++ } ++ else ++ { ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); ++ ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ } ++} ++ ++// Call this to clean and invalidate a region of memory ++void 
rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) ++{ ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_gm_ptr(rfe, p, mode); ++ rpi_cache_flush_finish(rfe); ++} ++ +diff --git a/libavcodec/rpi_mem.h b/libavcodec/rpi_mem.h +new file mode 100644 +index 0000000000..a451079806 +--- /dev/null ++++ b/libavcodec/rpi_mem.h +@@ -0,0 +1,88 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#ifndef RPI_MEM_H ++#define RPI_MEM_H ++ ++typedef struct gpu_mem_ptr_s { ++ unsigned char *arm; // Pointer to memory mapped on ARM side ++ int vc_handle; // Videocore handle of relocatable memory ++ int vcsm_handle; // Handle for use by VCSM ++ int vc; // Address for use in GPU code ++ int numbytes; // Size of memory block ++} GPU_MEM_PTR_T; ++ ++// General GPU functions ++ ++#define GPU_INIT_GPU 1 ++#define GPU_INIT_CMA 2 ++ ++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); ++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); ++extern void gpu_free(GPU_MEM_PTR_T * const p); ++int rpi_mem_gpu_init(const unsigned int flags); ++void rpi_mem_gpu_uninit(void); ++ ++// Cache flush stuff ++ ++struct rpi_cache_flush_env_s; ++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; ++ ++typedef struct {uint32_t t[33];} rpi_cache_buf_t; ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf); ++// Free env without flushing ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & clear but do not free the env ++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & free the env ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); ++ ++typedef enum ++{ ++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, ++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, ++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 ++} rpi_cache_flush_mode_t; ++ ++struct AVFrame; ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, ++ const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); ++ ++// init, add, finish for one gm ptr ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); ++ ++#endif +diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c +new file mode 100644 +index 0000000000..cb7b96119e +--- /dev/null ++++ b/libavcodec/rpi_qpu.c +@@ -0,0 +1,776 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. 
++ * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <stdint.h>
++#include <semaphore.h>
++#include "libavutil/avassert.h"
++
++#include "config.h"
++
++#include <pthread.h>
++#include <time.h>
++
++#include <interface/vcsm/user-vcsm.h>
++
++#include "rpi_mailbox.h"
++#include "rpi_mem.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_transform8.h"
++#include "rpi_hevc_transform10.h"
++#include "libavutil/rpi_sand_fns.h"
++
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
++
++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
++// Beware this is expensive and will probably throw off all other timing by >10%
++#define RPI_TRACE_QPU_PROFILE_ALL 0
++
++// QPU "noflush" flags
++// a mixture of flushing & profiling
++
++#define QPU_FLAGS_NO_FLUSH_VPU          1   // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2   // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS    4   // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES      8   // Print QPU times - independent of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU          16  // If unset flush QPU caches & TMUs (uniforms always flushed)
++
++#define vcos_verify_ge0(x) ((x)>=0)
++
++// Size in 32bit words
++#define QPU_CODE_SIZE 4098
++#define VPU_CODE_SIZE 16384
++
++static const short rpi_transMatrix2even[32][16] = { // Even rows first
++{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
++{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
++{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
++{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
++{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
++{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
++{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
++{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
++{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
++{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
++{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
++{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
++{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
++{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
++{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
++{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80,
-70, 57, -43, 25, -9}, ++// Odd rows ++{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4}, ++{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13}, ++{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22}, ++{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31}, ++{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38}, ++{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46}, ++{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54}, ++{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61}, ++{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67}, ++{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73}, ++{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78}, ++{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82}, ++{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85}, ++{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88}, ++{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90}, ++{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} ++}; ++ ++// Code/constants on GPU ++struct GPU ++{ ++// unsigned int qpu_code[QPU_CODE_SIZE]; ++ unsigned int vpu_code8[VPU_CODE_SIZE]; ++ unsigned int vpu_code10[VPU_CODE_SIZE]; ++ short transMatrix2even[16*16*2]; ++}; ++ ++#define WAIT_COUNT_MAX 16 ++ ++typedef struct trace_time_one_s ++{ ++ int count; ++ int64_t start[WAIT_COUNT_MAX]; ++ int64_t total[WAIT_COUNT_MAX]; ++} trace_time_one_t; ++ ++typedef struct trace_time_wait_s ++{ ++ unsigned int jcount; ++ int64_t start0; ++ int64_t last_update; ++ trace_time_one_t active; ++ trace_time_one_t wait; ++} trace_time_wait_t; ++ ++typedef struct vq_wait_s ++{ ++ sem_t sem; ++ struct vq_wait_s * next; ++} vq_wait_t; ++ ++#define VQ_WAIT_POOL_SIZE 16 ++typedef struct vq_wait_pool_s ++{ ++ vq_wait_t * head; ++ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; ++} vq_wait_pool_t; ++ ++static void vq_wait_pool_init(vq_wait_pool_t * const pool); ++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); ++ ++typedef struct gpu_env_s ++{ ++ int open_count; ++ int init_count; ++ int vpu_i_cache_flushed; ++ GPU_MEM_PTR_T qpu_code_gm_ptr; ++ GPU_MEM_PTR_T code_gm_ptr; ++ GPU_MEM_PTR_T dummy_gm_ptr; ++ vq_wait_pool_t wait_pool; ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ trace_time_wait_t ttw; ++#endif ++} gpu_env_t; ++ ++// Stop more than one thread trying to allocate memory or use the processing resources at once ++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; ++static gpu_env_t * gpu = NULL; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ ++static int64_t ns_time(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; ++} ++ ++ ++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 ++ ++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) ++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) ++#define T_ARG(t) T_SEC(t), T_MS(t) ++#define T_FMT "%u.%03u" ++ ++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) ++{ ++ // Update totals for levels that are still pending ++ for (int i = 0; i < tto->count; ++i) { ++ tto->total[i] += now - tto->start[i]; ++ tto->start[i] = now; ++ } ++ ++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", ++ prefix, ++ T_ARG(now - start0 
- tto->total[0]),
++        T_ARG(tto->total[0]),
++        T_ARG(tto->total[1]),
++        T_ARG(tto->total[2]),
++        T_ARG(tto->total[3]));
++}
++
++
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++    av_assert0(tto->count < WAIT_COUNT_MAX);
++    tto->start[tto->count++] = now;
++}
++
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++    const int n = --tto->count;
++    av_assert0(n >= 0);
++    tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++    printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++    tto_print(&ttw->active, now, ttw->start0, "Active");
++    tto_print(&ttw->wait, now, ttw->start0, "  Wait");
++}
++
++#endif
++
++// GPU memory alloc fns (internal)
++
++static void gpu_free_internal(GPU_MEM_PTR_T * const p)
++{
++    if (p->arm != NULL)
++        vcsm_unlock_ptr(p->arm);
++    if (p->vcsm_handle != 0)
++        vcsm_free(p->vcsm_handle);
++    memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
++}
++
++
++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
++    const int numbytes, const unsigned int cache_type, const char * const name)
++{
++    memset(p, 0, sizeof(*p));
++    p->numbytes = (numbytes + 255) & ~255;  // Round up
++
++    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
++        (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
++        (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
++        (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
++    {
++        gpu_free_internal(p);
++        return AVERROR(ENOMEM);
++    }
++    return 0;
++}
++
++
++// GPU init, free, lock, unlock
++
++static void gpu_term(void)
++{
++    gpu_env_t * const ge = gpu;
++
++    // We have to hope that everything has terminated...
++    gpu = NULL;
++
++    vc_gpuserv_deinit();
++
++    gpu_free_internal(&ge->code_gm_ptr);
++    gpu_free_internal(&ge->qpu_code_gm_ptr);
++    gpu_free_internal(&ge->dummy_gm_ptr);
++
++    vcsm_exit();
++
++    vq_wait_pool_deinit(&ge->wait_pool);
++
++    free(ge);
++}
++
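++// Editorial note: a minimal sketch of how the alloc and cache-flush helpers
++// above fit together (illustrative only, not part of the original patch;
++// assumes rpi_mem_gpu_init() has already been called, and the 64-byte buffer
++// is made up):
++#if 0
++static int example_gpu_mem_use(void)
++{
++    GPU_MEM_PTR_T gm;
++    int rv = gpu_malloc_cached(64, &gm);    // ARM-cached, uncached on the VPU
++    if (rv != 0)
++        return rv;
++
++    memset(gm.arm, 0, 64);                  // CPU writes through the ARM mapping
++    // Write the dirty lines back so the VPU/QPU sees the new data
++    rpi_cache_flush_one_gm_ptr(&gm, RPI_CACHE_FLUSH_MODE_WRITEBACK);
++    // ... pass gm.vc (the VC bus address) to VPU/QPU code here ...
++
++    gpu_free(&gm);
++    return 0;
++}
++#endif
++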
++// Connect to QPU, returns 0 on success.
++static int gpu_init(gpu_env_t ** const gpu) {
++    volatile struct GPU* ptr;
++    gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++    int rv;
++    *gpu = NULL;
++
++    if (ge == NULL)
++        return -1;
++
++    vq_wait_pool_init(&ge->wait_pool);
++
++    vcsm_init();
++
++    // Now copy over the QPU code into GPU memory
++    if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
++        return rv;
++
++    {
++        int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
++        av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
++        memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
++    }
++
++    // And the VPU code
++    if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
++        return rv;
++    ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
++
++    // Zero everything so we have zeros between the code bits
++    memset((void *)ptr, 0, sizeof(*ptr));
++    {
++        int num_bytes = sizeof(rpi_hevc_transform8);
++        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
++    }
++    {
++        int num_bytes = sizeof(rpi_hevc_transform10);
++        av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
++    }
++    // And the transform coefficients
++    memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
++
++    // Generate a dummy "frame" & fill with 0x80
++    // * Could reset to 1 << (bit_depth - 1)?
++    if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
++        return rv;
++    memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
++
++    *gpu = ge;
++    return 0;
++}
++
++
++
++static void gpu_unlock(void) {
++    pthread_mutex_unlock(&gpu_mutex);
++}
++
++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
++static gpu_env_t * gpu_lock(void) { ++ pthread_mutex_lock(&gpu_mutex); ++ ++ av_assert1(gpu != NULL); ++ return gpu; ++} ++ ++static gpu_env_t * gpu_lock_ref(void) ++{ ++ pthread_mutex_lock(&gpu_mutex); ++ ++ if (gpu == NULL) { ++ int rv = gpu_init(&gpu); ++ if (rv != 0) { ++ gpu_unlock(); ++ return NULL; ++ } ++ } ++ ++ ++gpu->open_count; ++ return gpu; ++} ++ ++static void gpu_unlock_unref(gpu_env_t * const ge) ++{ ++ if (--ge->open_count == 0) ++ gpu_term(); ++ ++ gpu_unlock(); ++} ++ ++static inline gpu_env_t * gpu_ptr(void) ++{ ++ av_assert1(gpu != NULL); ++ return gpu; ++} ++ ++unsigned int vpu_get_fn(const unsigned int bit_depth) { ++ uint32_t a = 0; ++ ++ // Make sure that the gpu is initialized ++ av_assert1(gpu != NULL); ++ switch (bit_depth){ ++ case 8: ++ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); ++ break; ++ case 10: ++ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); ++ break; ++ default: ++ av_assert0(0); ++ } ++ return a; ++} ++ ++unsigned int vpu_get_constants(void) { ++ av_assert1(gpu != NULL); ++ return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even)); ++} ++ ++void gpu_ref(void) ++{ ++ gpu_lock_ref(); ++ gpu_unlock(); ++} ++ ++void gpu_unref(void) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_unlock_unref(ge); ++} ++ ++// ---------------------------------------------------------------------------- ++ ++ ++// Wait abstractions - mostly so we can easily add profile code ++static void vq_wait_pool_init(vq_wait_pool_t * const wp) ++{ ++ unsigned int i; ++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { ++ sem_init(&wp->pool[i].sem, 0, 0); ++ wp->pool[i].next = wp->pool + i + 1; ++ } ++ wp->head = wp->pool + 0; ++ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL; ++} ++ ++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) ++{ ++ unsigned int i; ++ wp->head = NULL; ++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { ++ sem_destroy(&wp->pool[i].sem); ++ wp->pool[i].next = NULL; ++ } ++} ++ ++ ++// If sem_init actually takes time then maybe we want a pool... 
++static vq_wait_t * vq_wait_new(void) ++{ ++ gpu_env_t * const ge = gpu_lock_ref(); ++ vq_wait_t * const wait = ge->wait_pool.head; ++ ge->wait_pool.head = wait->next; ++ wait->next = NULL; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ tto_start(&ge->ttw.active, ns_time()); ++#endif ++ ++ gpu_unlock(); ++ return wait; ++} ++ ++static void vq_wait_delete(vq_wait_t * const wait) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ wait->next = ge->wait_pool.head; ++ ge->wait_pool.head = wait; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ trace_time_wait_t * const ttw = &ge->ttw; ++ const int64_t now = ns_time(); ++ ++ttw->jcount; ++ tto_end(&ttw->wait, now); ++ ++ if (ttw->start0 == 0) ++ { ++ ttw->start0 = ttw->active.start[0]; ++ ttw->last_update = ttw->start0; ++ } ++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) ++ { ++ ttw->last_update += WAIT_TIME_PRINT_PERIOD; ++ ttw_print(ttw, now); ++ } ++ } ++#endif ++ gpu_unlock_unref(ge); ++} ++ ++static void vq_wait_wait(vq_wait_t * const wait) ++{ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ const int64_t now = ns_time(); ++ gpu_env_t * const ge = gpu_lock(); ++ tto_start(&ge->ttw.wait, now); ++ gpu_unlock(); ++ } ++#endif ++ ++ while (sem_wait(&wait->sem) == -1 && errno == EINTR) ++ /* loop */; ++} ++ ++static void vq_wait_post(vq_wait_t * const wait) ++{ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ gpu_env_t *const ge = gpu_lock(); ++ tto_end(&ge->ttw.active, ns_time()); ++ gpu_unlock(); ++ } ++#endif ++ ++ sem_post(&wait->sem); ++} ++ ++ ++ ++// Header comments were wrong for these two ++#define VPU_QPU_MASK_QPU 1 ++#define VPU_QPU_MASK_VPU 2 ++ ++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; ++ ++vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf) ++{ ++// vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); ++ vpu_qpu_job_env_t * vqj = buf; ++// memset(vqj, 0, sizeof(*vqj)); ++ vqj->n = 0; ++ vqj->mask = 0; ++ return vqj; ++} ++ ++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) ++{ ++// memset(vqj, 0, sizeof(*vqj)); ++// free(vqj); ++} ++ ++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj) ++{ ++ struct gpu_job_s * const j = vqj->j + vqj->n++; ++ av_assert1(vqj->n <= VPU_QPU_JOB_MAX); ++ return j; ++} ++ ++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code, ++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5) ++{ ++ if (vpu_code != 0) { ++ struct gpu_job_s *const j = new_job(vqj); ++ vqj->mask |= VPU_QPU_MASK_VPU; ++ ++ j->command = EXECUTE_VPU; ++ j->callback.func = 0; ++ j->callback.cookie = NULL; ++ // The bottom two bits of the execute address contain no-flush flags ++ // b0 will flush the VPU I-cache if unset so we nearly always want that set ++ // as we never reload code ++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; ++ j->u.v.q[1] = r0; ++ j->u.v.q[2] = r1; ++ j->u.v.q[3] = r2; ++ j->u.v.q[4] = r3; ++ j->u.v.q[5] = r4; ++ j->u.v.q[6] = r5; ++ gpu->vpu_i_cache_flushed = 1; ++ } ++} ++ ++// flags are QPU_FLAGS_xxx ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) ++{ ++ if (n != 0) { ++ struct gpu_job_s *const j = new_job(vqj); ++ vqj->mask |= VPU_QPU_MASK_QPU; ++ ++ j->command = EXECUTE_QPU; ++ j->callback.func = 0; ++ j->callback.cookie = NULL; ++ ++ j->u.q.jobs = n; ++#if RPI_TRACE_QPU_PROFILE_ALL ++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; ++#else ++ 
j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; ++#endif ++ j->u.q.timeout = 5000; ++ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); ++ } ++} ++ ++// Convert callback to sem post ++static void vpu_qpu_job_callback_wait(void * v) ++{ ++ vq_wait_post(v); ++} ++ ++// Poke a user-supplied sem ++static void vpu_qpu_job_callback_sem(void * v) ++{ ++ sem_post((sem_t *)v); ++} ++ ++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) ++{ ++ vq_wait_t * wait; ++ ++ if (vqj->mask == 0) { ++ *wait_h = NULL; ++ return; ++ } ++ ++ // We are going to want a sync object ++ wait = vq_wait_new(); ++ ++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync ++ // If we only posted one thing or only QPU jobs ++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) ++ { ++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); ++ av_assert1(j->callback.func == 0); ++ ++ j->callback.func = vpu_qpu_job_callback_wait; ++ j->callback.cookie = wait; ++ } ++ else ++ { ++ struct gpu_job_s *const j = new_job(vqj); ++ ++ j->command = EXECUTE_SYNC; ++ j->u.s.mask = vqj->mask; ++ j->callback.func = vpu_qpu_job_callback_wait; ++ j->callback.cookie = wait; ++ } ++ ++ vqj->mask = 0; ++ *wait_h = wait; ++} ++ ++// Returns 0 if no sync added ('cos Q empty), 1 if sync added ++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem) ++{ ++ // If nothing on q then just return ++ if (vqj->mask == 0) ++ return 0; ++ ++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync ++ // If we only posted one thing or only QPU jobs ++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) ++ { ++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); ++ av_assert1(j->callback.func == 0); ++ ++ j->callback.func = vpu_qpu_job_callback_sem; ++ j->callback.cookie = sem; ++ } ++ else ++ { ++ struct gpu_job_s *const j = new_job(vqj); ++ ++ j->command = EXECUTE_SYNC; ++ j->u.s.mask = vqj->mask; ++ j->callback.func = vpu_qpu_job_callback_sem; ++ j->callback.cookie = sem; ++ } ++ ++ vqj->mask = 0; ++ return 1; ++} ++ ++ ++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) ++{ ++ if (vqj->n == 0) ++ return 0; ++ ++ return vc_gpuserv_execute_code(vqj->n, vqj->j); ++} ++ ++// Simple wrapper of start + delete ++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj) ++{ ++ int rv; ++ rv = vpu_qpu_job_start(vqj); ++ vpu_qpu_job_delete(vqj); ++ return rv; ++} ++ ++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) ++{ ++ if (wait_h != NULL) ++ { ++ vq_wait_t * const wait = *wait_h; ++ if (wait != NULL) { ++ *wait_h = NULL; ++ vq_wait_wait(wait); ++ vq_wait_delete(wait); ++ } ++ } ++} ++ ++int vpu_qpu_init() ++{ ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ ++ if (ge->init_count++ == 0) ++ { ++ vc_gpuserv_init(); ++ } ++ ++ gpu_unlock(); ++ return 0; ++} ++ ++void vpu_qpu_term() ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ ++ if (--ge->init_count == 0) { ++ vc_gpuserv_deinit(); ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ ttw_print(&ge->ttw, ns_time()); ++#endif ++ } ++ ++ gpu_unlock_unref(ge); ++} ++ ++uint32_t qpu_fn(const int * const mc_fn) ++{ ++ return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader); ++} ++ ++uint32_t qpu_dummy(void) ++{ ++ return gpu->dummy_gm_ptr.vc; ++} ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) ++{ ++ // Dummy values we can catch with emulation ++ qf->y_pxx = ~1U; ++ qf->y_bxx = ~2U; ++ qf->y_p00 = ~3U; ++ qf->y_b00 = ~4U; ++ qf->c_pxx = ~5U; ++ qf->c_bxx = ~6U; ++ 
++    switch (bit_depth) {
++        case 8:
++            qf->y_pxx = qpu_fn(mc_filter_y_pxx);
++            qf->y_bxx = qpu_fn(mc_filter_y_bxx);
++            qf->y_p00 = qpu_fn(mc_filter_y_p00);
++            qf->y_b00 = qpu_fn(mc_filter_y_b00);
++            qf->c_pxx = qpu_fn(mc_filter_c_p);
++            qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
++            qf->c_bxx = qpu_fn(mc_filter_c_b);
++            break;
++        case 10:
++            qf->c_pxx = qpu_fn(mc_filter_c10_p);
++            qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
++            qf->c_bxx = qpu_fn(mc_filter_c10_b);
++            qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
++            qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
++            qf->y_p00 = qpu_fn(mc_filter_y10_p00);
++            qf->y_b00 = qpu_fn(mc_filter_y10_b00);
++            break;
++        default:
++            return -1;
++    }
++    return 0;
++}
++
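++// Editorial note: a minimal sketch of the intended job-queue usage of the
++// API above (illustrative only, not part of the original patch; assumes
++// vpu_qpu_init() has been called and that vpu_code/r0/r1 come from
++// vpu_get_fn() and friends):
++#if 0
++static void example_vpu_job(const uint32_t vpu_code, const unsigned int r0, const unsigned int r1)
++{
++    vpu_qpu_job_env_t buf;
++    vpu_qpu_wait_h sync;
++    const vpu_qpu_job_h vqj = vpu_qpu_job_init(&buf);
++
++    // Queue a single VPU call; unused argument registers may be 0
++    vpu_qpu_job_add_vpu(vqj, vpu_code, r0, r1, 0, 0, 0, 0);
++    // Request a sync callback covering everything queued so far
++    vpu_qpu_job_add_sync_this(vqj, &sync);
++    // Submit the batch and release the job env
++    if (vpu_qpu_job_finish(vqj) == 0)
++        vpu_qpu_wait(&sync);    // Blocks until the queued work completes
++}
++#endif
++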
+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+new file mode 100644
+index 0000000000..8777687021
+--- /dev/null
++++ b/libavcodec/rpi_qpu.h
+@@ -0,0 +1,103 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_QPU_H
++#define RPI_QPU_H
++
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#pragma GCC diagnostic ignored "-Wstrict-prototypes"
++#include "interface/vmcs_host/vc_vchi_gpuserv.h"  // for gpu_job_s
++#pragma GCC diagnostic pop
++
++// QPU specific functions
++
++typedef struct HEVCRpiQpu {
++    uint32_t c_pxx;
++    uint32_t c_pxx_l1;
++    uint32_t c_bxx;
++    uint32_t y_pxx;
++    uint32_t y_bxx;
++    uint32_t y_p00;
++    uint32_t y_b00;
++} HEVCRpiQpu;
++
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
++
++uint32_t qpu_fn(const int * const mc_fn);
++uint32_t qpu_dummy(void);
++
++#define QPU_N_GRP 4
++#define QPU_N_MAX 12
++
++#define QPU_MAIL_EL_VALS 2
++
++struct vpu_qpu_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
++
++// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
++{
++    unsigned int n;
++    unsigned int mask;
++    struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++    const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++extern unsigned int vpu_get_fn(const unsigned int bit_depth);
++extern unsigned int vpu_get_constants(void);
++
++// Waits for the previously posted code to complete and will null out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
++
++void gpu_ref(void);
++void gpu_unref(void);
++
++#endif
+diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
+new file mode 100644
+index 0000000000..37be9a0f49
+--- /dev/null
++++ b/libavcodec/rpi_zc.c
+@@ -0,0 +1,1227 @@
++#include "config.h"
++
++#include "libavcodec/avcodec.h"
++#include "rpi_mem.h"
++#include "rpi_mailbox.h"
++#include "rpi_zc.h"
++#include "libavutil/avassert.h"
++#include <pthread.h>
++
++#include "libavutil/buffer_internal.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <interface/vctypes/vc_image_types.h>
++#include <interface/vcsm/user-vcsm.h>
++#pragma GCC diagnostic pop
++
++#define TRACE_ALLOC 0
++#define DEBUG_ALWAYS_KEEP_LOCKED 0
++
++struct ZcPoolEnt;
++
++typedef struct ZcPool
++{
++    size_t numbytes;
++    struct ZcPoolEnt * head;
++    pthread_mutex_t lock;
++} ZcPool;
++
++typedef struct ZcPoolEnt
++{
++    size_t numbytes;
++
++    unsigned int vcsm_handle;
++    unsigned int vc_handle;
++    void * map_arm;
++    unsigned int map_vc;
++
++    struct ZcPoolEnt * next;
++    struct ZcPool * pool;
++} ZcPoolEnt;
++
++typedef struct ZcOldCtxVals
++{
++    int thread_safe_callbacks;
++    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
++    void * opaque;
++} ZcOldCtxVals;
++
++typedef struct AVZcEnv
++{
++
unsigned int refcount; ++ ZcOldCtxVals old; ++ ++ void * pool_env; ++ av_rpi_zc_alloc_buf_fn_t * alloc_buf; ++ av_rpi_zc_free_pool_fn_t * free_pool; ++ ++ unsigned int pool_size; ++} ZcEnv; ++ ++typedef struct ZcUserBufEnv { ++ void * v; ++ const av_rpi_zc_buf_fn_tab_t * fn; ++ size_t numbytes; ++ int offset; ++} ZcUserBufEnv; ++ ++#define ZC_BUF_INVALID 0 ++#define ZC_BUF_VALID 1 ++#define ZC_BUF_NEVER 2 ++ ++typedef struct ZcBufEnv { ++ GPU_MEM_PTR_T gmem; ++ AVZcEnvPtr zc; ++ int is_valid; ++ AVBufferRef * user; ++ AVRpiZcFrameGeometry geo; ++ size_t size_y; ++ size_t size_c; ++ size_t size_pic; ++ ssize_t offset; ++ pthread_mutex_t lock; ++ pthread_cond_t cond; ++} ZcBufEnv; ++ ++ ++ ++ ++ ++ ++#define ALLOC_PAD 0 ++#define ALLOC_ROUND 0x1000 ++#define STRIDE_ROUND 64 ++#define STRIDE_OR 0 ++ ++#define DEBUG_ZAP0_BUFFERS 0 ++ ++static inline int av_rpi_is_sand_format(const int format) ++{ ++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) || ++ (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10); ++} ++ ++static inline int av_rpi_is_sand_frame(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand_format(frame->format); ++} ++ ++//---------------------------------------------------------------------------- ++// ++// Internal pool stuff ++ ++// Pool entry functions ++ ++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size) ++{ ++ ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt)); ++ ++ // Round up to 4k & add 4k ++ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); ++ ++ if (zp == NULL) { ++ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); ++ goto fail0; ++ } ++ ++ // The 0x80 here maps all pages here rather than waiting for lazy mapping ++ // BEWARE that in GPU land a later unlock/lock pair will put us back into ++ // lazy mode - which will also break cache invalidate calls. 
++ if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); ++ goto fail1; ++ } ++ ++#if TRACE_ALLOC ++ printf("%s: Alloc %#x bytes @ h=%d\n", __func__, alloc_size, zp->vcsm_handle); ++#endif ++ ++ zp->numbytes = alloc_size; ++ zp->pool = pool; ++ return zp; ++ ++fail1: ++ av_free(zp); ++fail0: ++ return NULL; ++} ++ ++static void zc_pool_ent_free(ZcPoolEnt * const zp) ++{ ++#if TRACE_ALLOC ++ printf("%s: Free %#x bytes @ h=%d\n", __func__, zp->numbytes, zp->vcsm_handle); ++#endif ++ ++ if (zp->vcsm_handle != 0) ++ { ++ // VC addr & handle need no dealloc ++ if (zp->map_arm != NULL) ++ vcsm_unlock_hdl(zp->vcsm_handle); ++ vcsm_free(zp->vcsm_handle); ++ } ++ av_free(zp); ++} ++ ++//---------------------------------------------------------------------------- ++// ++// Pool functions ++ ++static void zc_pool_free_ent_list(ZcPoolEnt * p) ++{ ++ while (p != NULL) ++ { ++ ZcPoolEnt * const zp = p; ++ p = p->next; ++ zc_pool_ent_free(zp); ++ } ++} ++ ++static void zc_pool_flush(ZcPool * const pool) ++{ ++ ZcPoolEnt * p = pool->head; ++ pool->head = NULL; ++ pool->numbytes = ~0U; ++ zc_pool_free_ent_list(p); ++} ++ ++static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes) ++{ ++ ZcPoolEnt * zp = NULL; ++ ZcPoolEnt * flush_list = NULL; ++ size_t numbytes; ++ ++ pthread_mutex_lock(&pool->lock); ++ ++ numbytes = pool->numbytes; ++ ++ // If size isn't close then dump the pool ++ // Close in this context means within 128k ++ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) ++ { ++ flush_list = pool->head; ++ pool->head = NULL; ++ pool->numbytes = numbytes = req_bytes; ++ } ++ else if (pool->head != NULL) ++ { ++ zp = pool->head; ++ pool->head = zp->next; ++ } ++ ++ pthread_mutex_unlock(&pool->lock); ++ ++ zc_pool_free_ent_list(flush_list); ++ ++ if (zp == NULL) ++ zp = zc_pool_ent_alloc(pool, numbytes); ++ ++ return zp; ++} ++ ++static void zc_pool_put_ent(ZcPoolEnt * const zp) ++{ ++ ZcPool * const pool = zp == NULL ? NULL : zp->pool; ++ if (zp != NULL) ++ { ++ pthread_mutex_lock(&pool->lock); ++#if TRACE_ALLOC ++ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->numbytes); ++#endif ++ ++ if (pool->numbytes == zp->numbytes) ++ { ++ zp->next = pool->head; ++ pool->head = zp; ++ pthread_mutex_unlock(&pool->lock); ++ } ++ else ++ { ++ pthread_mutex_unlock(&pool->lock); ++ zc_pool_ent_free(zp); ++ } ++ } ++} ++ ++static ZcPool * ++zc_pool_new(void) ++{ ++ ZcPool * const pool = av_mallocz(sizeof(*pool)); ++ if (pool == NULL) ++ return NULL; ++ ++ pool->numbytes = -1; ++ pool->head = NULL; ++ pthread_mutex_init(&pool->lock, NULL); ++ return pool; ++} ++ ++static void ++zc_pool_delete(ZcPool * const pool) ++{ ++ if (pool != NULL) ++ { ++ pool->numbytes = -1; ++ zc_pool_flush(pool); ++ pthread_mutex_destroy(&pool->lock); ++ av_free(pool); ++ } ++} ++ ++//============================================================================ ++// ++// ZC implementation using above pool implementation ++// ++// Fn table fns... 
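++// Editorial note: av_rpi_zc_buf() (below) only ever talks to an allocator
++// through this fn table, so an external allocator just has to provide the
++// five callbacks. A sketch of the shape such a table takes (illustrative
++// only; the my_* helpers are assumptions, not part of the original patch):
++#if 0
++static const av_rpi_zc_buf_fn_tab_t my_buf_fns = {
++    .free        = my_free,          // Release the underlying allocation
++    .vcsm_handle = my_vcsm_handle,   // Return the VCSM handle
++    .vc_handle   = my_vc_handle,     // Return the VC handle derived from it
++    .map_arm     = my_map_arm,       // Return (lazily create) the ARM mapping
++    .map_vc      = my_map_vc,        // Return (lazily create) the VC bus address
++};
++// A buffer is then wrapped with av_rpi_zc_buf(size, 0, my_ctx, &my_buf_fns).
++#endif
++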
++ ++static void zc_pool_free_v(void * v) ++{ ++ zc_pool_put_ent(v); ++} ++ ++static unsigned int zc_pool_ent_vcsm_handle_v(void * v) ++{ ++ ZcPoolEnt * zp = v; ++ return zp->vcsm_handle; ++} ++ ++static unsigned int zc_pool_ent_vc_handle_v(void * v) ++{ ++ ZcPoolEnt * zp = v; ++ if (zp->vc_handle == 0) ++ { ++ if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0) ++ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n", ++ __func__, zp->vcsm_handle); ++ } ++ return zp->vc_handle; ++} ++ ++static void * zc_pool_ent_map_arm_v(void * v) ++{ ++ ZcPoolEnt * zp = v; ++ if (zp->map_arm == NULL) ++ { ++ if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL) ++ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n", ++ __func__, zp->vcsm_handle); ++ } ++ return zp->map_arm; ++} ++ ++static unsigned int zc_pool_ent_map_vc_v(void * v) ++{ ++ ZcPoolEnt * zp = v; ++ if (zp->map_vc == 0) ++ { ++ if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0) ++ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n", ++ __func__, zp->vcsm_handle); ++ } ++ return zp->map_vc; ++} ++ ++static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = { ++ .free = zc_pool_free_v, ++ .vcsm_handle = zc_pool_ent_vcsm_handle_v, ++ .vc_handle = zc_pool_ent_vc_handle_v, ++ .map_arm = zc_pool_ent_map_arm_v, ++ .map_vc = zc_pool_ent_map_vc_v, ++}; ++ ++// ZC Env fns ++ ++// Delete pool ++// All buffers guaranteed freed by now ++static void ++zc_pool_delete_v(void * v) ++{ ++ zc_pool_delete((ZcPool *)v); ++ rpi_mem_gpu_uninit(); ++} ++ ++// Allocate a new ZC buffer ++static AVBufferRef * ++zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo) ++{ ++ ZcPool * const pool = v; ++ ZcPoolEnt *const zp = zc_pool_get_ent(pool, size); ++ AVBufferRef * buf; ++ ++ (void)geo; // geo ignored here ++ ++ if (zp == NULL) { ++ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); ++ goto fail0; ++ } ++ ++ if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n"); ++ goto fail2; ++ } ++ ++ return buf; ++ ++fail2: ++ zc_pool_put_ent(zp); ++fail0: ++ return NULL; ++} ++ ++// Init wrappers - the public fns ++ ++AVZcEnvPtr ++av_rpi_zc_int_env_alloc(void * logctx) ++{ ++ ZcEnv * zc; ++ ZcPool * pool_env; ++ ++ if (rpi_mem_gpu_init(0) < 0) ++ return NULL; ++ ++ if ((pool_env = zc_pool_new()) == NULL) ++ goto fail1; ++ ++ if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL) ++ goto fail2; ++ ++ return zc; ++ ++fail2: ++ zc_pool_delete(pool_env); ++fail1: ++ rpi_mem_gpu_uninit(); ++ return NULL; ++} ++ ++void ++av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp) ++{ ++ const AVZcEnvPtr zc = *zcp; ++ *zcp = NULL; ++ if (zc != NULL) ++ av_rpi_zc_env_release(zc); ++} ++ ++//============================================================================ ++// ++// Geometry ++// ++// This is a separate chunck to the rest ++ ++// Get mailbox fd - should be in a lock when called ++// Rely on process close to close it ++static int mbox_fd(void) ++{ ++ static int fd = -1; ++ if (fd != -1) ++ return fd; ++ return (fd = mbox_open()); ++} ++ ++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( ++ const int format, const unsigned int video_width, const unsigned int video_height) ++{ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ ++ AVRpiZcFrameGeometry geo = { ++ .format = format, ++ .video_width = video_width, ++ .video_height = 
video_height ++ }; ++ ++ switch (format) ++ { ++ case AV_PIX_FMT_YUV420P: ++ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 1; ++ geo.stripe_is_yc = 1; ++ break; ++ ++ case AV_PIX_FMT_YUV420P10: ++ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 2; ++ geo.stripe_is_yc = 1; ++ break; ++ ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ { ++ const unsigned int stripe_w = 128; ++ ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ mbox_get_image_params(mbox_fd(), &new_img); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.stripe_is_yc = 1; ++ if (geo.height_y * stripe_w > img.pitch) ++ { ++ // "tall" sand - all C blocks now follow Y ++ geo.height_y = img.pitch / stripe_w; ++ geo.height_c = geo.height_y; ++ geo.stripe_is_yc = 0; ++ } ++ geo.planes_c = 1; ++ geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; ++ ++ pthread_mutex_unlock(&sand_lock); ++#if 0 ++ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", ++ video_width, video_height, ++ geo.stride_y, geo.stride_c, ++ geo.height_y, geo.height_c, ++ geo.stripes, img.pitch); ++#endif ++ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); ++ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); ++ break; ++ } ++ ++ case AV_PIX_FMT_RPI4_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV10COL, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ mbox_get_image_params(mbox_fd(), &new_img); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; ++ geo.stripe_is_yc = 1; ++ ++ pthread_mutex_unlock(&sand_lock); ++ ++#if 0 ++ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", ++ video_width, video_height, ++ geo.stride_y, geo.stride_c, ++ geo.height_y, geo.height_c, ++ geo.stripes, img.pitch); ++#endif ++ 
av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); ++ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); ++ break; ++ } ++ ++ case AV_PIX_FMT_SAND64_16: ++ case AV_PIX_FMT_SAND64_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV_16, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ mbox_get_image_params(mbox_fd(), &new_img); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 2; ++ geo.stripe_is_yc = 1; ++ ++ pthread_mutex_unlock(&sand_lock); ++ break; ++ } ++ ++ default: ++ break; ++ } ++ return geo; ++} ++ ++//============================================================================ ++// ++// ZC Env fns ++// ++// Frame copy fns ++ ++static AVBufferRef * zc_copy(const AVZcEnvPtr zc, ++ const AVFrame * const src) ++{ ++ AVFrame dest_frame; ++ AVFrame * const dest = &dest_frame; ++ unsigned int i; ++ uint8_t * psrc, * pdest; ++ ++ dest->format = src->format; ++ dest->width = src->width; ++ dest->height = src->height; ++ ++ if (av_rpi_zc_get_buffer(zc, dest) != 0 || ++ av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0) ++ { ++ return NULL; ++ } ++ ++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; ++ i != dest->height; ++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) ++ { ++ memcpy(pdest, psrc, dest->width); ++ } ++ for (i = 0, psrc = src->data[1], pdest = dest->data[1]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[1], pdest += dest->linesize[1]) ++ { ++ memcpy(pdest, psrc, dest->width / 2); ++ } ++ for (i = 0, psrc = src->data[2], pdest = dest->data[2]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[2], pdest += dest->linesize[2]) ++ { ++ memcpy(pdest, psrc, dest->width / 2); ++ } ++ ++ return dest->buf[0]; ++} ++ ++ ++static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc, ++ const AVFrame * const src) ++{ ++ assert(0); ++ return NULL; ++} ++ ++ ++static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc, ++ const AVFrame * const src, const unsigned int src_bits) ++{ ++ assert(0); ++ return NULL; ++} ++ ++//---------------------------------------------------------------------------- ++// ++// Public info extraction calls ++ ++static void zc_buf_env_free_cb(void * opaque, uint8_t * data); ++ ++static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf) ++{ ++ // Kludge where we check the free fn to check this is really ++ // one of our buffers - can't think of a better way ++ return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? 
NULL : ++ av_buffer_get_opaque(buf); ++} ++ ++static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf) ++{ ++ // As gmem is the first el NULL should be preserved ++ return &pic_zbe_ptr(buf)->gmem; ++} ++ ++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? 0 : p->vcsm_handle; ++} ++ ++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? -1 : p->vc_handle; ++} ++ ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) ++{ ++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); ++ return zbe == NULL ? 0 : zbe->offset; ++} ++ ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) ++{ ++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); ++ return zbe == NULL ? 0 : zbe->size_pic; ++} ++ ++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? 0 : p->numbytes; ++} ++ ++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref) ++{ ++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); ++ return zbe == NULL ? NULL : &zbe->geo; ++} ++ ++AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc, ++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) ++{ ++ av_assert0(!maycopy || zc != NULL); ++ ++ if (frame->format != AV_PIX_FMT_YUV420P && ++ frame->format != AV_PIX_FMT_YUV420P10 && ++ !av_rpi_is_sand_frame(frame)) ++ { ++ av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); ++ return NULL; ++ } ++ ++ if (frame->buf[1] != NULL || frame->format != expected_format) ++ { ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) ++ { ++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); ++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); ++ } ++#endif ++ ++ if (maycopy) ++ { ++ if (frame->buf[1] != NULL) ++ av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); ++ else ++ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); ++ ++ switch (frame->format) ++ { ++ case AV_PIX_FMT_YUV420P10: ++ return zc_420p10_to_sand128(zc, frame); ++ ++ case AV_PIX_FMT_SAND64_10: ++ return zc_sand64_16_to_sand128(zc, frame, 10); ++ ++ default: ++ return zc_copy(zc, frame); ++ } ++ } ++ else ++ { ++ if (frame->buf[1] != NULL) ++ av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); ++ else ++ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); ++ return NULL; ++ } ++ } ++ ++ if (pic_gm_ptr(frame->buf[0]) == NULL) ++ { ++ if (maycopy) ++ { ++ av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__); ++ return zc_copy(zc, frame); ++ } ++ else ++ { ++ av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__); ++ return NULL; ++ } ++ } ++ ++ return av_buffer_ref(frame->buf[0]); ++} ++ ++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref) ++{ ++ if (fr_ref != NULL) ++ { ++ av_buffer_unref(&fr_ref); ++ } ++} ++ ++//---------------------------------------------------------------------------- ++ ++// Extract user environment from an AVBufferRef ++void * av_rpi_zc_buf_v(AVBufferRef * const buf) ++{ ++ ZcBufEnv * 
const zbe = pic_zbe_ptr(buf); ++ if (zbe != NULL && zbe->user != NULL) ++ { ++ const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data; ++ return zub == NULL ? NULL : zub->v; ++ } ++ return NULL; ++} ++ ++// AV buffer pre-free callback ++static void zc_user_buf_free_cb(void * opaque, uint8_t * data) ++{ ++ if (opaque != NULL) ++ { ++ ZcUserBufEnv * const zub = opaque; ++ ++ if (zub->fn->free) ++ zub->fn->free(zub->v); ++ ++ av_free(zub); ++ } ++} ++ ++static void zc_buf_env_free_cb(void * opaque, uint8_t * data) ++{ ++ if (opaque != NULL) ++ { ++ ZcBufEnv * const zbe = opaque; ++ ++ av_buffer_unref(&zbe->user); ++ ++ if (zbe->zc != NULL) ++ av_rpi_zc_env_release(zbe->zc); ++ ++ pthread_cond_destroy(&zbe->cond); ++ pthread_mutex_destroy(&zbe->lock); ++ av_free(zbe); ++ } ++} ++ ++ ++// Wrap the various ZC bits in an AV Buffer and resolve those things we want ++// resolved now. ++// Currently we resolve everything, but in future we might not ++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab) ++{ ++ AVBufferRef *buf; ++ ZcUserBufEnv * zub; ++ ++ if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL) ++ return NULL; ++ ++ zub->fn = fn_tab; ++ zub->v = v; ++ zub->numbytes = numbytes; ++ zub->offset = addr_offset; ++ ++ if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n"); ++ av_free(zub); ++ return NULL; ++ } ++ ++ return buf; ++} ++ ++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode) ++{ ++ ZcBufEnv * const zbe = pic_zbe_ptr(buf); ++ ++ if (zbe == NULL) ++ return AVERROR(EINVAL); ++ ++ if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid) ++ return AVERROR(EAGAIN); ++ ++ if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid) ++ { ++ pthread_mutex_lock(&zbe->lock); ++ while (!zbe->is_valid) ++ pthread_cond_wait(&zbe->cond, &zbe->lock); ++ pthread_mutex_unlock(&zbe->lock); ++ } ++ ++ if (zbe->is_valid == ZC_BUF_NEVER) ++ return AVERROR(EINVAL); ++ ++ // Do alloc if we need it ++ if (zbe->user == NULL) ++ { ++ ZcEnv * const zc = zbe->zc; ++ const ZcUserBufEnv * zub; ++ ++ av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID); ++ ++ if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); ++ goto fail; ++ } ++ zub = (const ZcUserBufEnv *)zbe->user->data; ++ ++ // Track ++ ++ zbe->offset = zub->offset; ++ zbe->gmem.numbytes = zub->numbytes; ++ if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle); ++ goto fail; ++ } ++ ++ if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n"); ++ goto fail; ++ } ++ ++ if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle); ++ goto fail; ++ } ++ if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle); ++ goto fail; ++ } ++ ++ buf->buffer->data = zbe->gmem.arm + zbe->offset; ++ buf->buffer->size = zbe->size_pic; ++ ++ // In this mode we shouldn't have anyone waiting for us ++ // so no need to signal ++ if (alloc_mode 
== ZC_RESOLVE_ALLOC_VALID) ++ zbe->is_valid = 1; ++ } ++ ++ // Just overwrite - no point in testing ++ buf->data = zbe->gmem.arm + zbe->offset; ++ buf->size = zbe->size_pic; ++ return 0; ++ ++fail: ++ av_buffer_unref(&zbe->user); ++ return AVERROR(ENOMEM); ++} ++ ++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc) ++{ ++ int rv; ++ ++ // Do alloc if we need it ++ if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0) ++ return rv; ++ ++ // If we are a framebuf copy then the alloc can be done but we haven't ++ // imported its results yet ++ if (frame->data[0] == NULL) ++ { ++ const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); ++ ++ frame->linesize[0] = zbe->geo.stride_y; ++ frame->linesize[1] = zbe->geo.stride_c; ++ frame->linesize[2] = zbe->geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). ++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply ++ if (zbe->geo.stripes > 1) ++ frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y; ++ ++ frame->data[0] = frame->buf[0]->data; ++ frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? zbe->size_y : zbe->size_y * zbe->geo.stripes); ++ if (zbe->geo.planes_c > 1) ++ frame->data[2] = frame->data[1] + zbe->size_c; ++ ++ frame->extended_data = frame->data; ++ // Leave extended buf alone ++ } ++ ++ return 0; ++} ++ ++int av_rpi_zc_set_valid_frame(AVFrame * const frame) ++{ ++ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); ++ ++ if (zbe == NULL) ++ return AVERROR(EINVAL); ++ ++ zbe->is_valid = ZC_BUF_VALID; ++ pthread_cond_broadcast(&zbe->cond); ++ ++ return 0; ++} ++ ++int av_rpi_zc_set_broken_frame(AVFrame * const frame) ++{ ++ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); ++ ++ if (zbe == NULL) ++ return AVERROR(EINVAL); ++ ++ zbe->is_valid = ZC_BUF_NEVER; ++ pthread_cond_broadcast(&zbe->cond); ++ ++ return 0; ++} ++ ++void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size) ++{ ++ zc->pool_size = pool_size; ++} ++ ++unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc) ++{ ++ return zc->pool_size; ++} ++ ++int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame) ++{ ++#if 1 ++ ZcBufEnv * zbe = av_mallocz(sizeof(*zbe)); ++ ++ for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) { ++ frame->buf[i] = NULL; ++ frame->data[i] = NULL; ++ frame->linesize[i] = 0; ++ } ++ ++ if (zbe == NULL) ++ return AVERROR(ENOMEM); ++ ++ if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL) ++ { ++ av_free(zbe); ++ return AVERROR(ENOMEM); ++ } ++ ++ pthread_mutex_init(&zbe->lock, NULL); ++ pthread_cond_init(&zbe->cond, NULL); ++ zbe->zc = zc; ++ atomic_fetch_add(&zc->refcount, 1); ++ ++ zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); // Note geometry for later use ++ zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y; ++ zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c; ++ zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes; ++ ++#else ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); ++ const unsigned int size_y = geo.stride_y * geo.height_y; ++ const unsigned int size_c = geo.stride_c * geo.height_c; ++ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; ++ AVBufferRef * buf; ++ 
unsigned int i; ++ ++// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic); ++ ++ if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); ++ return AVERROR(ENOMEM); ++ } ++ ++ // Track ++ atomic_fetch_add(&zc->refcount, 1); ++ pic_zbe_ptr(buf)->zc = zc; ++ ++ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) { ++ frame->buf[i] = NULL; ++ frame->data[i] = NULL; ++ frame->linesize[i] = 0; ++ } ++ ++ frame->buf[0] = buf; ++ ++ frame->linesize[0] = geo.stride_y; ++ frame->linesize[1] = geo.stride_c; ++ frame->linesize[2] = geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). ++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply ++ if (geo.stripes > 1) ++ frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y; ++ ++ frame->data[0] = buf->data; ++ frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes); ++ if (geo.planes_c > 1) ++ frame->data[2] = frame->data[1] + size_c; ++ ++ frame->extended_data = frame->data; ++ // Leave extended buf alone ++ ++#if RPI_ZC_SAND_8_IN_10_BUF != 0 ++ // *** If we intend to use this for real we will want a 2nd buffer pool ++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge ++#endif ++#endif ++ ++ return 0; ++} ++ ++void av_rpi_zc_env_release(const AVZcEnvPtr zc) ++{ ++ const int n = atomic_fetch_add(&zc->refcount, -1); ++ if (n == 1) // was 1, now 0 ++ { ++ zc->free_pool(zc->pool_env); ++ av_free(zc); ++ } ++} ++ ++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, ++ void * pool_env, ++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, ++ av_rpi_zc_free_pool_fn_t * free_pool_fn) ++{ ++ ZcEnv * zc; ++ ++ if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL) ++ { ++ av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n"); ++ return NULL; ++ } ++ ++ *zc = (ZcEnv){ ++ .refcount = ATOMIC_VAR_INIT(1), ++ .pool_env = pool_env, ++ .alloc_buf = alloc_buf_fn, ++ .free_pool = free_pool_fn, ++ .pool_size = 0 ++ }; ++ ++ return zc; ++} ++ ++//============================================================================ ++// ++// External ZC initialisation ++ ++#define RPI_GET_BUFFER2 1 ++ ++ ++static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) ++{ ++#if !RPI_GET_BUFFER2 ++ return avcodec_default_get_buffer2(s, frame, flags); ++#else ++ int rv; ++ ++ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) ++ { ++// printf("Do default alloc: format=%#x\n", frame->format); ++ rv = avcodec_default_get_buffer2(s, frame, flags); ++ } ++ else if (frame->format == AV_PIX_FMT_YUV420P || ++ av_rpi_is_sand_frame(frame)) ++ { ++ if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0) ++ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); ++ } ++ else ++ { ++ rv = avcodec_default_get_buffer2(s, frame, flags); ++ } ++ ++#if 0 ++ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, ++ frame->format, frame->width, frame->height, ++ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], ++ frame->data[0], frame->data[1], frame->data[2], ++ frame->buf[0], frame->buf[1], frame->buf[2], ++ av_buffer_get_opaque(frame->buf[0])); ++#endif ++ return rv; ++#endif ++} ++ ++int 
av_rpi_zc_in_use(const struct AVCodecContext * const s)
++{
++    return s->get_buffer2 == zc_get_buffer2;
++}
++
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++                    void * pool_env,
++                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++    ZcEnv * zc;
++
++    av_assert0(!av_rpi_zc_in_use(s));
++
++    if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL)
++        return AVERROR(ENOMEM);
++
++    zc->old = (ZcOldCtxVals){
++        .opaque = s->opaque,
++        .get_buffer2 = s->get_buffer2,
++        .thread_safe_callbacks = s->thread_safe_callbacks
++    };
++
++    s->opaque = zc;
++    s->get_buffer2 = zc_get_buffer2;
++    s->thread_safe_callbacks = 1;
++    return 0;
++}
++
++void av_rpi_zc_uninit2(struct AVCodecContext * const s)
++{
++    ZcEnv * const zc = s->opaque;
++
++    av_assert0(av_rpi_zc_in_use(s));
++
++    s->get_buffer2 = zc->old.get_buffer2;
++    s->opaque = zc->old.opaque;
++    s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
++
++    av_rpi_zc_env_release(zc);
++}
++
+diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
+new file mode 100644
+index 0000000000..18e71314bb
+--- /dev/null
++++ b/libavcodec/rpi_zc.h
+@@ -0,0 +1,174 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef LIBAVCODEC_RPI_ZC_H
++#define LIBAVCODEC_RPI_ZC_H
++
++// Zero-Copy frame code for RPi
++// RPi needs Y/U/V planes to be contiguous for display. By default
++// ffmpeg will allocate separated planes so a memcpy is needed before
++// display. This code provides a method of making ffmpeg allocate a single
++// bit of memory for the frame which can then be reference counted until
++// display has finished with it.
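++//
++// Example (illustrative only - the numbers are invented, real values come
++// from av_rpi_zc_frame_geometry): for a plain 1920x1080 YUV420P frame with
++// a single stripe the whole picture lives in one allocation and the planes
++// are carved out of it the same way av_rpi_zc_resolve_frame does:
++//
++//   size_y  = stride_y * height_y;       // e.g. 1920 * 1088
++//   size_c  = stride_c * height_c;       // e.g.  960 *  544
++//   data[0] = buf->data;                 // Y
++//   data[1] = data[0] + size_y;          // U
++//   data[2] = data[1] + size_c;          // V
++//
++// so display can be handed one contiguous buffer (one VC handle) instead
++// of three separately allocated planes.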
++
++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
++// 0 disables
++// *** This option still in development
++// Only works if SAO active
++// Allocates buffers that are twice the required size
++#define RPI_ZC_SAND_8_IN_10_BUF 0
++
++struct AVBufferRef;
++struct AVFrame;
++struct AVCodecContext;
++enum AVPixelFormat;
++
++// "Opaque" pointer to whatever we are using as a buffer reference
++typedef struct AVBufferRef * AVRpiZcRefPtr;
++
++struct AVZcEnv;
++typedef struct AVZcEnv * AVZcEnvPtr;
++
++typedef struct AVRpiZcFrameGeometry
++{
++    unsigned int stride_y;  // Luma stride (bytes)
++    unsigned int height_y;  // Luma height (lines)
++    unsigned int stride_c;  // Chroma stride (bytes)
++    unsigned int height_c;  // Chroma height (lines)
++    unsigned int planes_c;  // Chroma plane count (U, V = 2, interleaved = 1)
++    unsigned int stripes;   // Number of stripes (sand)
++    unsigned int bytes_per_pel;
++    int stripe_is_yc;       // A single stripe is Y then C (false for tall sand)
++
++    int format;                 // Requested format
++    unsigned int video_width;   // Requested width
++    unsigned int video_height;  // Requested height
++} AVRpiZcFrameGeometry;
++
++
++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format,
++    const unsigned int video_width, const unsigned int video_height);
++
++// Generate a ZC reference to the buffer(s) in this frame
++// If the buffer doesn't appear to be one allocated by ZC
++// then the behaviour depends on maycopy:
++// If maycopy=0 then return NULL
++// If maycopy=1 && the src frame is in a form where we can easily copy
++// the data, then allocate a new buffer and copy the data into it
++// Otherwise return NULL
++// If maycopy == 0 then ZC may be NULL
++AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc,
++    const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
++
++// Get the vc_handle from the frame ref
++// Returns -1 if ref doesn't look valid
++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
++// Get the vcsm_handle from the frame ref
++// Returns 0 if ref doesn't look valid
++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref);
++// Get offset from the start of the memory referenced
++// by the vc_handle to valid data
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
++// Length of buffer data
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
++// Get the number of bytes allocated from the frame ref
++// Returns 0 if ref doesn't look valid
++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
++// Geometry this frame was allocated with
++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref);
++
++// Unreference the buffer refed/allocated by _zc_ref
++// If fr_ref is NULL then this will NOP
++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
++
++// Test to see if the context is using zc (checks get_buffer2)
++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
++
++// Init ZC into a context
++// Sets opaque, get_buffer2, thread_safe_callbacks
++// Use if you want to allocate your own pools and/or create ZC buffers for
++// all decoders
++// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken
++// apart by av_rpi_zc_xxx calls without this
++
++typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size,
++    const AVRpiZcFrameGeometry * geo);
++typedef void av_rpi_zc_free_pool_fn_t(void * pool_env);
++
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++                    void * pool_env,
++                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                    av_rpi_zc_free_pool_fn_t * free_pool_fn);
++
++// Free ZC from a context
++void av_rpi_zc_uninit2(struct AVCodecContext * const s);
++
++void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc);
++AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx);
++void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size);
++unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc);
++
++// Get buffer generates placeholders for later alloc
++int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame);
++// Resolve actually does the alloc (noop if already alloced)
++// Set data pointers on a buffer/frame that was copied before the alloc
++// occurred
++#define ZC_RESOLVE_FAIL        0  // return error on invalid
++#define ZC_RESOLVE_ALLOC       1  // alloc as invalid
++#define ZC_RESOLVE_WAIT_VALID  2  // wait for valid
++#define ZC_RESOLVE_ALLOC_VALID 3  // alloc as valid
++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc);
++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc);
++
++int av_rpi_zc_set_valid_frame(AVFrame * const frame);
++int av_rpi_zc_set_broken_frame(AVFrame * const frame);
++
++
++typedef struct av_rpi_zc_buf_fn_tab_s {
++    void (* free)(void * v);
++
++    unsigned int (* vcsm_handle)(void * v);
++    unsigned int (* vc_handle)(void * v);
++    void * (* map_arm)(void * v);
++    unsigned int (* map_vc)(void * v);
++} av_rpi_zc_buf_fn_tab_t;
++
++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab);
++void * av_rpi_zc_buf_v(AVBufferRef * const buf);
++
++
++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
++                               void * pool_env,
++                               av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                               av_rpi_zc_free_pool_fn_t * free_pool_fn);
++void av_rpi_zc_env_release(const AVZcEnvPtr zc);
++
++
++#endif
++
+diff --git a/libavcodec/rpi_zc_frames.h b/libavcodec/rpi_zc_frames.h
+new file mode 100644
+index 0000000000..9b7b6536a4
+--- /dev/null
++++ b/libavcodec/rpi_zc_frames.h
+@@ -0,0 +1,142 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#ifndef RPI_ZC_FRAMES_H ++#define RPI_ZC_FRAMES_H ++ ++#define RPI_ONE_BUF 1 ++ ++#include "rpi_mem.h" // for GPU_MEM_PTR_T ++#include "libavutil/frame.h" ++ ++#if !RPI_ONE_BUF ++static inline uint32_t get_vc_address_y(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]); ++ return p->vc; ++} ++ ++static inline uint32_t get_vc_address_u(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]); ++ return p->vc; ++} ++ ++static inline uint32_t get_vc_address_v(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]); ++ return p->vc; ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]); ++} ++ ++#else ++ ++static inline int gpu_is_buf1(const AVFrame * const frame) ++{ ++ return frame->buf[1] == NULL; ++} ++ ++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame) ++{ ++ return av_buffer_get_opaque(frame->buf[0]); ++} ++ ++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) ++{ ++ return av_buffer_pool_buffer_get_opaque(frame->buf[n]); ++} ++ ++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) ++{ ++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? 
gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
++    return gm->vc + (frame->data[n] - gm->arm);
++}
++
++
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++    return get_vc_address3(frame, 0);
++}
++
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++    return get_vc_address3(frame, 1);
++}
++
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++    return get_vc_address3(frame, 2);
++}
++
++#if 0
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++    if (gpu_is_buf1(frame))
++    {
++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++        g.numbytes = frame->data[1] - frame->data[0];
++        return g;
++    }
++    else
++        return *gpu_buf3_gmem(frame, 0);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++    if (gpu_is_buf1(frame))
++    {
++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++        g.arm += frame->data[1] - frame->data[0];
++        g.vc += frame->data[1] - frame->data[0];
++        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
++        return g;
++    }
++    else
++        return *gpu_buf3_gmem(frame, 1);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++    if (gpu_is_buf1(frame))
++    {
++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++        g.arm += frame->data[2] - frame->data[0];
++        g.vc += frame->data[2] - frame->data[0];
++        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
++        return g;
++    }
++    else
++        return *gpu_buf3_gmem(frame, 2);
++}
++#endif
++#endif
++
++#endif
+diff --git a/libavcodec/rpivid_hevc.c b/libavcodec/rpivid_hevc.c
+new file mode 100644
+index 0000000000..95550b106b
+--- /dev/null
++++ b/libavcodec/rpivid_hevc.c
+@@ -0,0 +1,2032 @@
++// FFMPEG HEVC decoder hardware accelerator
++// Andrew Holme, Argon Design Ltd
++// Copyright (c) June 2017 Raspberry Pi Ltd
++
++#include <fcntl.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <unistd.h>
++#include <sys/mman.h>
++
++#include "fftools/ffmpeg.h"
++#include "libavutil/avassert.h"
++#include "libavutil/imgutils.h"
++#include "avcodec.h"
++#include "hwconfig.h"
++#include "decode.h"
++
++#include "hevc.h"
++#include "hevcdec.h"
++#include "rpi_zc.h"
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++#include "rpi_mailbox.h"
++
++
++#define OPT_PHASE_TIMING 0  // Generate stats for phase usage
++
++#define NUM_SCALING_FACTORS 4064
++
++#define AXI_BASE64 0
++
++#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0))
++#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6))
++
++#define RPIVID_COL_PICS 17  // 16 ref & current
++
++#define RPIVID_BITBUFS 2            // Bit + Cmd bufs (phase 0 & 1)
++#define RPIVID_BITBUF_SIZE (4 << 20)    // Bit + Cmd buf size
++
++#define RPIVID_COEFFBUFS 3          // PU + Coeff bufs (phase 1 & 2)
++#define RPIVID_COEFFBUF_SIZE (16 << 20) // PU + Coeff buf size
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// Register offsets
++
++#define RPI_SPS0 0
++#define RPI_SPS1 4
++#define RPI_PPS 8
++#define RPI_SLICE 12
++#define RPI_TILESTART 16
++#define RPI_TILEEND 20
++#define RPI_SLICESTART 24
++#define RPI_MODE 28
++#define RPI_LEFT0 32
++#define RPI_LEFT1 36
++#define RPI_LEFT2 40
++#define RPI_LEFT3 44
++#define RPI_QP 48
++#define RPI_CONTROL 52
++#define RPI_STATUS 56
++#define RPI_VERSION 60
++#define RPI_BFBASE 64
++#define RPI_BFNUM 68
++#define RPI_BFCONTROL 72
++#define RPI_BFSTATUS 76
++#define RPI_PUWBASE 80
++#define RPI_PUWSTRIDE 84
++#define RPI_COEFFWBASE 88
++#define RPI_COEFFWSTRIDE 92
++#define RPI_SLICECMDS 96
++#define RPI_BEGINTILEEND 100
++#define RPI_TRANSFER 104
++#define RPI_CFBASE 108
++#define RPI_CFNUM 112
++#define RPI_CFSTATUS 116
++
++#define RPI_PURBASE 0x8000
++#define RPI_PURSTRIDE 0x8004
++#define RPI_COEFFRBASE 0x8008
++#define RPI_COEFFRSTRIDE 0x800C
++#define RPI_NUMROWS 0x8010
++#define RPI_CONFIG2 0x8014
++#define RPI_OUTYBASE 0x8018
++#define RPI_OUTYSTRIDE 0x801C
++#define RPI_OUTCBASE 0x8020
++#define RPI_OUTCSTRIDE 0x8024
++#define RPI_STATUS2 0x8028
++#define RPI_FRAMESIZE 0x802C
++#define RPI_MVBASE 0x8030
++#define RPI_MVSTRIDE 0x8034
++#define RPI_COLBASE 0x8038
++#define RPI_COLSTRIDE 0x803C
++#define RPI_CURRPOC 0x8040
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Unused but left here to illustrate the differences between FFmpeg's prob
++// structure and the rpivid one
++
++struct FFM_PROB {
++    uint8_t sao_merge_flag                   [ 1];
++    uint8_t sao_type_idx                     [ 1];
++    uint8_t split_coding_unit_flag           [ 3];
++    uint8_t cu_transquant_bypass_flag        [ 1];
++    uint8_t skip_flag                        [ 3];
++    uint8_t cu_qp_delta                      [ 3];
++    uint8_t pred_mode_flag                   [ 1];
++    uint8_t part_mode                        [ 4];
++    uint8_t prev_intra_luma_pred_flag        [ 1];
++    uint8_t intra_chroma_pred_mode           [ 2];
++    uint8_t merge_flag                       [ 1];
++    uint8_t merge_idx                        [ 1];
++    uint8_t inter_pred_idc                   [ 5];
++    uint8_t ref_idx_l0                       [ 2];
++    uint8_t ref_idx_l1                       [ 2];
++    uint8_t abs_mvd_greater0_flag            [ 2];
++    uint8_t abs_mvd_greater1_flag            [ 2];
++    uint8_t mvp_lx_flag                      [ 1];
++    uint8_t no_residual_data_flag            [ 1];
++    uint8_t split_transform_flag             [ 3];
++    uint8_t cbf_luma                         [ 2];
++    uint8_t cbf_cb_cr                        [ 4];
++    uint8_t transform_skip_flag/*[][]*/      [ 2];
++    uint8_t explicit_rdpcm_flag/*[][]*/      [ 2];
++    uint8_t explicit_rdpcm_dir_flag/*[][]*/  [ 2];
++    uint8_t last_significant_coeff_x_prefix  [18];
++    uint8_t last_significant_coeff_y_prefix  [18];
++    uint8_t significant_coeff_group_flag     [ 4];
++    uint8_t significant_coeff_flag           [44];
++    uint8_t coeff_abs_level_greater1_flag    [24];
++    uint8_t coeff_abs_level_greater2_flag    [ 6];
++    uint8_t log2_res_scale_abs               [ 8];
++    uint8_t res_scale_sign_flag              [ 2];
++    uint8_t cu_chroma_qp_offset_flag         [ 1];
++    uint8_t cu_chroma_qp_offset_idx          [ 1];
++} __attribute__((packed));
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_PROB {
++    uint8_t SAO_MERGE_FLAG            [ 1];
++    uint8_t SAO_TYPE_IDX              [ 1];
++    uint8_t SPLIT_FLAG                [ 3];
++    uint8_t CU_SKIP_FLAG              [ 3];
++    uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1];
++    uint8_t PRED_MODE                 [ 1];
++    uint8_t PART_SIZE                 [ 4];
++    uint8_t INTRA_PRED_MODE           [ 1];
++    uint8_t CHROMA_PRED_MODE          [ 1];
++    uint8_t MERGE_FLAG_EXT            [ 1];
++    uint8_t MERGE_IDX_EXT             [ 1];
++    uint8_t INTER_DIR                 [ 5];
++    uint8_t REF_PIC                   [ 2];
++    uint8_t MVP_IDX                   [ 1];
++    uint8_t MVD                       [ 2];
++    uint8_t QT_ROOT_CBF               [ 1];
++    uint8_t TRANS_SUBDIV_FLAG         [ 3];
++    uint8_t QT_CBF                    [ 6];
++    uint8_t DQP                       [ 2];
++    uint8_t ONE_FLAG                  [24];
++    uint8_t LASTX                     [18];
++    uint8_t LASTY                     [18];
++    uint8_t SIG_CG_FLAG               [ 4];
++    uint8_t ABS_FLAG                  [ 6];
++    uint8_t TRANSFORMSKIP_FLAG        [ 2];
++    uint8_t SIG_FLAG                  [42];
++    uint8_t SIG_FLAG_unused           [ 2];
++} __attribute__((packed));
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_CMD {
++    uint32_t addr;
++    uint32_t data;
++} __attribute__((packed));
++
++struct RPI_BIT {
++    int cmd;
++    const void *ptr;
++    int len;
++};
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_T;
++
++// Actual addressability is 38 bits but we can only alloc in the bottom 32
++// currently - when passed to rpivid h/w the address is always >> 6 so will
++// fit in 32 bit there
++// At some point we may want to make this uint64_t
++typedef uint32_t vid_vc_addr_t;
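++
++// Illustrative arithmetic (example values, not from the original source):
++// a 34-bit bus address such as 0x2_4000_0000, shifted right by 6, becomes
++// 0x0900_0000, which fits comfortably in 32 bits because the h/w only ever
++// sees 64-byte-aligned addresses (see MANGLE64 below).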
++
++typedef enum rpivid_decode_state_e {
++    RPIVID_DECODE_NEW = 0,
++    RPIVID_DECODE_START,
++    RPIVID_DECODE_SLICE,
++    RPIVID_DECODE_END,
++} rpivid_decode_state_t;
++
++#define RPI_PROB_VALS 154U
++#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
++
++typedef struct dec_env_s {
++    const AVCodecContext * avctx;
++
++    rpivid_decode_state_t state;
++    unsigned int decode_order;
++
++    int phase_no;  // Current phase (i.e. the last one we waited for)
++    struct dec_env_s * phase_wait_q_next;
++    sem_t phase_wait;
++
++    struct RPI_BIT *bit_fifo;
++    struct RPI_CMD *cmd_fifo;
++    unsigned int bit_len, bit_max;
++    unsigned int cmd_len, cmd_max;
++    unsigned int num_slice_msgs;
++    unsigned int PicWidthInCtbsY;
++    unsigned int PicHeightInCtbsY;
++    unsigned int dpbno_col;
++    uint32_t reg_slicestart;
++    unsigned int wpp_entry_x;
++    unsigned int wpp_entry_y;
++    uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3];
++    uint8_t scaling_factors[NUM_SCALING_FACTORS];
++//    unsigned int RefPicList[2][HEVC_MAX_REFS];
++} dec_env_t;
++
++#define RPIVID_PHASES 3
++#define RPIVID_PHASE_NEW (RPIVID_PHASES)  // Phase before we have incremented decode_order
++#define RPIVID_PHASE_START (-1)           // Phase after we have incremented decode_order
++
++#if OPT_PHASE_TIMING
++static const unsigned int time_thresholds[8] = {
++    10, 15, 20, 30, 45, 60, 75, 90
++};
++#endif
++
++typedef struct phase_wait_env_s {
++    unsigned int last_order;
++    dec_env_t * q;
++#if OPT_PHASE_TIMING
++    uint64_t phase_time;
++    uint64_t max_phase_time;
++    uint64_t time_in_phase;
++    uint64_t time_out_phase;
++    unsigned int max_time_decode_order;
++    unsigned int time_bins[9];
++    unsigned int time_bins3[9];
++    unsigned int time_bins5[9];
++    uint64_t time_stash[16];
++    unsigned int i3;
++#endif
++} phase_wait_env_t;  // Singly linked list of threads waiting for this phase
++
++typedef struct RPI_T {
++    atomic_int ref_count;
++    sem_t ref_zero;
++
++    dec_env_t ** dec_envs;
++    AVZcEnvPtr zc;
++
++    pthread_mutex_t phase_lock;
++    phase_wait_env_t phase_reqs[RPIVID_PHASES];
++
++    volatile uint32_t * regs;
++    volatile uint32_t * ints;
++
++    GPU_MEM_PTR_T gcolbuf;
++    unsigned int col_stride;
++    size_t col_picsize;
++
++    unsigned int bitbuf_no;
++    sem_t bitbuf_sem;
++    GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS];
++
++    unsigned int max_pu_msgs;
++    unsigned int coeffbuf_no;
++    sem_t coeffbuf_sem;
++    GPU_MEM_PTR_T gcoeffbufs[RPIVID_COEFFBUFS];
++
++    unsigned int decode_order;
++    int mbox_fd;
++    int gpu_init_type;
++} RPI_T;
++
++#if OPT_PHASE_TIMING
++static uint64_t tus64(void)
++{
++    struct timespec ts;
++    clock_gettime(CLOCK_MONOTONIC, &ts);
++    return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
++}
++#endif
++
++static inline unsigned int rnd64(unsigned int x)
++{
++    return (x + 63) & ~63;
++}
++
++static inline int rpi_sem_wait(sem_t * const sem)
++{
++    int rv;
++    while ((rv = sem_wait(sem)) != 0 && errno == EINTR)
++        /* Loop */;
++    return rv;
++}
++
++//============================================================================
++
++#define TRACE_DEV 0
++#define TRACE_ENTRY 0
++
++#define REGS_NAME "/dev/rpivid-hevcmem"
++#define REGS_SIZE 0x10000
++#define INTS_NAME "/dev/rpivid-intcmem"
++#define INTS_SIZE 0x10000  // 4 is probably enough but we are going to alloc a page anyway
++
++static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size)
++{
++    void *gpio_map;
++    int mem_fd;
++
++    /* open the device */
++    if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) <
0) { ++ av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name); ++ return NULL; ++ } ++ ++ // Now map it ++ gpio_map = mmap( ++ NULL, ++ size, ++ PROT_READ|PROT_WRITE, ++ MAP_SHARED, ++ mem_fd, ++ 0 ++ ); ++ ++ close(mem_fd); // No longer need the FD ++ ++ if (gpio_map == MAP_FAILED) { ++ av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed"); ++ return NULL; ++ } ++ ++ return (volatile uint32_t *)gpio_map; ++} ++ ++static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size) ++{ ++ volatile uint32_t * const gpio_map = *p_gpio_map; ++ if (gpio_map != NULL) { ++ *p_gpio_map = NULL; ++ munmap((void *)gpio_map, size); ++ } ++} ++ ++#define MANGLE(x) ((x) &~0xc0000000) // ** If x is ever a 64 bit thing this will need fixing! ++#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6) ++ ++static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data) ++{ ++ rpi->regs[addr >> 2] = MANGLE64(data); ++} ++ ++static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data) ++{ ++ rpi->regs[addr >> 2] = data >> 6; // ?? rnd64 - but not currently needed ++} ++ ++static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data) ++{ ++#if TRACE_DEV ++ printf("W %x %08x\n", addr, data); ++#endif ++ ++ rpi->regs[addr >> 2] = data; ++} ++ ++static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr) ++{ ++ const uint32_t v = rpi->regs[addr >> 2]; ++#if TRACE_DEV ++ printf("R %x (=%x)\n", addr, v); ++#endif ++ return v; ++} ++ ++#define ARG_IC_ICTRL_ACTIVE1_INT_SET 0x00000001 ++#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET 0x00000002 ++#define ARG_IC_ICTRL_ACTIVE1_EN_SET 0x00000004 ++#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008 ++#define ARG_IC_ICTRL_ACTIVE2_INT_SET 0x00000010 ++#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET 0x00000020 ++#define ARG_IC_ICTRL_ACTIVE2_EN_SET 0x00000040 ++#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080 ++ ++static inline void int_wait(const RPI_T * const rpi, const unsigned int phase) ++{ ++ const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET; ++ const uint32_t mask_done = phase == 1 ? 
ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET;
++    uint32_t ival;
++    while (((ival = rpi->ints[0]) & mask_done) == 0) {
++        usleep(1000);
++    }
++    rpi->ints[0] = ival & mask_reset;
++}
++
++#if TRACE_DEV
++static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) {
++    int i;
++
++    for (i=0; i<num; i++)
++    {
++        if ((i%4)==0)
++            printf("%08x: ", addr + 4*i);
++
++        printf("%08x", rpi->regs[(addr>>2)+i]);
++
++        if ((i%4)==3 || i+1 == num)
++            printf("\n");
++        else
++            printf(" ");
++    }
++}
++
++static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) {
++    int i;
++
++    for (i=0; i<size>>2; i++)
++    {
++        if ((i%4)==0)
++            printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i);
++
++        printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]);
++
++        if ((i%4)==3 || i+1 == size>>2)
++            printf("\n");
++        else
++            printf(" ");
++    }
++}
++#endif
++
++
++//////////////////////////////////////////////////////////////////////////////
++// Scaling factors
++
++static void expand_scaling_list(
++    const unsigned int sizeID,
++    const unsigned int matrixID,
++    uint8_t * const dst0,
++    const uint8_t * const src0,
++    uint8_t dc)
++{
++    switch (sizeID) {
++        case 0:
++            memcpy(dst0, src0, 16);
++            break;
++        case 1:
++            memcpy(dst0, src0, 64);
++            break;
++        case 2:
++        {
++            uint8_t * d = dst0;
++            for (unsigned int y=0; y != 16; y++) {
++                const uint8_t * s = src0 + (y >> 1) * 8;
++                for (unsigned int x = 0; x != 8; ++x) {
++                    *d++ = *s;
++                    *d++ = *s++;
++                }
++            }
++            dst0[0] = dc;
++            break;
++        }
++        default:
++        {
++            uint8_t * d = dst0;
++            for (unsigned int y=0; y != 32; y++) {
++                const uint8_t * s = src0 + (y >> 2) * 8;
++                for (unsigned int x = 0; x != 8; ++x) {
++                    *d++ = *s;
++                    *d++ = *s;
++                    *d++ = *s;
++                    *d++ = *s++;
++                }
++            }
++            dst0[0] = dc;
++            break;
++        }
++    }
++}
++
++static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) {
++    // Array of constants for scaling factors
++    static const uint32_t scaling_factor_offsets[4][6] = {
++        // MID0    MID1    MID2    MID3    MID4    MID5
++        {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050},   // SID0 (4x4)
++        {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0},   // SID1 (8x8)
++        {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0},   // SID2 (16x16)
++        {0x07E0,      0,      0, 0x0BE0,      0,      0}};  // SID3 (32x32)
++
++    // ffmpeg places SID3,MID1 where matrixID 3 normally is
++    const ScalingList * const sl =
++        s->ps.pps->scaling_list_data_present_flag ?
&s->ps.pps->scaling_list ++ : &s->ps.sps->scaling_list; ++ unsigned int mid; ++ ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(0, mid, ++ de->scaling_factors + scaling_factor_offsets[0][mid], ++ sl->sl[0][mid], 0); ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(1, mid, ++ de->scaling_factors + scaling_factor_offsets[1][mid], ++ sl->sl[1][mid], 0); ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(2, mid, ++ de->scaling_factors + scaling_factor_offsets[2][mid], ++ sl->sl[2][mid], ++ sl->sl_dc[0][mid]); ++ // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg ++ for (mid=0; mid<6; mid += 3) ++ expand_scaling_list(3, mid, ++ de->scaling_factors + scaling_factor_offsets[3][mid], ++ sl->sl[3][mid], ++ sl->sl_dc[1][mid]); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Probabilities ++ ++static const uint8_t prob_init[3][156] = { ++ { ++ 153, 200, 139, 141, 157, 154, 154, 154, ++ 154, 154, 184, 154, 154, 154, 184, 63, ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ 154, 154, 154, 154, 154, 153, 138, 138, ++ 111, 141, 94, 138, 182, 154, 154, 154, ++ 140, 92, 137, 138, 140, 152, 138, 139, ++ 153, 74, 149, 92, 139, 107, 122, 152, ++ 140, 179, 166, 182, 140, 227, 122, 197, ++ 110, 110, 124, 125, 140, 153, 125, 127, ++ 140, 109, 111, 143, 127, 111, 79, 108, ++ 123, 63, 110, 110, 124, 125, 140, 153, ++ 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, 91, 171, 134, 141, ++ 138, 153, 136, 167, 152, 152, 139, 139, ++ 111, 111, 125, 110, 110, 94, 124, 108, ++ 124, 107, 125, 141, 179, 153, 125, 107, ++ 125, 141, 179, 153, 125, 107, 125, 141, ++ 179, 153, 125, 140, 139, 182, 182, 152, ++ 136, 152, 136, 153, 136, 139, 111, 136, ++ 139, 111, 0, 0, }, ++ { ++ 153, 185, 107, 139, 126, 197, 185, 201, ++ 154, 149, 154, 139, 154, 154, 154, 152, ++ 110, 122, 95, 79, 63, 31, 31, 153, ++ 153, 168, 140, 198, 79, 124, 138, 94, ++ 153, 111, 149, 107, 167, 154, 154, 154, ++ 154, 196, 196, 167, 154, 152, 167, 182, ++ 182, 134, 149, 136, 153, 121, 136, 137, ++ 169, 194, 166, 167, 154, 167, 137, 182, ++ 125, 110, 94, 110, 95, 79, 125, 111, ++ 110, 78, 110, 111, 111, 95, 94, 108, ++ 123, 108, 125, 110, 94, 110, 95, 79, ++ 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, 121, 140, 61, 154, ++ 107, 167, 91, 122, 107, 167, 139, 139, ++ 155, 154, 139, 153, 139, 123, 123, 63, ++ 153, 166, 183, 140, 136, 153, 154, 166, ++ 183, 140, 136, 153, 154, 166, 183, 140, ++ 136, 153, 154, 170, 153, 123, 123, 107, ++ 121, 107, 121, 167, 151, 183, 140, 151, ++ 183, 140, 0, 0, }, ++ { ++ 153, 160, 107, 139, 126, 197, 185, 201, ++ 154, 134, 154, 139, 154, 154, 183, 152, ++ 154, 137, 95, 79, 63, 31, 31, 153, ++ 153, 168, 169, 198, 79, 224, 167, 122, ++ 153, 111, 149, 92, 167, 154, 154, 154, ++ 154, 196, 167, 167, 154, 152, 167, 182, ++ 182, 134, 149, 136, 153, 121, 136, 122, ++ 169, 208, 166, 167, 154, 152, 167, 182, ++ 125, 110, 124, 110, 95, 94, 125, 111, ++ 111, 79, 125, 126, 111, 111, 79, 108, ++ 123, 93, 125, 110, 124, 110, 95, 94, ++ 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, 121, 140, 61, 154, ++ 107, 167, 91, 107, 107, 167, 139, 139, ++ 170, 154, 139, 153, 139, 123, 123, 63, ++ 124, 166, 183, 140, 136, 153, 154, 166, ++ 183, 140, 136, 153, 154, 166, 183, 140, ++ 136, 153, 154, 170, 153, 138, 138, 122, ++ 121, 122, 121, 167, 151, 183, 140, 151, ++ 183, 140, 0, 0, }, ++}; ++ ++ ++////////////////////////////////////////////////////////////////////////////// ++// Phase 1 command and bit FIFOs ++ ++// ???? 
uint16_t addr - put in uint32_t
++static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) {
++    if (de->cmd_len==de->cmd_max)
++        av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD)));
++    de->cmd_fifo[de->cmd_len].addr = addr;
++    de->cmd_fifo[de->cmd_len].data = data;
++    return de->cmd_len++;
++}
++
++static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) {
++    if (de->bit_len==de->bit_max)
++        av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT)));
++    de->bit_fifo[de->bit_len].cmd = cmd_idx;
++    de->bit_fifo[de->bit_len].ptr = ptr;
++    de->bit_fifo[de->bit_len].len = len;
++    de->bit_len++;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Write probability and scaling factor memories
++
++#if 0
++static void WriteProb(dec_env_t * const de) {
++    int i;
++    const uint8_t *p = (uint8_t *) &de->probabilities;
++    for (i=0; i<sizeof(struct RPI_PROB); i+=4)
++        p1_apb_write(de, 0x1000+i, p[i] + (p[i+1]<<8) + (p[i+2]<<16) + (p[i+3]<<24));
++}
++#endif
++
++static void write_prob(dec_env_t * const de, const HEVCContext * const s) {
++    uint8_t dst[RPI_PROB_ARRAY_SIZE];
++
++    const unsigned int init_type = (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ?
++        s->sh.slice_type + 1 : 2 - s->sh.slice_type;
++    const uint8_t * p = prob_init[init_type];
++    const int q = av_clip(s->sh.slice_qp, 0, 51);
++    unsigned int i;
++
++    for (i = 0; i < RPI_PROB_VALS; i++) {
++        int init_value = p[i];
++        int m = (init_value >> 4) * 5 - 45;
++        int n = ((init_value & 15) << 3) - 16;
++        int pre = 2 * (((m * q) >> 4) + n) - 127;
++
++        pre ^= pre >> 31;
++        if (pre > 124)
++            pre = 124 + (pre & 1);
++        dst[i] = pre;
++    }
++    for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) {
++        dst[i] = 0;
++    }
++
++    for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4)
++        p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24));
++
++}
++
++
++static void WriteScalingFactors(dec_env_t * const de) {
++    int i;
++    const uint8_t *p = (uint8_t *) de->scaling_factors;
++    for (i=0; i<NUM_SCALING_FACTORS; i+=4)
++        p1_apb_write(de, 0x2000+i, p[i] + (p[i+1]<<8) + (p[i+2]<<16) + (p[i+3]<<24));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static int ctb_to_tile (unsigned int ctb, unsigned int *bd, int num) {
++    int i;
++    for (i = 1; ctb >= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c
++    return i-1;
++}
++
++static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) {
++    if (ctb < bd[num-1]) return ctb_size;
++    else if (width % ctb_size) return width % ctb_size;
++    else return ctb_size;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Handle PU and COEFF stream overflow
++
++
++// Returns:
++// -2 Other error
++// -1 Out of coeff space
++//  0 OK
++//  1 Out of PU space
++
++static int check_status(const RPI_T * const rpi, dec_env_t * const de) {
++    uint32_t status;
++
++    // this is the definition of successful completion of phase 1
++    // it assures that status register is zero and all blocks in each tile have completed
++    if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM))
++        return 0;
++
++    status = apb_read(rpi, RPI_STATUS);
++
++    if ((status & 8) != 0)
++        return -1;
++
++    if ((status & 0x10) != 0)
++        return 1;
++
++    return -2;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Write STATUS register with expected end CTU address of previous slice
++
++static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) {
++    const HEVCPPS * const pps = s->ps.pps;
++    int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
++    int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
++    p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
++}
++
++static void wpp_pause(dec_env_t * const de, int ctb_row) {
++    p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25);
++    p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++    p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000);
++    p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2);
++}
++
++static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
++    const HEVCPPS *pps = s->ps.pps;
++    int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++    int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
++    int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;
++    int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
++    if (de->wpp_entry_x<2 && (de->wpp_entry_y<last_y || new_x>2) && de->PicWidthInCtbsY>2)
++        wpp_pause(de, last_y);
++    p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
++    if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_y<new_y)
++        p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static void new_slice_segment(dec_env_t * const de, const HEVCContext * const s) {
++    const HEVCSPS *sps = s->ps.sps;
++    const HEVCPPS *pps = s->ps.pps;
++
++    p1_apb_write(de, RPI_SPS0,
++        (sps->log2_min_cb_size << 0) +
++        (sps->log2_ctb_size << 4) +
++        (sps->log2_min_tb_size << 8) +
++        (sps->log2_max_trafo_size << 12) +
++        (sps->bit_depth << 16) +
++        (sps->bit_depth << 20) +
++        (sps->max_transform_hierarchy_depth_intra << 24) +
++        (sps->max_transform_hierarchy_depth_inter << 28));
++
++    p1_apb_write(de, RPI_SPS1,
++        (sps->pcm.bit_depth << 0) +
++        (sps->pcm.bit_depth_chroma << 4) +
++        (sps->pcm.log2_min_pcm_cb_size << 8) +
++        (sps->pcm.log2_max_pcm_cb_size << 12) +
++        (sps->separate_colour_plane_flag? 0:sps->chroma_format_idc << 16) +
++        (sps->amp_enabled_flag << 18) +
++        (sps->pcm_enabled_flag << 19) +
++        (sps->scaling_list_enable_flag << 20) +
++        (sps->sps_strong_intra_smoothing_enable_flag << 21));
++
++    p1_apb_write(de, RPI_PPS,
++        (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth << 0) +
++        (pps->cu_qp_delta_enabled_flag << 4) +
++        (pps->transquant_bypass_enable_flag << 5) +
++        (pps->transform_skip_enabled_flag << 6) +
++        (pps->sign_data_hiding_flag << 7) +
++        (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) +
++        (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) +
++        (pps->constrained_intra_pred_flag << 24));
++
++    if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de);
++
++    if (!s->sh.dependent_slice_segment_flag) {
++        int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++        int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
++        de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16);
++    }
++
++    p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static void write_slice(dec_env_t * const de, const HEVCContext * const s,
++                        const unsigned int slice_w, const unsigned int slice_h) {
++    uint32_t u32 =
++          (s->sh.slice_type << 12)
++        + (s->sh.slice_sample_adaptive_offset_flag[0] << 14)
++        + (s->sh.slice_sample_adaptive_offset_flag[1] << 15)
++        + (slice_w << 17)
++        + (slice_h << 24);
++
++    if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |=
++          (s->sh.max_num_merge_cand << 0)
++        + (s->sh.nb_refs[L0] << 4)
++        + (s->sh.nb_refs[L1] << 8);
++
++    if (s->sh.slice_type==HEVC_SLICE_B)
++        u32 |= s->sh.mvd_l1_zero_flag<<16;
++    p1_apb_write(de, RPI_SLICE, u32);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Wavefront mode
++
++static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s,
++                            const int do_bte, const int resetQPY, const int ctb_addr_ts) {
++    const HEVCSPS * const sps = s->ps.sps;
++    const HEVCPPS * const pps = s->ps.pps;
++
++    int ctb_size = 1<<sps->log2_ctb_size;
++    int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++
++    int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY;
++    int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY;
++
++    int endx = de->PicWidthInCtbsY-1;
++    int endy = ctb_row;
++
++    uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
++    uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
++
++    p1_apb_write(de, RPI_TILESTART, 0);
++    p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
++
++    if (do_bte)
++        p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
++
++    write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size);
++
++    if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
++
++    p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 0x60001 : 0x20001);
++    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Tiles mode
++
++static void new_entry_point(dec_env_t * const de, const HEVCContext * const s,
++                            const int do_bte, const int resetQPY, const int ctb_addr_ts) {
++    const HEVCSPS * const sps = s->ps.sps;
++    const HEVCPPS * const pps = s->ps.pps;
++
++    int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY;
++    int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY;
++
++    int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
++    int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
++
++    int endx = pps->col_bd[tile_x+1] - 1;
++    int endy = pps->row_bd[tile_y+1] - 1;
++
++    uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<<sps->log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns);
++    uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<<sps->log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
++
++    p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16));
++    p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
++
++    if (do_bte)
++        p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
++
++    write_slice(de, s, slice_w, slice_h);
++
++    if (resetQPY)
++        p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
++
++    p1_apb_write(de, RPI_MODE, (0xFFFF << 0)
++                             + (0x0 << 16)
++                             + ((tile_x==pps->num_tile_columns-1) << 17)
++                             + ((tile_y==pps->num_tile_rows-1) << 18));
++
++    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Doesn't attempt to remove from context as we should only do this at the end
++// of time or on create error
++static void
++dec_env_delete(dec_env_t * const de)
++{
++//    gpu_free(&de->gbuf);
++
++    av_freep(&de->cmd_fifo);
++    av_freep(&de->bit_fifo);
++
++    sem_destroy(&de->phase_wait);
++    av_free(de);
++}
++
++static dec_env_t *
++dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi)
++{
++    dec_env_t * const de = av_mallocz(sizeof(*de));
++    int i;
++
++    if (de == NULL)
++        return NULL;
++
++    de->avctx = avctx;
++    de->phase_no = RPIVID_PHASE_NEW;
++
++    sem_init(&de->phase_wait, 0, 0);
++
++    if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL)
++        goto fail;
++
++    if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL)
++        goto fail;
++
++    pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
++    for (i = 0; i != avctx->thread_count; ++i)
++    {
++        if (rpi->dec_envs[i] == NULL)
++        {
++            rpi->dec_envs[i] = de;
++            break;
++        }
++    }
++    pthread_mutex_unlock(&rpi->phase_lock);
++
++    if (i == avctx->thread_count) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n");
++        goto fail;
++    }
++
++    return de;
++
++fail:
++    dec_env_delete(de);
++    return NULL;
++}
++
++
++static dec_env_t *
++dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi)
++{
++    dec_env_t * de = NULL;
++    const int ref_count = atomic_fetch_add(&rpi->ref_count, 1);
++
++    if (ref_count <= 0) {
++        // Already dead
++        av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");
++        return NULL;
++    }
++
++    for (int i = 0; i != avctx->thread_count; ++i) {
++        if (rpi->dec_envs[i] == NULL)
++        {
++            de = dec_env_new(avctx, rpi);
++            break;
++        }
++        if (rpi->dec_envs[i]->avctx == avctx)
++        {
++            de = rpi->dec_envs[i];
++            break;
++        }
++    }
++    return de;
++}
++
++// Call at end of fn
++// Used to ensure we aren't in a worker thread when killed
++static void
++dec_env_release(RPI_T * const rpi, dec_env_t * const de)
++{
++    const int n = atomic_fetch_sub(&rpi->ref_count, 1);
++    if (n == 1) {
++        sem_post(&rpi->ref_zero);
++    }
++}
++
++//----------------------------------------------------------------------------
++
++// Wait for a slot in the given phase
++// Any error return is probably fatal
++static int
++wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
++{
++    int needs_wait = 0;
++    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++
++    pthread_mutex_lock(&rpi->phase_lock);
++    if (p->last_order + 1 != de->decode_order) {
++        de->phase_wait_q_next = p->q;
++        p->q = de;
++        needs_wait = 1;
++    }
++    pthread_mutex_unlock(&rpi->phase_lock);
++
++    if (needs_wait) {
++        while (sem_wait(&de->phase_wait) == -1)
++        {
++            int err;
++            if ((err = errno) != EINTR)
++                return AVERROR(err);
++        }
++    }
++
++    de->phase_no = phase_no;
++    return 0;
++}
++
++static void
++post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
++{
++    dec_env_t * next_de = NULL;
++    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++    dec_env_t ** q = &p->q;
++
++    pthread_mutex_lock(&rpi->phase_lock);
++
++    p->last_order = de->decode_order;
++    while (*q != NULL) {
++        dec_env_t * const t_de = *q;
++
++        if (t_de->decode_order == p->last_order + 1) {
++            // This is us - remove from Q
++            *q = t_de->phase_wait_q_next;
++            t_de->phase_wait_q_next = NULL; // Tidy
++            next_de = t_de;
++            break;
++        }
++        q = &t_de->phase_wait_q_next;
++    }
++
++    pthread_mutex_unlock(&rpi->phase_lock);
++
++    if (next_de != NULL)
++        sem_post(&next_de->phase_wait);
++}
++
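++// An illustrative timeline of what wait_phase/post_phase buy us (frame
++// numbers and overlap invented for the example): each phase admits threads
++// strictly in decode_order, so two frames can be pipelined like
++//
++//   frame N:    phase 0 | phase 1 | phase 2
++//   frame N+1:      phase 0 | phase 1 | phase 2
++//
++// phase 0 of frame N+1 may run while frame N is in phase 1, but no frame
++// ever overtakes another within the same phase.
++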
++// Wait & signal stuff s.t. threads in other phases can continue
++static void
++abort_phases(RPI_T * const rpi, dec_env_t * const de)
++{
++    for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) {
++        wait_phase(rpi, de, i);
++        post_phase(rpi, de, i);
++    }
++    de->phase_no = RPIVID_PHASE_NEW;
++}
++
++// Start timing for phase
++// Stats only - no actual effect
++static inline void tstart_phase(RPI_T * const rpi, const int phase_no)
++{
++#if OPT_PHASE_TIMING
++    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++    const int64_t now = tus64();
++    if (p->phase_time != 0)
++        p->time_out_phase += now - p->phase_time;
++    p->phase_time = now;
++#endif
++}
++
++#if OPT_PHASE_TIMING
++static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n)
++{
++    uint64_t tsum = 0;
++    unsigned int i;
++    for (i = 0; i != avg_n; ++i)
++        tsum += p->time_stash[(p->i3 - i) & 15];
++    for (i = 0; i != 9; ++i) {
++        if (time_thresholds[i] * 1000 * avg_n > tsum)
++            break;
++    }
++    return i;
++}
++#endif
++
++// End timing for phase
++// Stats only - no actual effect
++static inline void tend_phase(RPI_T * const rpi, const int phase_no)
++{
++#if OPT_PHASE_TIMING
++    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++    const uint64_t now = tus64();
++    const uint64_t in_time = now - p->phase_time;
++
++    p->time_in_phase += in_time;
++    p->phase_time = now;
++    p->time_stash[p->i3] = in_time;
++    if (in_time > p->max_phase_time) {
++        p->max_phase_time = in_time;
++        p->max_time_decode_order = p->last_order;
++    }
++    ++p->time_bins[tavg_bin_phase(p, 1)];
++    ++p->time_bins3[tavg_bin_phase(p, 3)];
++    ++p->time_bins5[tavg_bin_phase(p, 5)];
++
++    p->i3 = (p->i3 + 1) & 15;
++#endif
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Start frame
++
++static int rpi_hevc_start_frame(
++    AVCodecContext * avctx,
++    const uint8_t *buffer,
++    uint32_t size) {
++
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    dec_env_t * const de = dec_env_get(avctx, rpi);
++    const HEVCContext * const s = avctx->priv_data;
++    const HEVCSPS * const sps = s->ps.sps;
++    const unsigned int CtbSizeY = 1U << sps->log2_ctb_size;
++
++#if TRACE_ENTRY
++    printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++    if (de == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__);
++        return -1;
++    }
++
++    de->phase_no = RPIVID_PHASE_START;
++    de->decode_order = ++rpi->decode_order; // *** atomic?
++
++    ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
++
++    if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state);
++        return -1;
++    }
++    de->state = RPIVID_DECODE_START;
++
++    de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY;   //7-15
++    de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY; //7-17
++    de->bit_len = 0;
++    de->cmd_len = 0;
++
++#if TRACE_ENTRY
++    printf(">>> %s[%p]\n", __func__, de);
++#endif
++
++    dec_env_release(rpi, de);
++    return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Slice messages
++
++static void msg_slice(dec_env_t * const de, const uint16_t msg) {
++    de->slice_msgs[de->num_slice_msgs++] = msg;
++}
++
++static void program_slicecmds(dec_env_t * const de, const int sliceid) {
++    int i;
++    p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8));
++    for(i=0; i < de->num_slice_msgs; i++) {
++        p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff);
++    }
++}
++
++static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) {
++    const HEVCSPS * const sps = s->ps.sps;
++    const HEVCPPS * const pps = s->ps.pps;
++    const SliceHeader *sh = &s->sh;
++
++    int weightedPredFlag, i, rIdx;
++    uint16_t cmd_slice;
++    unsigned int collocated_from_l0_flag;
++
++    de->num_slice_msgs=0;
++    de->dpbno_col = 0;
++    cmd_slice = 0;
++    if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1;
++    if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2;
++    if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3;
++
++    if (sh->slice_type!=HEVC_SLICE_I) {
++        cmd_slice += sh->nb_refs[L0]<<2;
++        cmd_slice += sh->nb_refs[L1]<<6;
++    }
++
++    if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B)
++        cmd_slice |= sh->max_num_merge_cand<<11;
++
++    collocated_from_l0_flag =
++        !sh->slice_temporal_mvp_enabled_flag ?
++            0 :
++        sh->slice_type == HEVC_SLICE_B ?
++            (sh->collocated_list == L0) :
++            (sh->slice_type==HEVC_SLICE_P);
++    cmd_slice |= collocated_from_l0_flag<<14;
++
++    if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) {
++
++        int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past
++        for(i=L0; i<=L1; i++) {
++            for(rIdx=0; rIdx < sh->nb_refs[i]; rIdx++) {
++                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
++                HEVCFrame *c = s->ref; // CurrentPicture
++                if (c->poc < f->poc) NoBackwardPredFlag = 0;
++            }
++        }
++
++        if (sps->sps_temporal_mvp_enabled_flag)
++        {
++            const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ?
++                s->ref->refPicList + 0 :
++                s->ref->refPicList + 1;
++            de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB;
++        }
++
++        cmd_slice += NoBackwardPredFlag<<10;
++        msg_slice(de, cmd_slice);
++
++        // Write reference picture descriptions
++        weightedPredFlag = sh->slice_type==HEVC_SLICE_P?
++            pps->weighted_pred_flag : pps->weighted_bipred_flag;
++
++        for(i=L0; i<=L1; i++)
++            for(rIdx=0; rIdx < sh->nb_refs[i]; rIdx++) {
++                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
++                HEVCFrame *c = s->ref; // CurrentPicture
++                int pic = f - s->DPB;
++                // Make sure pictures are in range 0 to 15
++                int adjusted_pic = f<c? pic : pic-1;
++                int lt = s->ref->refPicList[i].isLongTerm[rIdx];
++                msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6));
++                msg_slice(de, f->poc);
++                if (weightedPredFlag) {
++                    msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s->sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3));
++                    msg_slice(de, (i?s->sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff);
++                    msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3));
++                    msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff);
++                    msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3));
++                    msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff);
++                }
++            }
++    }
++    else
++        msg_slice(de, cmd_slice);
++
++    msg_slice(de, ((sh->beta_offset/2)&15)
++        + (((sh->tc_offset/2)&15) << 4)
++        + (sh->disable_deblocking_filter_flag << 8)
++        + (sh->slice_loop_filter_across_slices_enabled_flag << 9)
++        + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK
++
++    msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF
++}
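++
++// Worked example of the cmd_slice packing above (values invented): a P
++// slice with 2 L0 refs, 5 merge candidates, collocated_from_l0_flag and
++// NoBackwardPredFlag set gives
++//   cmd_slice = 2 + (2<<2) + (1<<10) + (5<<11) + (1<<14) = 0x6c0a
++// which msg_slice() queues and program_slicecmds() later writes out
++// 16 bits at a time.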
++
++
++//////////////////////////////////////////////////////////////////////////////
++
++static void rpi_hevc_abort_frame(AVCodecContext * const avctx) {
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    dec_env_t * const de = dec_env_get(avctx, rpi);
++
++#if TRACE_ENTRY
++    printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++    if (de == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__);
++        return;
++    }
++
++    switch (de->state) {
++        case RPIVID_DECODE_NEW:
++        case RPIVID_DECODE_END:
++            // Expected transition
++            break;
++
++        case RPIVID_DECODE_SLICE:
++            // Error transition
++            av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n");
++            break;
++
++        case RPIVID_DECODE_START:
++        default:
++            av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state);
++            break;
++    }
++
++    abort_phases(rpi, de);
++    de->state = RPIVID_DECODE_NEW;
++
++    dec_env_release(rpi, de);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// End frame
++
++static int rpi_hevc_end_frame(AVCodecContext * const avctx) {
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    const HEVCContext * const s = avctx->priv_data;
++    const HEVCPPS * const pps = s->ps.pps;
++    const HEVCSPS * const sps = s->ps.sps;
++    dec_env_t * const de = dec_env_get(avctx, rpi);
++    AVFrame * const f = s->ref->frame;
++    const unsigned int dpbno_cur = s->ref - s->DPB;
++    vid_vc_addr_t cmds_vc;
++    vid_vc_addr_t pu_base_vc;
++    unsigned int pu_stride;
++    vid_vc_addr_t coeff_base_vc;
++    unsigned int coeff_stride;
++    unsigned int i;
++    int rv = 0;
++    int status = 0;
++    int coeffbuf_sem_claimed = 0;
++
++#if TRACE_ENTRY
++    printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++    if (de == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__);
++        return AVERROR_BUG;  // Should never happen
++    }
++
++    if (de->state != RPIVID_DECODE_SLICE) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__,
de->state); ++ rv = AVERROR_UNKNOWN; ++ goto fail; ++ } ++ de->state = RPIVID_DECODE_END; ++ ++ // End of command compilation ++ { ++ const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1; ++ const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1; ++ if (pps->entropy_coding_sync_enabled_flag) { ++ if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2) ++ wpp_pause(de, last_y); ++ } ++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); ++ } ++ ++ // Phase 0 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 0); ++ rpi_sem_wait(&rpi->bitbuf_sem); ++ tstart_phase(rpi, 0); ++ ++ // Copy cmds & bits into gpu side buffer ++ // Layout: CMDS, BITS ++ { ++ uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm; ++ vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc; ++ unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD); ++ ++ uint8_t * p = armbase + rnd64(cmd_bytes); ++ uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes; ++ ++ cmds_vc = vcbase; ++ ++ // Copy all the bits & update bitstream cmds to point at the right bits ++ for (i = 0; i < de->bit_len; ++i) ++ { ++ const unsigned int seg_len = de->bit_fifo[i].len; ++ ++ if (p + seg_len > eobits) { ++ status = -1; ++ break; ++ } ++ ++ memcpy(p, de->bit_fifo[i].ptr, seg_len); ++ de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase); ++ ++ p += rnd64(seg_len); ++ } ++ ++ memcpy(armbase, de->cmd_fifo, cmd_bytes); ++ } ++ ++ if (status == 0) ++ { ++ if (++rpi->bitbuf_no >= RPIVID_BITBUFS) ++ rpi->bitbuf_no = 0; ++ } ++ else ++ { ++ sem_post(&rpi->bitbuf_sem); ++ av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n"); ++ rv = AVERROR_BUFFER_TOO_SMALL; ++ } ++ ++ tend_phase(rpi, 0); ++ post_phase(rpi, de, 0); ++ ++ if (status < 0) ++ goto fail; ++ ++ // Phase 1 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 1); ++ rpi_sem_wait(&rpi->coeffbuf_sem); ++ coeffbuf_sem_claimed = 1; ++ tstart_phase(rpi, 1); ++ ++ for (;;) ++ { ++ // (Re-)allocate PU/COEFF stream space ++ const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes; ++ unsigned int pu_size; ++ ++ pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc; ++ pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY); ++ pu_size = pu_stride * de->PicHeightInCtbsY; ++ ++ if (pu_size > total_size) { ++ status = -1; ++ break; ++ } ++ ++ // Allocate all remaining space to coeff ++ coeff_base_vc = pu_base_vc + pu_size; ++ coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64 ++ ++ apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc); ++ apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride); ++ apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc); ++ apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride); ++ ++ // Trigger command FIFO ++ apb_write(rpi, RPI_CFNUM, de->cmd_len); ++#if TRACE_DEV ++ apb_dump_regs(rpi, 0x0, 32); ++ apb_dump_regs(rpi, 0x8000, 24); ++ axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD)); ++#endif ++ apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc); ++ ++ int_wait(rpi, 1); ++ ++ status = check_status(rpi, de); ++ ++ if (status != 1) ++ break; ++ ++ // Status 1 means out of PU space so try again with more ++ // If we ran out of Coeff space then we are out of memory - we could possibly realloc? 
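++ // Grow the PU space estimate by 50% and retry phase 1 with the larger split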
++ rpi->max_pu_msgs += rpi->max_pu_msgs / 2; ++ } ++ ++ // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we ++ // may reuse a live buffer when we kick the coeff sem ++ if (status == 0) ++ { ++ if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS) ++ rpi->coeffbuf_no = 0; ++ } ++ else ++ { ++ if (status == -1) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs); ++ rv = AVERROR_BUFFER_TOO_SMALL; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n"); ++ rv = AVERROR_INVALIDDATA; ++ } ++ } ++ ++ tend_phase(rpi, 1); ++ sem_post(&rpi->bitbuf_sem); ++ post_phase(rpi, de, 1); ++ ++ if (status != 0) ++ goto fail; ++ ++ // Phase 2 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 2); ++ ++ if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0) ++ { ++ // As we are in phase 2 already here we don't need to worry about ++ // coeffbuf_no despite the early exit ++ post_phase(rpi, de, 2); ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n"); ++ goto fail; ++ } ++ ++ tstart_phase(rpi, 2); ++ ++ apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc); ++ apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride); ++ apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc); ++ apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride); ++ ++ apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f)); ++ apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f)); ++ apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128); ++ apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128); ++ ++ // Keep the last thing we resolved as fallback for any ref we fail to ++ // resolve. As a final fallback use our current frame. The pels might ++ // not be there yet but at least the memory is valid. ++ // ++ // Attempt to resolve the entire DPB - we could note what we have used ++ // in ref lists but probably simpler and more reliable to set the whole thing ++ { ++ AVFrame * fallback_frame = f; ++ for (i = 0; i != 16; ++i) { ++ // Avoid current frame ++ const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ?
s->DPB + i + 1 : s->DPB + i; ++ AVFrame * fr = hevc_fr->frame; ++ ++ if (fr != NULL && ++ av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0) ++ { ++ fallback_frame = fr; ++ } ++ else ++ { ++ fr = fallback_frame; ++ } ++ ++ apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr)); ++ apb_write(rpi, 0x9004+16*i, 0); ++ apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr)); ++ apb_write(rpi, 0x900C+16*i, 0); ++ } ++ } ++ ++ apb_write(rpi, RPI_CONFIG2, ++ (sps->bit_depth << 0) // BitDepthY ++ + (sps->bit_depth << 4) // BitDepthC ++ + ((sps->bit_depth>8) << 8) // BitDepthY ++ + ((sps->bit_depth>8) << 9) // BitDepthC ++ + (sps->log2_ctb_size <<10) ++ + (pps->constrained_intra_pred_flag <<13) ++ + (sps->sps_strong_intra_smoothing_enable_flag<<14) ++ + (sps->sps_temporal_mvp_enabled_flag <<15) ++ + (pps->log2_parallel_merge_level <<16) ++ + (s->sh.slice_temporal_mvp_enabled_flag <<19) ++ + (sps->pcm.loop_filter_disable_flag <<20) ++ + ((pps->cb_qp_offset&31) <<21) ++ + ((pps->cr_qp_offset&31) <<26)); ++ ++ apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width); ++ apb_write(rpi, RPI_CURRPOC, s->poc); ++ ++ // collocated reads/writes ++ if (sps->sps_temporal_mvp_enabled_flag) { ++ av_assert0(de->dpbno_col < RPIVID_COL_PICS); ++ av_assert0(dpbno_cur < RPIVID_COL_PICS); ++ ++ apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride); ++ apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride); ++ apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize); ++ apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize); ++ } ++ ++#if TRACE_DEV ++ apb_dump_regs(rpi, 0x0, 32); ++ apb_dump_regs(rpi, 0x8000, 24); ++#endif ++ ++ apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY); ++ apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block ++ ++ int_wait(rpi, 2); ++ ++ tend_phase(rpi, 2); ++ coeffbuf_sem_claimed = 0; ++ sem_post(&rpi->coeffbuf_sem); ++ // Set valid here to avoid race in resolving in any pending phase 2 ++ av_rpi_zc_set_valid_frame(f); ++ ++ post_phase(rpi, de, 2); ++ ++ // Flush frame for CPU access ++ // Arguably the best place would be at the start of phase 2 but here ++ // will overlap with the wait ++ // ++ // * Even better would be to have better lock/unlock control in ZC for external access ++ if (rpi->gpu_init_type == GPU_INIT_GPU) // * CMA is currently always uncached ++ { ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE); ++ rpi_cache_flush_finish(fe); ++ } ++ ++#if TRACE_ENTRY ++ printf(">>> %s[%p] OK\n", __func__, de); ++#endif ++ ++ dec_env_release(rpi, de); ++ return 0; ++ ++fail: ++ av_rpi_zc_set_broken_frame(f); ++ if (coeffbuf_sem_claimed) ++ sem_post(&rpi->coeffbuf_sem); ++ abort_phases(rpi, de); // Dummy any unresolved phases ++ ++#if TRACE_ENTRY ++ printf(">>> %s[%p] FAIL\n", __func__, de); ++#endif ++ ++ dec_env_release(rpi, de); ++ return rv; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) { ++ const int rpi_use_emu = 0; // FFmpeg removes emulation prevention bytes ++ const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware ++ const GetBitContext *gb = &s->HEVClc->gb; ++ const int len = 1 + gb->size_in_bits/8 - gb->index/8; ++ const void *ptr = &gb->buffer[gb->index/8]; ++ ++ p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // 
BFBASE set later ++ p1_apb_write(de, RPI_BFNUM, len); ++ p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop ++ p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6)); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Wavefront mode ++ ++static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) ++{ ++ const HEVCPPS * const pps = s->ps.pps; ++ ++ int i, resetQPY=1; ++ int indep = !s->sh.dependent_slice_segment_flag; ++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; ++ ++ if (ctb_addr_ts) ++ wpp_end_previous_slice(de, s, ctb_addr_ts); ++ pre_slice_decode(de, s); ++ WriteBitstream(de, s); ++ if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1) ++ WriteProb(de, s); ++ else if (ctb_col==0) ++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); ++ else ++ resetQPY=0; ++ program_slicecmds(de, s->slice_idx); ++ new_slice_segment(de, s); ++ wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts); ++ for (i=0; i<s->sh.num_entry_point_offsets; i++) { ++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; ++ int last_x = de->PicWidthInCtbsY-1; ++ if (de->PicWidthInCtbsY>2) ++ wpp_pause(de, ctb_row); ++ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2); ++ if (de->PicWidthInCtbsY==2) ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++ if (de->PicWidthInCtbsY==1) ++ WriteProb(de, s); ++ else ++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); ++ ctb_addr_ts += pps->column_width[0]; ++ wpp_entry_point(de, s, 0, 1, ctb_addr_ts); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Tiles mode ++ ++static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { ++ const HEVCPPS * const pps = s->ps.pps; ++ int i, resetQPY; ++ ++ if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts); ++ pre_slice_decode(de, s); ++ WriteBitstream(de, s); ++ resetQPY = ctb_addr_ts==0 ++ || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1] ++ || !s->sh.dependent_slice_segment_flag; ++ if (resetQPY) WriteProb(de, s); ++ program_slicecmds(de, s->slice_idx); ++ new_slice_segment(de, s); ++ new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts); ++ for (i=0; i<s->sh.num_entry_point_offsets; i++) { ++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY; ++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; ++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); ++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); ++ int last_x = pps->col_bd[tile_x+1]-1; ++ int last_y = pps->row_bd[tile_y+1]-1; ++ p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18)); ++ WriteProb(de, s); ++ ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y]; ++ new_entry_point(de, s, 0, 1, ctb_addr_ts); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int cabac_start_align(HEVCContext *s) ++{ ++ GetBitContext *gb = &s->HEVClc->gb; ++ skip_bits(gb, 1); ++ align_get_bits(gb); ++ // Should look at getting rid of this ++ return ff_init_cabac_decoder(&s->HEVClc->cc, ++ gb->buffer + get_bits_count(gb) / 8, ++ (get_bits_left(gb) + 7) / 8); ++} ++ ++static int rpi_hevc_decode_slice( ++ AVCodecContext *avctx, ++ const uint8_t *buffer, ++ uint32_t size) ++{ ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ HEVCContext * const s =
avctx->priv_data; ++ dec_env_t * const de = dec_env_get(avctx, rpi); ++ const HEVCPPS *pps = s->ps.pps; ++ int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; ++ ++#if TRACE_ENTRY ++ printf("<<< %s[%p]\n", __func__, de); ++#endif ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__); ++ return -1; ++ } ++ ++ if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); ++ return -1; ++ } ++ de->state = RPIVID_DECODE_SLICE; ++ ++// ff_hevc_cabac_init(s, ctb_addr_ts); ++ cabac_start_align(s); ++ if (s->ps.sps->scaling_list_enable_flag) ++ populate_scaling_factors(de, s); ++ pps->entropy_coding_sync_enabled_flag? wpp_decode_slice(de, s, ctb_addr_ts) ++ : decode_slice(de, s, ctb_addr_ts); ++#if TRACE_ENTRY ++ printf(">>> %s[%p]\n", __func__, de); ++#endif ++ dec_env_release(rpi, de); ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpivid_retrieve_data(void *logctx, AVFrame *frame) ++{ ++ int rv; ++ if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0) ++ av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n"); ++ return rv; ++} ++ ++static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ HEVCContext * const s = avctx->priv_data; ++ // Frame buffering + 1 output. Would need thread_count extra but we now ++ // alloc at the start of phase 2 so that is the only thread we need the ++ // extra buffer for. ++ const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1; ++ int rv; ++ ++ if (av_rpi_zc_in_use(avctx)) ++ { ++ const AVZcEnvPtr zc = avctx->opaque; ++ av_rpi_zc_set_decoder_pool_size(zc, pool_req); ++ rv = av_rpi_zc_get_buffer(zc, frame); // get_buffer2 would alloc ++ } ++ else ++ { ++ if (rpi->zc == NULL) { ++ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this ++ // Alloc inside lock to make sure we only ever alloc one ++ if (rpi->zc == NULL) { ++ rpi->zc = av_rpi_zc_int_env_alloc(s); ++ } ++ pthread_mutex_unlock(&rpi->phase_lock); ++ } ++ av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-) ++ rv = (rpi->zc == NULL) ?
AVERROR(ENOMEM) : ++ av_rpi_zc_get_buffer(rpi->zc, frame); ++ } ++ ++ if (rv == 0 && ++ (rv = ff_attach_decode_data(frame)) < 0) ++ { ++ av_frame_unref(frame); ++ } ++ ++ if (rv == 0) ++ { ++ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; ++ fdd->post_process = rpivid_retrieve_data; ++ } ++ ++ return rv; ++} ++ ++#if OPT_PHASE_TIMING ++static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins) ++{ ++ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n", ++ bins[0], bins[1], bins[2], bins[3], ++ bins[4], bins[5], bins[6], bins[7], bins[8]); ++} ++#endif ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpi_hevc_free(AVCodecContext *avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ ++#if TRACE_ENTRY ++ printf("<<< %s\n", __func__); ++#endif ++ ++ dec_env_release(rpi, NULL); ++ ++ // Wait for everything else to stop ++ { ++ struct timespec tt; ++ clock_gettime(CLOCK_REALTIME, &tt); ++ tt.tv_sec += 2; ++ while (sem_timedwait(&rpi->ref_zero, &tt) == -1) { ++ const int err = errno; ++ if (err == ETIMEDOUT) { ++ av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n"); ++ return -1; ++ } ++ if (err != EINTR) { ++ av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err); ++ break; ++ } ++ } ++ } ++ ++#if OPT_PHASE_TIMING ++ { ++ unsigned int i; ++ for (i = 0; i != RPIVID_PHASES; ++i) { ++ const phase_wait_env_t * const p = rpi->phase_reqs + i; ++ av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i, ++ (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000), ++ (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000)); ++ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d >\n", ++ time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3], ++ time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]); ++ log_bin_phase(avctx, p->time_bins); ++ log_bin_phase(avctx, p->time_bins3); ++ log_bin_phase(avctx, p->time_bins5); ++ av_log(avctx, AV_LOG_INFO, "Longest duration: %ums @ frame %u\n", ++ (unsigned int)(p->max_phase_time / 1000), ++ p->max_time_decode_order); ++ } ++ av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs); ++ } ++#endif ++ ++ if (rpi->dec_envs != NULL) ++ { ++ for (int i = 0; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) { ++ dec_env_delete(rpi->dec_envs[i]); ++ } ++ av_freep(&rpi->dec_envs); ++ } ++ ++ av_rpi_zc_int_env_freep(&rpi->zc); ++ ++ gpu_free(&rpi->gcolbuf); ++ ++ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { ++ gpu_free(rpi->gbitbufs + i); ++ } ++ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { ++ gpu_free(rpi->gcoeffbufs + i); ++ } ++ ++ unmap_devp(&rpi->regs, REGS_SIZE); ++ unmap_devp(&rpi->ints, INTS_SIZE); ++ ++ if (rpi->gpu_init_type > 0) ++ rpi_mem_gpu_uninit(); ++ ++ if (rpi->mbox_fd >= 0) { ++ mbox_release_clock(rpi->mbox_fd); ++ mbox_close(rpi->mbox_fd); ++ } ++ ++ sem_destroy(&rpi->ref_zero); ++ sem_destroy(&rpi->coeffbuf_sem); ++ sem_destroy(&rpi->bitbuf_sem); ++ ++#if TRACE_ENTRY ++ printf(">>> %s\n", __func__); ++#endif ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpi_hevc_init(AVCodecContext *avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++// const char *err; ++ ++#if TRACE_ENTRY ++ printf("<<< %s\n", __func__); ++#endif ++ ++ if
(avctx->width>4096 || avctx->height>4096) { ++ av_log(NULL, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height); ++ return AVERROR(ENOTSUP); ++ } ++ ++ memset(rpi, 0, sizeof(*rpi)); ++ ++ rpi->mbox_fd = -1; ++ rpi->decode_order = 0; ++ ++ // Initial PU/COEFF stream buffer split chosen as worst case seen so far ++ rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU ++ ++ ++ atomic_store(&rpi->ref_count, 1); ++ sem_init(&rpi->ref_zero, 0, 0); ++ ++ sem_init(&rpi->bitbuf_sem, 0, RPIVID_BITBUFS); ++ sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS); ++ ++ pthread_mutex_init(&rpi->phase_lock, NULL); ++ ++ if ((rpi->mbox_fd = mbox_open()) < 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n"); ++ goto fail; ++ } ++ mbox_request_clock(rpi->mbox_fd); ++ ++ if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL || ++ (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n"); ++ goto fail; ++ } ++ ++ if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n"); ++ goto fail; ++ } ++ ++ if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count); ++ goto fail; ++ } ++ ++ rpi->col_stride = rnd64(avctx->width); ++ rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4); ++ if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n"); ++ goto fail; ++ } ++ ++ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { ++ if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i); ++ goto fail; ++ } ++ } ++ ++ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { ++ if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i); ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ rpi_hevc_free(avctx); ++ return AVERROR_EXTERNAL; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++const AVHWAccel ff_hevc_rpi4_8_hwaccel = { ++ .name = "hevc_rpi4_8", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_RPI4_8, ++ .alloc_frame = rpivid_hevc_alloc_frame, ++ .start_frame = rpi_hevc_start_frame, ++ .end_frame = rpi_hevc_end_frame, ++ .abort_frame = rpi_hevc_abort_frame, ++ .decode_slice = rpi_hevc_decode_slice, ++ .init = rpi_hevc_init, ++ .uninit = rpi_hevc_free, ++ .priv_data_size = sizeof(RPI_T), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ ++const AVHWAccel ff_hevc_rpi4_10_hwaccel = { ++ .name = "hevc_rpi4_10", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_RPI4_10, ++ .alloc_frame = rpivid_hevc_alloc_frame, ++ .start_frame = rpi_hevc_start_frame, ++ .end_frame = rpi_hevc_end_frame, ++ .abort_frame = rpi_hevc_abort_frame, ++ .decode_slice = rpi_hevc_decode_slice, ++ .init = rpi_hevc_init, ++ .uninit = rpi_hevc_free, ++ .priv_data_size = sizeof(RPI_T), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ +diff --git a/libavcodec/v4l2_phase.c b/libavcodec/v4l2_phase.c +new file mode 100644 +index 0000000000..0a7f6abd33 +--- /dev/null ++++ b/libavcodec/v4l2_phase.c +@@ -0,0 
+1,140 @@ ++// v4l2_phase.c ++ ++#include <pthread.h> ++#include <stdint.h> ++#include <stdlib.h> ++ ++#include "libavutil/log.h" ++#include "v4l2_phase.h" ++ ++typedef struct phase_envss { ++ unsigned int last_order; ++ pthread_mutex_t lock; ++ pthread_cond_t cond; ++} phase_env; ++ ++struct V4L2PhaseControl { ++ unsigned int order; ++ unsigned int phase_count; ++ phase_env p[V4L2PHASE_PHASE_COUNT]; ++}; ++ ++ ++unsigned int ff_v4l2_phase_order_next(V4L2PhaseControl * const pc) ++{ ++ return ++pc->order; ++} ++ ++// Phase isn't required but it acts as a check that we know what we are doing ++int ++ff_v4l2_phase_claim(V4L2PhaseInfo * const pi, unsigned int phase) ++{ ++ V4L2PhaseControl *const pc = pi->ctrl; ++ phase_env * const p = pc->p + phase; ++ ++ if (pi->n2 != phase * 2) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Unexpected phase: req=%d, cur=%d/%d\n", __func__, phase, pi->n2 >> 1, pi->n2 & 1); ++ return -1; ++ } ++ ++ pthread_mutex_lock(&p->lock); ++ ++ while (pi->order != p->last_order + 1) { ++ pthread_cond_wait(&p->cond, &p->lock); ++ } ++ ++ pi->n2++; ++ pthread_mutex_unlock(&p->lock); ++ return 0; ++} ++ ++int ++ff_v4l2_phase_release(V4L2PhaseInfo * const pi, unsigned int phase) ++{ ++ V4L2PhaseControl *const pc = pi->ctrl; ++ phase_env * const p = pc->p + phase; ++ ++ if (pi->n2 != ((phase << 1) | 1)) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Unexpected phase: req=%d, cur=%d/%d\n", __func__, phase, pi->n2 >> 1, pi->n2 & 1); ++ return -1; ++ } ++ ++ if (pi->order != p->last_order + 1) { ++ av_log(NULL, AV_LOG_ERROR, "%s: order_mismatch\n", __func__); ++ return -1; ++ } ++ ++ pthread_mutex_lock(&p->lock); ++ p->last_order = pi->order; ++ pi->n2++; ++ pthread_cond_broadcast(&p->cond); ++ pthread_mutex_unlock(&p->lock); ++ return 0; ++} ++ ++// Init the PhaseInfo, assign a new order, claim phase 0 ++int ++ff_v4l2_phase_start(V4L2PhaseInfo * const pi, V4L2PhaseControl * const pc) ++{ ++ pi->n2 = 0; ++ pi->ctrl = pc; ++ pi->order = ff_v4l2_phase_order_next(pc); ++ return ff_v4l2_phase_claim(pi, 0); ++} ++ ++// Release any claimed phase and claim+release all remaining phases ++void ff_v4l2_phase_abort(V4L2PhaseInfo * const pi) ++{ ++ V4L2PhaseControl *const pc = pi->ctrl; ++ ++ // Nothing to do ++ if (pi->n2 == 0 || pi->n2 >= pc->phase_count * 2) ++ return; ++ ++ // Run through all remaining phases ++ do { ++ if ((pi->n2 & 1) == 0) ++ ff_v4l2_phase_claim(pi, pi->n2 >> 1); ++ else ++ ff_v4l2_phase_release(pi, pi->n2 >> 1); ++ } while (pi->n2 < pc->phase_count * 2); ++} ++ ++ ++V4L2PhaseControl * ++ff_v4l2_phase_control_new(unsigned int phase_count) ++{ ++ V4L2PhaseControl * pc; ++ unsigned int i; ++ if (phase_count > V4L2PHASE_PHASE_COUNT) ++ return NULL; ++ if ((pc = av_mallocz(sizeof(*pc))) == NULL) ++ return NULL; ++ pc->phase_count = phase_count; ++ for (i = 0; i != phase_count; ++i) { ++ phase_env * const p = pc->p + i; ++ p->last_order = 0; ++ pthread_mutex_init(&p->lock, NULL); ++ pthread_cond_init(&p->cond, NULL); ++ } ++ return pc; ++} ++ ++void ++ff_v4l2_phase_control_deletez(V4L2PhaseControl ** const ppc) ++{ ++ V4L2PhaseControl * const pc = *ppc; ++ unsigned int i; ++ ++ if (pc == NULL) ++ return; ++ *ppc = NULL; ++ ++ for (i = 0; i != pc->phase_count; ++i) { ++ phase_env * const p = pc->p + i; ++ pthread_mutex_destroy(&p->lock); ++ pthread_cond_destroy(&p->cond); ++ } ++} ++ ++ +diff --git a/libavcodec/v4l2_phase.h b/libavcodec/v4l2_phase.h +new file mode 100644 +index 0000000000..392f22b988 +--- /dev/null ++++ b/libavcodec/v4l2_phase.h +@@ -0,0 +1,37 @@ ++// v4l2_phase.h ++#ifndef AVCODEC_V4L2_PHASE_H ++#define
AVCODEC_V4L2_PHASE_H ++ ++#define V4L2PHASE_PHASE_COUNT 2 ++ ++struct V4L2PhaseControl; ++typedef struct V4L2PhaseControl V4L2PhaseControl; ++ ++typedef struct V4L2PhaseInfo { ++ unsigned int n2; // (phase << 1) | (claimed) ++ unsigned int order; ++ V4L2PhaseControl * ctrl; ++} V4L2PhaseInfo; ++ ++unsigned int ff_v4l2_phase_order_next(V4L2PhaseControl * const pc); ++ ++static inline int ff_v4l2_phase_started(const V4L2PhaseInfo * const pi) ++{ ++ return pi->n2 != 0; ++} ++ ++// Init the PhaseInfo, assign a new order, claim phase 0 ++int ff_v4l2_phase_start(V4L2PhaseInfo * const pi, V4L2PhaseControl * const pc); ++ ++// Phase isn't required but it acts as a check that we know what we are doing ++int ff_v4l2_phase_claim(V4L2PhaseInfo * const pi, unsigned int phase); ++int ff_v4l2_phase_release(V4L2PhaseInfo * const pi, unsigned int phase); ++ ++// Release any claimed phase and claim+release all remaining phases ++void ff_v4l2_phase_abort(V4L2PhaseInfo * const pi); ++ ++ ++V4L2PhaseControl * ff_v4l2_phase_control_new(unsigned int phase_count); ++void ff_v4l2_phase_control_deletez(V4L2PhaseControl ** const ppc); ++ ++#endif +diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c +new file mode 100644 +index 0000000000..4ca42d29ec +--- /dev/null ++++ b/libavcodec/v4l2_request.c +@@ -0,0 +1,1054 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include <fcntl.h> ++#include <linux/media.h> ++#include <sys/ioctl.h> ++#include <sys/mman.h> ++#include <sys/select.h> ++#include <sys/sysmacros.h> ++ ++#include <drm_fourcc.h> ++#include <libudev.h> ++ ++#include "decode.h" ++#include "internal.h" ++#include "v4l2_request.h" ++#include "v4l2_phase.h" ++ ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... ++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ return req ?
v4l2_timeval_to_ns(&req->capture.buffer.timestamp) : 0; ++} ++ ++int ff_v4l2_request_start_phase_control(AVFrame *frame, struct V4L2PhaseControl * ctrl) ++{ ++ V4L2RequestDescriptor * const req = (V4L2RequestDescriptor*)frame->data[0]; ++ return ff_v4l2_phase_start(&req->phase, ctrl); ++} ++ ++void ff_v4l2_request_abort_phase_control(AVFrame *frame) ++{ ++ if (frame != NULL && frame->data[0] != NULL) { ++ V4L2RequestDescriptor *const req = (V4L2RequestDescriptor *)frame->data[0]; ++ ff_v4l2_phase_abort(&req->phase); ++ } ++} ++ ++int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ memset(&req->drm, 0, sizeof(AVDRMFrameDescriptor)); ++ req->output.used = 0; ++ return 0; ++} ++ ++int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ if (req->output.used + size + (AV_INPUT_BUFFER_PADDING_SIZE * 4) <= req->output.size) { ++ memcpy(req->output.addr + req->output.used, data, size); ++ req->output.used += size; ++ } else { ++ av_log(avctx, AV_LOG_ERROR, "%s: output.used=%u output.size=%u size=%u\n", __func__, req->output.used, req->output.size, size); ++ } ++ return 0; ++} ++ ++static int v4l2_request_controls(V4L2RequestContext *ctx, int request_fd, unsigned long type, struct v4l2_ext_control *control, int count) ++{ ++ struct v4l2_ext_controls controls = { ++ .controls = control, ++ .count = count, ++ .request_fd = request_fd, ++ .which = (request_fd >= 0) ? V4L2_CTRL_WHICH_REQUEST_VAL : 0, ++ }; ++ ++ if (!control || !count) ++ return 0; ++ ++ return ioctl(ctx->video_fd, type, &controls); ++} ++ ++static int v4l2_request_set_controls(V4L2RequestContext *ctx, int request_fd, struct v4l2_ext_control *control, int count) ++{ ++ return v4l2_request_controls(ctx, request_fd, VIDIOC_S_EXT_CTRLS, control, count); ++} ++ ++int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ ret = v4l2_request_controls(ctx, -1, VIDIOC_S_EXT_CTRLS, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return ret; ++} ++ ++int ff_v4l2_request_get_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ ret = v4l2_request_controls(ctx, -1, VIDIOC_G_EXT_CTRLS, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return ret; ++} ++ ++int ff_v4l2_request_query_control(AVCodecContext *avctx, struct v4l2_query_ext_ctrl *control) ++{ ++ int ret; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERY_EXT_CTRL, control); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: query control failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id) ++{ ++ int ret; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_queryctrl control = { ++ .id = id, ++ }; ++ ++ ret = ioctl(ctx->video_fd, 
VIDIOC_QUERYCTRL, &control); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: query control failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return control.default_value; ++} ++ ++static int v4l2_request_queue_buffer(V4L2RequestContext *ctx, int request_fd, V4L2RequestBuffer *buf, uint32_t flags) ++{ ++ struct v4l2_plane planes[1] = {}; ++ struct v4l2_buffer buffer = { ++ .type = buf->buffer.type, ++ .memory = buf->buffer.memory, ++ .index = buf->index, ++ .timestamp.tv_usec = ctx->timestamp, ++ .bytesused = buf->used, ++ .request_fd = request_fd, ++ .flags = ((request_fd >= 0) ? V4L2_BUF_FLAG_REQUEST_FD : 0) | flags, ++ }; ++ ++ buf->buffer.timestamp = buffer.timestamp; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { ++ planes[0].bytesused = buf->used; ++ buffer.bytesused = 0; ++ buffer.length = 1; ++ buffer.m.planes = planes; ++ } ++ ++ return ioctl(ctx->video_fd, VIDIOC_QBUF, &buffer); ++} ++ ++static int v4l2_request_dequeue_buffer(V4L2RequestContext *ctx, V4L2RequestBuffer *buf) ++{ ++ int ret; ++ struct v4l2_plane planes[1] = {}; ++ struct v4l2_buffer buffer = { ++ .type = buf->buffer.type, ++ .memory = buf->buffer.memory, ++ .index = buf->index, ++ }; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { ++ buffer.length = 1; ++ buffer.m.planes = planes; ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_DQBUF, &buffer); ++ if (ret < 0) ++ return ret; ++ ++ buf->buffer.timestamp = buffer.timestamp; ++ return 0; ++} ++ ++const uint32_t v4l2_request_capture_pixelformats[] = { ++#if CONFIG_SAND ++ V4L2_PIX_FMT_NV12_COL128, ++ V4L2_PIX_FMT_NV12_10_COL128, ++#endif ++ V4L2_PIX_FMT_NV12, ++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED ++ V4L2_PIX_FMT_SUNXI_TILED_NV12, ++#endif ++}; ++ ++static int v4l2_request_set_drm_descriptor(V4L2RequestDescriptor *req, struct v4l2_format *format) ++{ ++ AVDRMFrameDescriptor *desc = &req->drm; ++ AVDRMLayerDescriptor *layer = &desc->layers[0]; ++ uint32_t pixelformat = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat; ++ ++ switch (pixelformat) { ++ case V4L2_PIX_FMT_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#if CONFIG_SAND ++ case V4L2_PIX_FMT_NV12_COL128: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(format->fmt.pix.bytesperline); ++ break; ++ case V4L2_PIX_FMT_NV12_10_COL128: ++ layer->format = DRM_FORMAT_P030; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(format->fmt.pix.bytesperline); ++ break; ++#endif ++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED ++ case V4L2_PIX_FMT_SUNXI_TILED_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; ++ break; ++#endif ++ default: ++ return -1; ++ } ++ ++ desc->nb_objects = 1; ++ desc->objects[0].fd = req->capture.fd; ++ desc->objects[0].size = req->capture.size; ++ ++ desc->nb_layers = 1; ++ layer->nb_planes = 2; ++ ++ layer->planes[0].object_index = 0; ++ layer->planes[0].offset = 0; ++ layer->planes[0].pitch = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? 
format->fmt.pix_mp.plane_fmt[0].bytesperline : format->fmt.pix.bytesperline; ++#if CONFIG_SAND ++ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = format->fmt.pix.height * 128; ++ layer->planes[0].pitch = format->fmt.pix.width; ++ layer->planes[1].pitch = format->fmt.pix.width; ++ } ++ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = format->fmt.pix.height * 128; ++ layer->planes[0].pitch = format->fmt.pix.width * 2; // Lies but it keeps DRM import happy ++ layer->planes[1].pitch = format->fmt.pix.width * 2; ++ } ++ else ++#endif ++ { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = layer->planes[0].pitch * (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.height : format->fmt.pix.height); ++ layer->planes[1].pitch = layer->planes[0].pitch; ++ } ++ ++ return 0; ++} ++ ++static int v4l2_request_queue_decode(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ struct timeval tv = { 2, 0 }; ++ fd_set except_fds; ++ int ret; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p used=%u controls=%d index=%d fd=%d request_fd=%d first_slice=%d last_slice=%d\n", __func__, avctx, req->output.used, count, req->capture.index, req->capture.fd, req->request_fd, first_slice, last_slice); ++ ++ if (first_slice) ++ ctx->timestamp++; ++ ++ ret = v4l2_request_set_controls(ctx, req->request_fd, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed for request %d, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ memset(req->output.addr + req->output.used, 0, AV_INPUT_BUFFER_PADDING_SIZE * 4); ++ ++ ret = v4l2_request_queue_buffer(ctx, req->request_fd, &req->output, last_slice ? 0 : V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ if (first_slice) { ++ ret = v4l2_request_queue_buffer(ctx, -1, &req->capture, 0); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ } ++ ++ // NOTE: do we need to dequeue when request fails/timeout? ++ ++ // 4. 
queue request and wait ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_QUEUE, NULL); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ goto fail; ++ } ++ ++ FD_ZERO(&except_fds); ++ FD_SET(req->request_fd, &except_fds); ++ ++ ret = select(req->request_fd + 1, NULL, NULL, &except_fds, &tv); ++ if (ret == 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: request %d timeout\n", __func__, req->request_fd); ++ goto fail; ++ } else if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: select request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ goto fail; ++ } ++ ++ ret = v4l2_request_dequeue_buffer(ctx, &req->output); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ if (last_slice) { ++ if (ff_v4l2_phase_started(&req->phase)) { ++ ff_v4l2_phase_release(&req->phase, 0); ++ ff_v4l2_phase_claim(&req->phase, 1); ++ } ++ ++ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); ++ ++ if (ff_v4l2_phase_started(&req->phase)) { ++ ff_v4l2_phase_release(&req->phase, 1); ++ } ++ ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ } ++ ++ // TODO: check errors ++ // buffer.flags & V4L2_BUF_FLAG_ERROR ++ ++ if (last_slice) ++ return v4l2_request_set_drm_descriptor(req, &ctx->format); ++ ++ return 0; ++ ++fail: ++ ret = v4l2_request_dequeue_buffer(ctx, &req->output); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ ++ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ ++ return -1; ++} ++ ++int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ ++ // fall back to queue each slice as a full frame ++ if ((req->output.capabilities & V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) != V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) ++ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); ++ ++ return v4l2_request_queue_decode(avctx, frame, control, count, first_slice, last_slice); ++} ++ ++int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count) ++{ ++ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); ++} ++ ++static int v4l2_request_try_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat) ++{ ++ V4L2RequestContext *ctx = 
avctx->internal->hwaccel_priv_data; ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = 0, ++ .type = type, ++ }; ++ ++ if (V4L2_TYPE_IS_OUTPUT(type)) { ++ struct v4l2_create_buffers buffers = { ++ .count = 0, ++ .memory = V4L2_MEMORY_MMAP, ++ .format.type = type, ++ }; ++ ++ if (ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); ++ return -1; ++ } ++ ++ if ((buffers.capabilities & V4L2_BUF_CAP_SUPPORTS_REQUESTS) != V4L2_BUF_CAP_SUPPORTS_REQUESTS) { ++ av_log(avctx, AV_LOG_INFO, "%s: output buffer type does not support requests, capabilities %u\n", __func__, buffers.capabilities); ++ return -1; ++ } ++ } ++ ++ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { ++ if (fmtdesc.pixelformat == pixelformat) ++ return 0; ++ ++ fmtdesc.index++; ++ } ++ ++ av_log(avctx, AV_LOG_INFO, "%s: pixelformat %u not supported for type %u\n", __func__, pixelformat, type); ++ return -1; ++} ++ ++static int v4l2_request_set_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat, uint32_t buffersize) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_format format = { ++ .type = type, ++ }; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { ++ format.fmt.pix_mp.width = avctx->coded_width; ++ format.fmt.pix_mp.height = avctx->coded_height; ++ format.fmt.pix_mp.pixelformat = pixelformat; ++ format.fmt.pix_mp.plane_fmt[0].sizeimage = buffersize; ++ format.fmt.pix_mp.num_planes = 1; ++ } else { ++ format.fmt.pix.width = avctx->coded_width; ++ format.fmt.pix.height = avctx->coded_height; ++ format.fmt.pix.pixelformat = pixelformat; ++ format.fmt.pix.sizeimage = buffersize; ++ } ++ ++ return ioctl(ctx->video_fd, VIDIOC_S_FMT, &format); ++} ++ ++static int v4l2_request_select_capture_format(AVCodecContext *avctx) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ enum v4l2_buf_type type = ctx->format.type; ++ ++#if 0 ++ struct v4l2_format format = { ++ .type = type, ++ }; ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = 0, ++ .type = type, ++ }; ++ uint32_t pixelformat; ++ int i; ++ ++ if (ioctl(ctx->video_fd, VIDIOC_G_FMT, &format) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return -1; ++ } ++ ++ pixelformat = V4L2_TYPE_IS_MULTIPLANAR(type) ?
format.fmt.pix_mp.pixelformat : format.fmt.pix.pixelformat; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ if (pixelformat == v4l2_request_capture_pixelformats[i]) ++ return v4l2_request_set_format(avctx, type, pixelformat, 0); ++ } ++ ++ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { ++ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ if (fmtdesc.pixelformat == v4l2_request_capture_pixelformats[i]) ++ return v4l2_request_set_format(avctx, type, fmtdesc.pixelformat, 0); ++ } ++ ++ fmtdesc.index++; ++ } ++#else ++ for (int i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ uint32_t pixelformat = v4l2_request_capture_pixelformats[i]; ++ if (!v4l2_request_try_format(avctx, type, pixelformat)) ++ return v4l2_request_set_format(avctx, type, pixelformat, 0); ++ } ++#endif ++ ++ return -1; ++} ++ ++static int v4l2_request_probe_video_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret = AVERROR(EINVAL); ++ struct v4l2_capability capability = {0}; ++ unsigned int capabilities = 0; ++ ++ const char *path = udev_device_get_devnode(device); ++ if (!path) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get video device devnode failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++// ctx->video_fd = open(path, O_RDWR | O_NONBLOCK, 0); ++ ctx->video_fd = open(path, O_RDWR, 0); ++ if (ctx->video_fd < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCAP, &capability); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get video capability failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if (capability.capabilities & V4L2_CAP_DEVICE_CAPS) ++ capabilities = capability.device_caps; ++ else ++ capabilities = capability.capabilities; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s capabilities=%u\n", __func__, avctx, ctx, path, capabilities); ++ ++ if ((capabilities & V4L2_CAP_STREAMING) != V4L2_CAP_STREAMING) { ++ av_log(avctx, AV_LOG_ERROR, "%s: missing required streaming capability\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) == V4L2_CAP_VIDEO_M2M_MPLANE) { ++ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ } else if ((capabilities & V4L2_CAP_VIDEO_M2M) == V4L2_CAP_VIDEO_M2M) { ++ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ } else { ++ av_log(avctx, AV_LOG_ERROR, "%s: missing required mem2mem capability\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_try_format(avctx, ctx->output_type, pixelformat); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_WARNING, "%s: try output format failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_set_controls(ctx, -1, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_set_format(avctx, ctx->output_type, pixelformat, buffersize); ++ if (ret < 0) { ++ 
av_log(avctx, AV_LOG_ERROR, "%s: set output format failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_select_capture_format(avctx); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_WARNING, "%s: select capture format failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ if (ctx->video_fd >= 0) { ++ close(ctx->video_fd); ++ ctx->video_fd = -1; ++ } ++ return ret; ++} ++ ++static int v4l2_request_init_context(AVCodecContext *avctx) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &ctx->format); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { ++ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, ctx->format.fmt.pix_mp.pixelformat, ctx->format.fmt.pix_mp.width, ctx->format.fmt.pix_mp.height, ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline, ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage, ctx->format.fmt.pix_mp.num_planes); ++ } else { ++ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, ctx->format.fmt.pix.pixelformat, ctx->format.fmt.pix.width, ctx->format.fmt.pix.height, ctx->format.fmt.pix.bytesperline, ctx->format.fmt.pix.sizeimage); ++ } ++ ++ ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM); ++ if (ret < 0) ++ goto fail; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->output_type); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: output stream on failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->format.type); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: capture stream on failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ ff_v4l2_request_uninit(avctx); ++ return ret; ++} ++ ++static int v4l2_request_probe_media_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ struct media_device_info device_info = {0}; ++ struct media_v2_topology topology = {0}; ++ struct media_v2_interface *interfaces = NULL; ++ struct udev *udev = udev_device_get_udev(device); ++ struct udev_device *video_device; ++ dev_t devnum; ++ ++ const char *path = udev_device_get_devnode(device); ++ if (!path) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media device devnode failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ctx->media_fd = open(path, O_RDWR, 0); ++ if (ctx->media_fd < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media device info failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s driver=%s\n", __func__, avctx, ctx, path, device_info.driver); ++ ++ ret = 
ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if (topology.num_interfaces <= 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: media device has no interfaces\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ interfaces = av_mallocz(topology.num_interfaces * sizeof(struct media_v2_interface)); ++ if (!interfaces) { ++ av_log(avctx, AV_LOG_ERROR, "%s: allocating media interface struct failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = AVERROR(EINVAL); ++ for (int i = 0; i < topology.num_interfaces; i++) { ++ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) ++ continue; ++ ++ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); ++ video_device = udev_device_new_from_devnum(udev, 'c', devnum); ++ if (!video_device) { ++ av_log(avctx, AV_LOG_ERROR, "%s: video_device=%p\n", __func__, video_device); ++ continue; ++ } ++ ++ ret = v4l2_request_probe_video_device(video_device, avctx, pixelformat, buffersize, control, count); ++ udev_device_unref(video_device); ++ ++ if (!ret) ++ break; ++ } ++ ++ av_freep(&interfaces); ++ return ret; ++ ++fail: ++ av_freep(&interfaces); ++ if (ctx->media_fd >= 0) { ++ close(ctx->media_fd); ++ ctx->media_fd = -1; ++ } ++ return ret; ++} ++ ++int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret = AVERROR(EINVAL); ++ struct udev *udev; ++ struct udev_enumerate *enumerate; ++ struct udev_list_entry *devices; ++ struct udev_list_entry *entry; ++ struct udev_device *device; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p hw_device_ctx=%p hw_frames_ctx=%p\n", __func__, avctx, avctx->hw_device_ctx, avctx->hw_frames_ctx); ++ ++ ctx->media_fd = -1; ++ ctx->video_fd = -1; ++ ctx->timestamp = 0; ++ ++ udev = udev_new(); ++ if (!udev) { ++ av_log(avctx, AV_LOG_ERROR, "%s: allocating udev context failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ enumerate = udev_enumerate_new(udev); ++ if (!enumerate) { ++ av_log(avctx, AV_LOG_ERROR, "%s: allocating udev enumerator failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ udev_enumerate_add_match_subsystem(enumerate, "media"); ++ udev_enumerate_scan_devices(enumerate); ++ ++ devices = udev_enumerate_get_list_entry(enumerate); ++ udev_list_entry_foreach(entry, devices) { ++ const char *path = udev_list_entry_get_name(entry); ++ if (!path) ++ continue; ++ ++ device = udev_device_new_from_syspath(udev, path); ++ if (!device) ++ continue; ++ ++ ret = v4l2_request_probe_media_device(device, avctx, pixelformat, buffersize, control, count); ++ udev_device_unref(device); ++ ++ if (!ret) ++ break; ++ } ++ ++ udev_enumerate_unref(enumerate); ++ ++ if (!ret) ++ ret = v4l2_request_init_context(avctx); ++ ++fail: ++ udev_unref(udev); ++ return ret; ++} ++ ++int ff_v4l2_request_uninit(AVCodecContext *avctx) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ av_log(avctx, 
AV_LOG_DEBUG, "%s: avctx=%p ctx=%p\n", __func__, avctx, ctx); ++ ++ if (ctx->video_fd >= 0) { ++ ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->output_type); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: output stream off failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->format.type); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: capture stream off failed, %s (%d)\n", __func__, strerror(errno), errno); ++ } ++ ++ if (avctx->hw_frames_ctx) { ++ AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data; ++ av_buffer_pool_flush(hwfc->pool); ++ } ++ ++ if (ctx->video_fd >= 0) ++ close(ctx->video_fd); ++ ++ if (ctx->media_fd >= 0) ++ close(ctx->media_fd); ++ ++ return 0; ++} ++ ++static int v4l2_request_buffer_alloc(AVCodecContext *avctx, V4L2RequestBuffer *buf, enum v4l2_buf_type type) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ struct v4l2_plane planes[1] = {}; ++ struct v4l2_create_buffers buffers = { ++ .count = 1, ++ .memory = V4L2_MEMORY_MMAP, ++ .format.type = type, ++ }; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p buf=%p type=%u\n", __func__, avctx, buf, type); ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &buffers.format); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get format failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); ++ return ret; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buffers.format.type)) { ++ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, buffers.format.fmt.pix_mp.pixelformat, buffers.format.fmt.pix_mp.width, buffers.format.fmt.pix_mp.height, buffers.format.fmt.pix_mp.plane_fmt[0].bytesperline, buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage, buffers.format.fmt.pix_mp.num_planes); ++ } else { ++ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, buffers.format.fmt.pix.pixelformat, buffers.format.fmt.pix.width, buffers.format.fmt.pix.height, buffers.format.fmt.pix.bytesperline, buffers.format.fmt.pix.sizeimage); ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); ++ return ret; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { ++ buf->width = buffers.format.fmt.pix_mp.width; ++ buf->height = buffers.format.fmt.pix_mp.height; ++ buf->size = buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage; ++ buf->buffer.length = 1; ++ buf->buffer.m.planes = planes; ++ } else { ++ buf->width = buffers.format.fmt.pix.width; ++ buf->height = buffers.format.fmt.pix.height; ++ buf->size = buffers.format.fmt.pix.sizeimage; ++ } ++ ++ buf->index = buffers.index; ++ buf->capabilities = buffers.capabilities; ++ buf->used = 0; ++ ++ buf->buffer.type = type; ++ buf->buffer.memory = V4L2_MEMORY_MMAP; ++ buf->buffer.index = buf->index; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERYBUF, &buf->buffer); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: query buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno); ++ return ret; ++ } ++ ++ if (V4L2_TYPE_IS_OUTPUT(type)) { ++ void *addr = mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->video_fd, V4L2_TYPE_IS_MULTIPLANAR(type) ? 
buf->buffer.m.planes[0].m.mem_offset : buf->buffer.m.offset); ++ if (addr == MAP_FAILED) { ++ av_log(avctx, AV_LOG_ERROR, "%s: mmap failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return -1; ++ } ++ ++ buf->addr = (uint8_t*)addr; ++ } else { ++ struct v4l2_exportbuffer exportbuffer = { ++ .type = type, ++ .index = buf->index, ++ .flags = O_RDONLY, ++ }; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_EXPBUF, &exportbuffer); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: export buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno); ++ return ret; ++ } ++ ++ buf->fd = exportbuffer.fd; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); ++ return 0; ++} ++ ++static void v4l2_request_buffer_free(V4L2RequestBuffer *buf) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); ++ ++ if (buf->addr) ++ munmap(buf->addr, buf->size); ++ ++ if (buf->fd >= 0) ++ close(buf->fd); ++} ++ ++static void v4l2_request_frame_free(void *opaque, uint8_t *data) ++{ ++ AVCodecContext *avctx = opaque; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)data; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p request_fd=%d\n", __func__, avctx, data, req->request_fd); ++ ++ if (req->request_fd >= 0) ++ close(req->request_fd); ++ ++ v4l2_request_buffer_free(&req->capture); ++ v4l2_request_buffer_free(&req->output); ++ ++ av_free(data); ++} ++ ++static AVBufferRef *v4l2_request_frame_alloc(void *opaque, int size) ++{ ++ AVCodecContext *avctx = opaque; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req; ++ AVBufferRef *ref; ++ uint8_t *data; ++ int ret; ++ ++ data = av_mallocz(size); ++ if (!data) ++ return NULL; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data); ++ ++ ref = av_buffer_create(data, size, v4l2_request_frame_free, avctx, 0); ++ if (!ref) { ++ av_freep(&data); ++ return NULL; ++ } ++ ++ req = (V4L2RequestDescriptor*)data; ++ req->request_fd = -1; ++ req->output.fd = -1; ++ req->capture.fd = -1; ++ ++ ret = v4l2_request_buffer_alloc(avctx, &req->output, ctx->output_type); ++ if (ret < 0) { ++ av_buffer_unref(&ref); ++ return NULL; ++ } ++ ++ ret = v4l2_request_buffer_alloc(avctx, &req->capture, ctx->format.type); ++ if (ret < 0) { ++ av_buffer_unref(&ref); ++ return NULL; ++ } ++ ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_REQUEST_ALLOC, &req->request_fd); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: request alloc failed, %s (%d)\n", __func__, strerror(errno), errno); ++ av_buffer_unref(&ref); ++ return NULL; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p request_fd=%d\n", __func__, avctx, size, data, req->request_fd); ++ return ref; ++} ++ ++static void v4l2_request_pool_free(void *opaque) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque); ++} ++ ++static void v4l2_request_hwframe_ctx_free(AVHWFramesContext *hwfc) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool); ++ ++ av_buffer_pool_flush(hwfc->pool); ++ av_buffer_pool_uninit(&hwfc->pool); ++} ++ ++int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ AVHWFramesContext *hwfc = 
(AVHWFramesContext*)hw_frames_ctx->data; ++ ++ hwfc->format = AV_PIX_FMT_DRM_PRIME; ++ hwfc->sw_format = AV_PIX_FMT_NV12; ++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) { ++ hwfc->width = ctx->format.fmt.pix_mp.width; ++ hwfc->height = ctx->format.fmt.pix_mp.height; ++ } else { ++ hwfc->width = ctx->format.fmt.pix.width; ++ hwfc->height = ctx->format.fmt.pix.height; ++#if CONFIG_SAND ++ if (ctx->format.fmt.pix.pixelformat == V4L2_PIX_FMT_NV12_COL128) { ++ hwfc->sw_format = AV_PIX_FMT_RPI4_8; ++ } ++ else if (ctx->format.fmt.pix.pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { ++ hwfc->sw_format = AV_PIX_FMT_RPI4_10; ++ } ++#endif ++ } ++ ++ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2RequestDescriptor), avctx, v4l2_request_frame_alloc, v4l2_request_pool_free); ++ if (!hwfc->pool) ++ return AVERROR(ENOMEM); ++ ++ hwfc->free = v4l2_request_hwframe_ctx_free; ++ ++ hwfc->initial_pool_size = 1; ++ ++ switch (avctx->codec_id) { ++ case AV_CODEC_ID_VP9: ++ hwfc->initial_pool_size += 8; ++ break; ++ case AV_CODEC_ID_VP8: ++ hwfc->initial_pool_size += 3; ++ break; ++ default: ++ hwfc->initial_pool_size += 2; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); ++ ++ return 0; ++} +diff --git a/libavcodec/v4l2_request.h b/libavcodec/v4l2_request.h +new file mode 100644 +index 0000000000..20b56cfbfb +--- /dev/null ++++ b/libavcodec/v4l2_request.h +@@ -0,0 +1,96 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_V4L2_REQUEST_H
++#define AVCODEC_V4L2_REQUEST_H
++
++#include <linux/videodev2.h>
++
++#include "libavutil/hwcontext_drm.h"
++#include "v4l2_phase.h"
++
++typedef struct V4L2RequestContext {
++    int video_fd;
++    int media_fd;
++    enum v4l2_buf_type output_type;
++    struct v4l2_format format;
++    int timestamp;
++} V4L2RequestContext;
++
++typedef struct V4L2RequestBuffer {
++    int index;
++    int fd;
++    uint8_t *addr;
++    uint32_t width;
++    uint32_t height;
++    uint32_t size;
++    uint32_t used;
++    uint32_t capabilities;
++    struct v4l2_buffer buffer;
++} V4L2RequestBuffer;
++
++struct V4L2PhaseControl;
++
++typedef struct V4L2PhaseEnv {
++    struct V4L2PhaseEnv * next;
++    struct V4L2PhaseControl * ctrl;
++    unsigned int order;
++} V4L2PhaseEnv;
++
++typedef struct V4L2RequestDescriptor {
++    AVDRMFrameDescriptor drm;
++    int request_fd;
++    V4L2RequestBuffer output;
++    V4L2RequestBuffer capture;
++
++    // Phase control
++    V4L2PhaseInfo phase;
++} V4L2RequestDescriptor;
++
++uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame);
++
++// Sets phase control on this frame & gives it an order
++int ff_v4l2_request_start_phase_control(AVFrame *frame, struct V4L2PhaseControl * phase);
++
++// Had error - release all phases
++void ff_v4l2_request_abort_phase_control(AVFrame *frame);
++
++
++int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame);
++
++int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size);
++
++int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count);
++
++int ff_v4l2_request_get_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count);
++
++int ff_v4l2_request_query_control(AVCodecContext *avctx, struct v4l2_query_ext_ctrl *control);
++
++int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id);
++
++int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice);
++
++int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count);
++
++int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count);
++
++int ff_v4l2_request_uninit(AVCodecContext *avctx);
++
++int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++
++#endif /* AVCODEC_V4L2_REQUEST_H */
+diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c
+new file mode 100644
+index 0000000000..5b0f21a60d
+--- /dev/null
++++ b/libavcodec/v4l2_request_h264.c
+@@ -0,0 +1,481 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "h264dec.h"
++#include "hwconfig.h"
++#include "v4l2_request.h"
++#include "h264-ctrls.h"
++
++typedef struct V4L2RequestControlsH264 {
++    struct v4l2_ctrl_h264_sps sps;
++    struct v4l2_ctrl_h264_pps pps;
++    struct v4l2_ctrl_h264_scaling_matrix scaling_matrix;
++    struct v4l2_ctrl_h264_decode_params decode_params;
++    struct v4l2_ctrl_h264_slice_params slice_params[MAX_SLICES];
++    int first_slice;
++} V4L2RequestControlsH264;
++
++typedef struct V4L2RequestContextH264 {
++    V4L2RequestContext base;
++    int decode_mode;
++    int start_code;
++    int max_slices;
++} V4L2RequestContextH264;
++
++static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
++
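++/*
++ * Helper: convert FFmpeg's per-slice prediction weight table into the
++ * v4l2_h264_weight_factors layout. Entries without an explicit weight fall
++ * back to the H.264 defaults of (1 << log2_weight_denom) for the weight and
++ * 0 for the offset, so the driver always sees a fully populated table.
++ */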
++static void fill_weight_factors(struct v4l2_h264_weight_factors *factors, int list, const H264SliceContext *sl)
++{
++    for (int i = 0; i < sl->ref_count[list]; i++) {
++        if (sl->pwt.luma_weight_flag[list]) {
++            factors->luma_weight[i] = sl->pwt.luma_weight[i][list][0];
++            factors->luma_offset[i] = sl->pwt.luma_weight[i][list][1];
++        } else {
++            factors->luma_weight[i] = 1 << sl->pwt.luma_log2_weight_denom;
++            factors->luma_offset[i] = 0;
++        }
++        for (int j = 0; j < 2; j++) {
++            if (sl->pwt.chroma_weight_flag[list]) {
++                factors->chroma_weight[i][j] = sl->pwt.chroma_weight[i][list][j][0];
++                factors->chroma_offset[i][j] = sl->pwt.chroma_weight[i][list][j][1];
++            } else {
++                factors->chroma_weight[i][j] = 1 << sl->pwt.chroma_log2_weight_denom;
++                factors->chroma_offset[i][j] = 0;
++            }
++        }
++    }
++}
++
++static void fill_dpb_entry(struct v4l2_h264_dpb_entry *entry, const H264Picture *pic)
++{
++    entry->reference_ts = ff_v4l2_request_get_capture_timestamp(pic->f);
++    entry->frame_num = pic->frame_num;
++    entry->pic_num = pic->pic_id;
++    entry->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID;
++    if (pic->reference) {
++        entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_ACTIVE;
++        if (pic->reference != PICT_FRAME) {
++            entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_FIELD;
++            if (pic->reference == PICT_BOTTOM_FIELD)
++                entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_BOTTOM_FIELD;
++        }
++    }
++    if (pic->long_ref)
++        entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM;
++    if (pic->field_poc[0] != INT_MAX)
++        entry->top_field_order_cnt = pic->field_poc[0];
++    if (pic->field_poc[1] != INT_MAX)
++        entry->bottom_field_order_cnt = pic->field_poc[1];
++}
++
++static void fill_dpb(struct v4l2_ctrl_h264_decode_params *decode, const H264Context *h)
++{
++    int entries = 0;
++
++    for (int i = 0; i < h->short_ref_count; i++) {
++        const H264Picture *pic = h->short_ref[i];
++        if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX))
++            fill_dpb_entry(&decode->dpb[entries++], pic);
++    }
++
++    if (!h->long_ref_count)
++        return;
++
++    for (int i = 0; i < FF_ARRAY_ELEMS(h->long_ref); i++) {
++        const H264Picture *pic = h->long_ref[i];
++        if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX))
++            fill_dpb_entry(&decode->dpb[entries++], pic);
++    }
++}
++
++static uint8_t get_dpb_index(struct v4l2_ctrl_h264_decode_params *decode, const H264Ref *ref)
++{
++    uint64_t timestamp;
++
++    if (!ref->parent)
++        return 0;
++
++    timestamp = ff_v4l2_request_get_capture_timestamp(ref->parent->f);
++
++    for (uint8_t i = 0; i < FF_ARRAY_ELEMS(decode->dpb); i++) {
++        struct v4l2_h264_dpb_entry *entry = &decode->dpb[i];
++        if ((entry->flags &
V4L2_H264_DPB_ENTRY_FLAG_VALID) && ++ entry->reference_ts == timestamp) ++ // TODO: signal reference type, possible using top 2 bits ++ return i | ((ref->reference & 3) << 6); ++ } ++ ++ return 0; ++} ++ ++static void fill_sps(struct v4l2_ctrl_h264_sps *ctrl, const H264Context *h) ++{ ++ const SPS *sps = h->ps.sps; ++ ++ *ctrl = (struct v4l2_ctrl_h264_sps) { ++ .profile_idc = sps->profile_idc, ++ .constraint_set_flags = sps->constraint_set_flags, ++ .level_idc = sps->level_idc, ++ .seq_parameter_set_id = sps->sps_id, ++ .chroma_format_idc = sps->chroma_format_idc, ++ .bit_depth_luma_minus8 = sps->bit_depth_luma - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth_chroma - 8, ++ .log2_max_frame_num_minus4 = sps->log2_max_frame_num - 4, ++ .pic_order_cnt_type = sps->poc_type, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .max_num_ref_frames = sps->ref_frame_count, ++ .num_ref_frames_in_pic_order_cnt_cycle = sps->poc_cycle_length, ++ //.offset_for_ref_frame[255] - not required? not set by libva-v4l2-request - copy sps->offset_for_ref_frame ++ .offset_for_non_ref_pic = sps->offset_for_non_ref_pic, ++ .offset_for_top_to_bottom_field = sps->offset_for_top_to_bottom_field, ++ .pic_width_in_mbs_minus1 = h->mb_width - 1, ++ .pic_height_in_map_units_minus1 = sps->frame_mbs_only_flag ? h->mb_height - 1 : h->mb_height / 2 - 1, ++ }; ++ ++ if (sps->residual_color_transform_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ if (sps->transform_bypass) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS; ++ if (sps->delta_pic_order_always_zero_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO; ++ if (sps->gaps_in_frame_num_allowed_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED; ++ if (sps->frame_mbs_only_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY; ++ if (sps->mb_aff) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD; ++ if (sps->direct_8x8_inference_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE; ++} ++ ++static void fill_pps(struct v4l2_ctrl_h264_pps *ctrl, const H264Context *h) ++{ ++ const SPS *sps = h->ps.sps; ++ const PPS *pps = h->ps.pps; ++ const H264SliceContext *sl = &h->slice_ctx[0]; ++ int qp_bd_offset = 6 * (sps->bit_depth_luma - 8); ++ ++ *ctrl = (struct v4l2_ctrl_h264_pps) { ++ .pic_parameter_set_id = sl->pps_id, ++ .seq_parameter_set_id = pps->sps_id, ++ .num_slice_groups_minus1 = pps->slice_group_count - 1, ++ .num_ref_idx_l0_default_active_minus1 = pps->ref_count[0] - 1, ++ .num_ref_idx_l1_default_active_minus1 = pps->ref_count[1] - 1, ++ .weighted_bipred_idc = pps->weighted_bipred_idc, ++ .pic_init_qp_minus26 = pps->init_qp - 26 - qp_bd_offset, ++ .pic_init_qs_minus26 = pps->init_qs - 26 - qp_bd_offset, ++ .chroma_qp_index_offset = pps->chroma_qp_index_offset[0], ++ .second_chroma_qp_index_offset = pps->chroma_qp_index_offset[1], ++ }; ++ ++ if (pps->cabac) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE; ++ if (pps->pic_order_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT; ++ if (pps->weighted_pred) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_WEIGHTED_PRED; ++ if (pps->deblocking_filter_parameters_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT; ++ if (pps->constrained_intra_pred) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ if (pps->redundant_pic_cnt_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT; ++ if 
(pps->transform_8x8_mode) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE; ++} ++ ++static int v4l2_request_h264_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const H264Context *h = avctx->priv_data; ++ const PPS *pps = h->ps.pps; ++ const SPS *sps = h->ps.sps; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ ++ fill_sps(&controls->sps, h); ++ fill_pps(&controls->pps, h); ++ ++ memcpy(controls->scaling_matrix.scaling_list_4x4, pps->scaling_matrix4, sizeof(controls->scaling_matrix.scaling_list_4x4)); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[0], pps->scaling_matrix8[0], sizeof(controls->scaling_matrix.scaling_list_8x8[0])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[1], pps->scaling_matrix8[3], sizeof(controls->scaling_matrix.scaling_list_8x8[1])); ++ ++ if (sps->chroma_format_idc == 3) { ++ memcpy(controls->scaling_matrix.scaling_list_8x8[2], pps->scaling_matrix8[1], sizeof(controls->scaling_matrix.scaling_list_8x8[2])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[3], pps->scaling_matrix8[4], sizeof(controls->scaling_matrix.scaling_list_8x8[3])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[4], pps->scaling_matrix8[2], sizeof(controls->scaling_matrix.scaling_list_8x8[4])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[5], pps->scaling_matrix8[5], sizeof(controls->scaling_matrix.scaling_list_8x8[5])); ++ } ++ ++ controls->decode_params = (struct v4l2_ctrl_h264_decode_params) { ++ .num_slices = 0, ++ .nal_ref_idc = h->nal_ref_idc, ++ .top_field_order_cnt = h->cur_pic_ptr->field_poc[0] != INT_MAX ? h->cur_pic_ptr->field_poc[0] : 0, ++ .bottom_field_order_cnt = h->cur_pic_ptr->field_poc[1] != INT_MAX ? h->cur_pic_ptr->field_poc[1] : 0, ++ }; ++ ++ if (h->picture_idr) ++ controls->decode_params.flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC; ++ ++ fill_dpb(&controls->decode_params, h); ++ ++ controls->first_slice = !FIELD_PICTURE(h) || h->first_field; ++ ++ return ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); ++} ++ ++static int v4l2_request_h264_queue_decode(AVCodecContext *avctx, int last_slice) ++{ ++ const H264Context *h = avctx->priv_data; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SPS, ++ .ptr = &controls->sps, ++ .size = sizeof(controls->sps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_PPS, ++ .ptr = &controls->pps, ++ .size = sizeof(controls->pps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params[0]) * FFMAX(FFMIN(controls->decode_params.num_slices, MAX_SLICES), ctx->max_slices), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS, ++ .ptr = &controls->decode_params, ++ .size = sizeof(controls->decode_params), ++ }, ++ }; ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED) ++ return ff_v4l2_request_decode_slice(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); ++ ++ return ff_v4l2_request_decode_frame(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t 
*buffer, uint32_t size) ++{ ++ const H264Context *h = avctx->priv_data; ++ const PPS *pps = h->ps.pps; ++ const H264SliceContext *sl = &h->slice_ctx[0]; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->cur_pic_ptr->f->data[0]; ++ int i, ret, count, slice = FFMIN(controls->decode_params.num_slices, MAX_SLICES - 1); ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && slice) { ++ ret = v4l2_request_h264_queue_decode(avctx, 0); ++ if (ret) ++ return ret; ++ ++ ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); ++ slice = controls->decode_params.num_slices = 0; ++ controls->first_slice = 0; ++ } ++ ++ controls->slice_params[slice] = (struct v4l2_ctrl_h264_slice_params) { ++ /* Size in bytes, including header */ ++ .size = 0, ++ .start_byte_offset = req->output.used, ++ /* Offset in bits to slice_data() from the beginning of this slice. */ ++ .header_bit_size = get_bits_count(&sl->gb), ++ ++ .first_mb_in_slice = sl->first_mb_addr, ++ .slice_type = ff_h264_get_slice_type(sl), ++ .pic_parameter_set_id = sl->pps_id, ++ .colour_plane_id = 0, /* what is this? */ ++ .frame_num = h->poc.frame_num, ++ .idr_pic_id = sl->idr_pic_id, ++ .pic_order_cnt_lsb = sl->poc_lsb, ++ .delta_pic_order_cnt_bottom = sl->delta_poc_bottom, ++ .delta_pic_order_cnt0 = sl->delta_poc[0], ++ .delta_pic_order_cnt1 = sl->delta_poc[1], ++ .redundant_pic_cnt = sl->redundant_pic_count, ++ ++ /* Size in bits of dec_ref_pic_marking() syntax element. */ ++ .dec_ref_pic_marking_bit_size = sl->ref_pic_marking_size_in_bits, ++ /* Size in bits of pic order count syntax. */ ++ .pic_order_cnt_bit_size = sl->pic_order_cnt_bit_size, ++ ++ .cabac_init_idc = sl->cabac_init_idc, ++ .slice_qp_delta = sl->qscale - pps->init_qp, ++ .slice_qs_delta = 0, /* XXX not implemented by FFmpeg */ ++ .disable_deblocking_filter_idc = sl->deblocking_filter < 2 ? !sl->deblocking_filter : sl->deblocking_filter, ++ .slice_alpha_c0_offset_div2 = sl->slice_alpha_c0_offset / 2, ++ .slice_beta_offset_div2 = sl->slice_beta_offset / 2, ++ .slice_group_change_cycle = 0, /* what is this? */ ++ ++ .num_ref_idx_l0_active_minus1 = sl->list_count > 0 ? sl->ref_count[0] - 1 : 0, ++ .num_ref_idx_l1_active_minus1 = sl->list_count > 1 ? sl->ref_count[1] - 1 : 0, ++ }; ++ ++ if (FIELD_PICTURE(h)) ++ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_FIELD_PIC; ++ if (h->picture_structure == PICT_BOTTOM_FIELD) ++ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_BOTTOM_FIELD; ++ if (sl->slice_type == AV_PICTURE_TYPE_B && sl->direct_spatial_mv_pred) ++ controls->slice_params[slice].flags |= V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED; ++ ++ controls->slice_params[slice].pred_weight_table.chroma_log2_weight_denom = sl->pwt.chroma_log2_weight_denom; ++ controls->slice_params[slice].pred_weight_table.luma_log2_weight_denom = sl->pwt.luma_log2_weight_denom; ++ ++ count = sl->list_count > 0 ? sl->ref_count[0] : 0; ++ for (i = 0; i < count; i++) ++ controls->slice_params[slice].ref_pic_list0[i] = get_dpb_index(&controls->decode_params, &sl->ref_list[0][i]); ++ if (count) ++ fill_weight_factors(&controls->slice_params[slice].pred_weight_table.weight_factors[0], 0, sl); ++ ++ count = sl->list_count > 1 ? 
sl->ref_count[1] : 0;
++    for (i = 0; i < count; i++)
++        controls->slice_params[slice].ref_pic_list1[i] = get_dpb_index(&controls->decode_params, &sl->ref_list[1][i]);
++    if (count)
++        fill_weight_factors(&controls->slice_params[slice].pred_weight_table.weight_factors[1], 1, sl);
++
++    if (ctx->start_code == V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) {
++        ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, nalu_slice_start_code, 3);
++        if (ret)
++            return ret;
++    }
++
++    ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, buffer, size);
++    if (ret)
++        return ret;
++
++    controls->slice_params[slice].size = req->output.used - controls->slice_params[slice].start_byte_offset;
++    controls->decode_params.num_slices++;
++    return 0;
++}
++
++static int v4l2_request_h264_end_frame(AVCodecContext *avctx)
++{
++    const H264Context *h = avctx->priv_data;
++    return v4l2_request_h264_queue_decode(avctx, !FIELD_PICTURE(h) || !h->first_field);
++}
++
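++/*
++ * Query the driver defaults before streaming starts: the decode mode
++ * determines whether one request is queued per slice (SLICE_BASED) or per
++ * frame (FRAME_BASED), the start code control indicates whether Annex B
++ * start codes must be prepended to each slice, and the element count of
++ * the slice params control bounds the per-request slice array.
++ */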
++static int v4l2_request_h264_set_controls(AVCodecContext *avctx)
++{
++    V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data;
++    int ret;
++
++    struct v4l2_ext_control control[] = {
++        { .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE, },
++        { .id = V4L2_CID_MPEG_VIDEO_H264_START_CODE, },
++    };
++    struct v4l2_query_ext_ctrl slice_params = {
++        .id = V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS,
++    };
++
++    ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE);
++    if (ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED &&
++        ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED) {
++        av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
++        return AVERROR(EINVAL);
++    }
++
++    ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_START_CODE);
++    if (ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_NONE &&
++        ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) {
++        av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
++        return AVERROR(EINVAL);
++    }
++
++    ret = ff_v4l2_request_query_control(avctx, &slice_params);
++    if (ret)
++        return ret;
++
++    ctx->max_slices = slice_params.elems;
++    if (ctx->max_slices > MAX_SLICES) {
++        av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices);
++        return AVERROR(EINVAL);
++    }
++
++    control[0].value = ctx->decode_mode;
++    control[1].value = ctx->start_code;
++
++    return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control));
++}
++
++static int v4l2_request_h264_init(AVCodecContext *avctx)
++{
++    const H264Context *h = avctx->priv_data;
++    struct v4l2_ctrl_h264_sps sps;
++    struct v4l2_ctrl_h264_pps pps;
++    int ret;
++
++    struct v4l2_ext_control control[] = {
++        {
++            .id = V4L2_CID_MPEG_VIDEO_H264_SPS,
++            .ptr = &sps,
++            .size = sizeof(sps),
++        },
++        {
++            .id = V4L2_CID_MPEG_VIDEO_H264_PPS,
++            .ptr = &pps,
++            .size = sizeof(pps),
++        },
++    };
++
++    fill_sps(&sps, h);
++    fill_pps(&pps, h);
++
++    ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_H264_SLICE, 4 * 1024 * 1024, control, FF_ARRAY_ELEMS(control));
++    if (ret)
++        return ret;
++
++    return v4l2_request_h264_set_controls(avctx);
++}
++
++const AVHWAccel ff_h264_v4l2request_hwaccel = {
++    .name = "h264_v4l2request",
++    .type = AVMEDIA_TYPE_VIDEO,
++    .id = AV_CODEC_ID_H264,
++    .pix_fmt = AV_PIX_FMT_DRM_PRIME,
++    .start_frame = v4l2_request_h264_start_frame,
++    .decode_slice = v4l2_request_h264_decode_slice,
++    .end_frame = v4l2_request_h264_end_frame,
++    .frame_priv_data_size = sizeof(V4L2RequestControlsH264),
++    .init = v4l2_request_h264_init,
++    .uninit = ff_v4l2_request_uninit,
++    .priv_data_size = sizeof(V4L2RequestContextH264),
++    .frame_params = ff_v4l2_request_frame_params,
++    .caps_internal = HWACCEL_CAP_ASYNC_SAFE,
++};
+diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
+new file mode 100644
+index 0000000000..3f813b8520
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.c
+@@ -0,0 +1,612 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "hevcdec.h"
++#include "hwconfig.h"
++#include "v4l2_request.h"
++#include "hevc-ctrls.h"
++#include "v4l2_phase.h"
++
++#define MAX_SLICES 16
++
++typedef struct V4L2RequestControlsHEVC {
++    struct v4l2_ctrl_hevc_sps sps;
++    struct v4l2_ctrl_hevc_pps pps;
++    struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;
++    struct v4l2_ctrl_hevc_slice_params slice_params[MAX_SLICES];
++    int first_slice;
++    int num_slices; //TODO: this should be in control
++} V4L2RequestControlsHEVC;
++
++typedef struct V4L2RequestContextHEVC {
++    V4L2RequestContext base;
++    int decode_mode;
++    int start_code;
++    int max_slices;
++
++    unsigned int order;
++    V4L2PhaseControl * pctrl;
++} V4L2RequestContextHEVC;
++
++static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
++
++static void v4l2_request_hevc_fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
++{
++    int32_t luma_weight_denom, chroma_weight_denom;
++    const SliceHeader *sh = &h->sh;
++
++    if (sh->slice_type == HEVC_SLICE_I ||
++        (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
++        (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
++        return;
++
++    table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
++
++    if (h->ps.sps->chroma_format_idc)
++        table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
++
++    luma_weight_denom = (1 << sh->luma_log2_weight_denom);
++    chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
++
++    for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
++        table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
++        table->luma_offset_l0[i] = sh->luma_offset_l0[i];
++        table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
++        table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
++        table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
++        table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
++    }
++
++    if (sh->slice_type != HEVC_SLICE_B)
++        return;
++
++    for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
++        table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] -
luma_weight_denom; ++ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; ++ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; ++ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; ++ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; ++ } ++} ++ ++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) ++{ ++ const HEVCFrame *frame; ++ int i; ++ ++ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { ++ frame = h->rps[ST_CURR_BEF].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; ++ } ++ ++ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { ++ frame = h->rps[ST_CURR_AFT].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; ++ } ++ ++ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { ++ frame = h->rps[LT_CURR].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; ++ } ++ ++ return 0; ++} ++ ++static uint8_t get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, ++ struct v4l2_ctrl_hevc_slice_params *slice_params) ++{ ++ uint64_t timestamp; ++ ++ if (!frame) ++ return 0; ++ ++ timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); ++ ++ for (uint8_t i = 0; i < slice_params->num_active_dpb_entries; i++) { ++ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[i]; ++ if (entry->timestamp == timestamp) ++ return i; ++ } ++ ++ return 0; ++} ++ ++static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h, ++ struct v4l2_ctrl_hevc_slice_params *slice_params) ++{ ++ const HEVCFrame *pic = h->ref; ++ const SliceHeader *sh = &h->sh; ++ int i, entries = 0; ++ RefPicList *rpl; ++ ++ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { ++ .bit_size = 0, ++ .data_bit_offset = get_bits_count(&h->HEVClc->gb), ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_segment_addr = sh->slice_segment_addr, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ .nal_unit_type = h->nal_unit_type, ++ .nuh_temporal_id_plus1 = h->temporal_id + 1, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_type = sh->slice_type, ++ .colour_plane_id = sh->colour_plane_id, ++ .slice_pic_order_cnt = pic->poc, ++ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, ++ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, ++ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, ++ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand, ++ .slice_qp_delta = sh->slice_qp_delta, ++ .slice_cb_qp_offset = sh->slice_cb_qp_offset, ++ .slice_cr_qp_offset = sh->slice_cr_qp_offset, ++ .slice_act_y_qp_offset = 0, ++ .slice_act_cb_qp_offset = 0, ++ .slice_act_cr_qp_offset = 0, ++ .slice_beta_offset_div2 = sh->beta_offset / 2, ++ .slice_tc_offset_div2 = sh->tc_offset / 2, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ .pic_struct = h->sei.picture_timing.picture_struct, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, ++ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, ++ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, ++ }; ++ ++ if (sh->slice_sample_adaptive_offset_flag[0]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; ++ ++ if (sh->slice_sample_adaptive_offset_flag[1]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; ++ ++ if (sh->slice_temporal_mvp_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; ++ ++ if (sh->mvd_l1_zero_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; ++ ++ if (sh->cabac_init_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; ++ ++ if (sh->collocated_list == L0) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; ++ ++ if (sh->disable_deblocking_filter_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; ++ ++ if (sh->slice_loop_filter_across_slices_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (sh->dependent_slice_segment_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { ++ const HEVCFrame *frame = &h->DPB[i]; ++ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { ++ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[entries++]; ++ ++ entry->timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); ++ entry->rps = find_frame_rps_type(h, entry->timestamp); ++ entry->field_pic = frame->frame->interlaced_frame; ++ ++ /* TODO: Interleaved: Get the POC for each field. */ ++ entry->pic_order_cnt[0] = frame->poc; ++ entry->pic_order_cnt[1] = frame->poc; ++ } ++ } ++ ++ slice_params->num_active_dpb_entries = entries; ++ ++ if (sh->slice_type != HEVC_SLICE_I) { ++ rpl = &h->ref->refPicList[0]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ rpl = &h->ref->refPicList[1]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); ++ } ++ ++ v4l2_request_hevc_fill_pred_table(h, &slice_params->pred_weight_table); ++ ++ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++ if (slice_params->num_entry_point_offsets > 256) { ++ slice_params->num_entry_point_offsets = 256; ++ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); ++ } ++ ++ for (i = 0; i < slice_params->num_entry_point_offsets; i++) ++ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; ++} ++ ++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCContext *h) ++{ ++ const HEVCSPS *sps = h->ps.sps; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ ++ *ctrl = (struct v4l2_ctrl_hevc_sps) { ++ .chroma_format_idc = sps->chroma_format_idc, ++ .pic_width_in_luma_samples = sps->width, ++ .pic_height_in_luma_samples = sps->height, ++ .bit_depth_luma_minus8 = sps->bit_depth - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth - 8, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, ++ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, ++ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, ++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, ++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, ++ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, ++ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, ++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, ++ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, ++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, ++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, ++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, ++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, ++ .num_short_term_ref_pic_sets = sps->nb_st_rps, ++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, ++ }; ++ ++ if (sps->separate_colour_plane_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ ++ if (sps->scaling_list_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; ++ ++ if (sps->amp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; ++ ++ if (sps->sao_enabled) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; ++ ++ if (sps->pcm_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; ++ ++ if (sps->pcm.loop_filter_disable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; ++ ++ if (sps->long_term_ref_pics_present_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; ++ ++ if (sps->sps_temporal_mvp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; ++ ++ if (sps->sps_strong_intra_smoothing_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; ++} ++ ++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ const HEVCSPS *sps = h->ps.sps; ++ const HEVCPPS *pps = h->ps.pps; ++ const ScalingList *sl = pps->scaling_list_data_present_flag ? ++ &pps->scaling_list : ++ sps->scaling_list_enable_flag ? 
++ &sps->scaling_list : NULL; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ int rv; ++ ++ fill_sps(&controls->sps, h); ++ ++ if (sl) { ++ for (int i = 0; i < 6; i++) { ++ for (int j = 0; j < 16; j++) ++ controls->scaling_matrix.scaling_list_4x4[i][j] = sl->sl[0][i][j]; ++ for (int j = 0; j < 64; j++) { ++ controls->scaling_matrix.scaling_list_8x8[i][j] = sl->sl[1][i][j]; ++ controls->scaling_matrix.scaling_list_16x16[i][j] = sl->sl[2][i][j]; ++ if (i < 2) ++ controls->scaling_matrix.scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; ++ } ++ controls->scaling_matrix.scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; ++ if (i < 2) ++ controls->scaling_matrix.scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; ++ } ++ } ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ controls->pps = (struct v4l2_ctrl_hevc_pps) { ++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, ++ .init_qp_minus26 = pps->pic_init_qp_minus26, ++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, ++ .pps_cb_qp_offset = pps->cb_qp_offset, ++ .pps_cr_qp_offset = pps->cr_qp_offset, ++ .pps_beta_offset_div2 = pps->beta_offset / 2, ++ .pps_tc_offset_div2 = pps->tc_offset / 2, ++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, ++ }; ++ ++ if (pps->dependent_slice_segments_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT; ++ ++ if (pps->output_flag_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; ++ ++ if (pps->sign_data_hiding_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; ++ ++ if (pps->cabac_init_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; ++ ++ if (pps->constrained_intra_pred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ ++ if (pps->transform_skip_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; ++ ++ if (pps->cu_qp_delta_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; ++ ++ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; ++ ++ if (pps->weighted_pred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; ++ ++ if (pps->weighted_bipred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; ++ ++ if (pps->transquant_bypass_enable_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; ++ ++ if (pps->tiles_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; ++ ++ if (pps->loop_filter_across_tiles_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; ++ ++ if (pps->seq_loop_filter_across_slices_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (pps->deblocking_filter_override_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; ++ ++ if (pps->disable_dbf) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; ++ ++ if (pps->lists_modification_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; ++ ++ if (pps->slice_header_extension_present_flag) ++ controls->pps.flags |= 
V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; ++ ++ if (pps->tiles_enabled_flag) { ++ controls->pps.num_tile_columns_minus1 = pps->num_tile_columns - 1; ++ controls->pps.num_tile_rows_minus1 = pps->num_tile_rows - 1; ++ ++ for (int i = 0; i < pps->num_tile_columns; i++) ++ controls->pps.column_width_minus1[i] = pps->column_width[i] - 1; ++ ++ for (int i = 0; i < pps->num_tile_rows; i++) ++ controls->pps.row_height_minus1[i] = pps->row_height[i] - 1; ++ } ++ ++ controls->first_slice = 1; ++ controls->num_slices = 0; ++ ++ if ((rv = ff_v4l2_request_reset_frame(avctx, h->ref->frame)) != 0) ++ return rv; ++ ++ ff_v4l2_request_start_phase_control(h->ref->frame, ctx->pctrl); ++ ++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame ++ ++ return 0; ++} ++ ++static int v4l2_request_hevc_queue_decode(AVCodecContext *avctx, int last_slice) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .ptr = &controls->sps, ++ .size = sizeof(controls->sps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, ++ .ptr = &controls->pps, ++ .size = sizeof(controls->pps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params[0]) * FFMAX(FFMIN(controls->num_slices, MAX_SLICES), ctx->max_slices), ++ }, ++ }; ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED) ++ return ff_v4l2_request_decode_slice(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); ++ ++ return ff_v4l2_request_decode_frame(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->ref->frame->data[0]; ++ int ret, slice = FFMIN(controls->num_slices, MAX_SLICES - 1); ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && slice) { ++ ret = v4l2_request_hevc_queue_decode(avctx, 0); ++ if (ret) ++ return ret; ++ ++ ff_v4l2_request_reset_frame(avctx, h->ref->frame); ++ slice = controls->num_slices = 0; ++ controls->first_slice = 0; ++ } ++ ++ v4l2_request_hevc_fill_slice_params(h, &controls->slice_params[slice]); ++ ++ if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, nalu_slice_start_code, 3); ++ if (ret) ++ return ret; ++ } ++ ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, buffer, size); ++ if (ret) ++ return ret; ++ ++ controls->slice_params[slice].bit_size = req->output.used * 8; //FIXME ++ controls->num_slices++; ++ return 0; ++} ++ ++static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) { ++ const HEVCContext *h = avctx->priv_data; ++ ++ if (h->ref != NULL) ++ ff_v4l2_request_abort_phase_control(h->ref->frame); ++} ++ ++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) ++{ ++ int rv = 
v4l2_request_hevc_queue_decode(avctx, 1);
++    if (rv < 0)
++        v4l2_request_hevc_abort_frame(avctx);
++    return rv;
++}
++
++static int v4l2_request_hevc_set_controls(AVCodecContext *avctx)
++{
++    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++    int ret;
++
++    struct v4l2_ext_control control[] = {
++        { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
++        { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
++    };
++    struct v4l2_query_ext_ctrl slice_params = {
++        .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS,
++    };
++
++    ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE);
++    if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED &&
++        ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) {
++        av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
++        return AVERROR(EINVAL);
++    }
++
++    ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_START_CODE);
++    if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE &&
++        ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
++        av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
++        return AVERROR(EINVAL);
++    }
++
++    ret = ff_v4l2_request_query_control(avctx, &slice_params);
++    if (ret)
++        return ret;
++
++    ctx->max_slices = slice_params.elems;
++    if (ctx->max_slices > MAX_SLICES) {
++        av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices);
++        return AVERROR(EINVAL);
++    }
++
++    control[0].value = ctx->decode_mode;
++    control[1].value = ctx->start_code;
++
++    return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control));
++}
++
++static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
++{
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    ff_v4l2_phase_control_deletez(&ctx->pctrl);
++    return ff_v4l2_request_uninit(avctx);
++}
++
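++/*
++ * This hwaccel is marked HWACCEL_CAP_MT_SAFE, so frame threads may enter
++ * start_frame concurrently. The two-phase V4L2PhaseControl allocated at init
++ * time is used to keep the decode phases of concurrent frames ordered:
++ * start_frame attaches it to each frame and uninit tears it down again.
++ */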
++static int v4l2_request_hevc_init(AVCodecContext *avctx)
++{
++    const HEVCContext *h = avctx->priv_data;
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    struct v4l2_ctrl_hevc_sps sps;
++    int ret;
++
++    struct v4l2_ext_control control[] = {
++        {
++            .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS,
++            .ptr = &sps,
++            .size = sizeof(sps),
++        },
++    };
++
++    if ((ctx->pctrl = ff_v4l2_phase_control_new(2)) == NULL)
++        return AVERROR(ENOMEM);
++
++    fill_sps(&sps, h);
++
++    ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_HEVC_SLICE, 4 * 1024 * 1024, control, FF_ARRAY_ELEMS(control));
++    if (ret)
++        return ret;
++
++    return v4l2_request_hevc_set_controls(avctx);
++}
++
++const AVHWAccel ff_hevc_v4l2request_hwaccel = {
++    .name = "hevc_v4l2request",
++    .type = AVMEDIA_TYPE_VIDEO,
++    .id = AV_CODEC_ID_HEVC,
++    .pix_fmt = AV_PIX_FMT_DRM_PRIME,
++    .start_frame = v4l2_request_hevc_start_frame,
++    .decode_slice = v4l2_request_hevc_decode_slice,
++    .end_frame = v4l2_request_hevc_end_frame,
++    .abort_frame = v4l2_request_hevc_abort_frame,
++    .frame_priv_data_size = sizeof(V4L2RequestControlsHEVC),
++    .init = v4l2_request_hevc_init,
++    .uninit = v4l2_request_hevc_uninit,
++    .priv_data_size = sizeof(V4L2RequestContextHEVC),
++    .frame_params = ff_v4l2_request_frame_params,
++    .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
+diff --git a/libavcodec/v4l2_request_mpeg2.c b/libavcodec/v4l2_request_mpeg2.c
+new file mode 100644
+index 0000000000..bc251a6fd2
+--- /dev/null
++++ b/libavcodec/v4l2_request_mpeg2.c
+@@ -0,0 +1,155 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "hwconfig.h"
++#include "mpegvideo.h"
++#include "v4l2_request.h"
++#include "mpeg2-ctrls.h"
++
++typedef struct V4L2RequestControlsMPEG2 {
++    struct v4l2_ctrl_mpeg2_slice_params slice_params;
++    struct v4l2_ctrl_mpeg2_quantization quantization;
++} V4L2RequestControlsMPEG2;
++
++static int v4l2_request_mpeg2_start_frame(AVCodecContext *avctx,
++                                          av_unused const uint8_t *buffer,
++                                          av_unused uint32_t size)
++{
++    const MpegEncContext *s = avctx->priv_data;
++    V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private;
++    V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0];
++
++    controls->slice_params = (struct v4l2_ctrl_mpeg2_slice_params) {
++        .bit_size = 0,
++        .data_bit_offset = 0,
++
++        /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */
++        .quantiser_scale_code = s->qscale >> 1,
++
++        .sequence = {
++            /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */
++            .horizontal_size = s->width,
++            .vertical_size = s->height,
++            .vbv_buffer_size = req->output.size,
++
++            /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */
++            .profile_and_level_indication = 0,
++            .progressive_sequence = s->progressive_sequence,
++            .chroma_format = s->chroma_format,
++        },
++
++        .picture = {
++            /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */
++            .picture_coding_type = s->pict_type,
++
++            /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */
++            .f_code[0][0] = s->mpeg_f_code[0][0],
++            .f_code[0][1] = s->mpeg_f_code[0][1],
++            .f_code[1][0] = s->mpeg_f_code[1][0],
++            .f_code[1][1] = s->mpeg_f_code[1][1],
++            .intra_dc_precision = s->intra_dc_precision,
++            .picture_structure = s->picture_structure,
++            .top_field_first = s->top_field_first,
++            .frame_pred_frame_dct = s->frame_pred_frame_dct,
++            .concealment_motion_vectors = s->concealment_motion_vectors,
++            .q_scale_type = s->q_scale_type,
++            .intra_vlc_format = s->intra_vlc_format,
++            .alternate_scan = s->alternate_scan,
++            .repeat_first_field = s->repeat_first_field,
++            .progressive_frame = s->progressive_frame,
++        },
++    };
++
++    switch (s->pict_type) {
++    case AV_PICTURE_TYPE_B:
++        controls->slice_params.backward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->next_picture.f);
++        // fall-through
++    case AV_PICTURE_TYPE_P:
++        controls->slice_params.forward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->last_picture.f);
++    }
++
++    controls->quantization = (struct v4l2_ctrl_mpeg2_quantization) {
++        /* ISO/IEC 13818-2, ITU-T Rec.
H.262: Quant matrix extension */ ++ .load_intra_quantiser_matrix = 1, ++ .load_non_intra_quantiser_matrix = 1, ++ .load_chroma_intra_quantiser_matrix = 1, ++ .load_chroma_non_intra_quantiser_matrix = 1, ++ }; ++ ++ for (int i = 0; i < 64; i++) { ++ int n = s->idsp.idct_permutation[ff_zigzag_direct[i]]; ++ controls->quantization.intra_quantiser_matrix[i] = s->intra_matrix[n]; ++ controls->quantization.non_intra_quantiser_matrix[i] = s->inter_matrix[n]; ++ controls->quantization.chroma_intra_quantiser_matrix[i] = s->chroma_intra_matrix[n]; ++ controls->quantization.chroma_non_intra_quantiser_matrix[i] = s->chroma_inter_matrix[n]; ++ } ++ ++ return ff_v4l2_request_reset_frame(avctx, s->current_picture_ptr->f); ++} ++ ++static int v4l2_request_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const MpegEncContext *s = avctx->priv_data; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, s->current_picture_ptr->f, buffer, size); ++} ++ ++static int v4l2_request_mpeg2_end_frame(AVCodecContext *avctx) ++{ ++ const MpegEncContext *s = avctx->priv_data; ++ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION, ++ .ptr = &controls->quantization, ++ .size = sizeof(controls->quantization), ++ }, ++ }; ++ ++ controls->slice_params.bit_size = req->output.used * 8; ++ ++ return ff_v4l2_request_decode_frame(avctx, s->current_picture_ptr->f, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_mpeg2_init(AVCodecContext *avctx) ++{ ++ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_MPEG2_SLICE, 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_mpeg2_v4l2request_hwaccel = { ++ .name = "mpeg2_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_MPEG2VIDEO, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_mpeg2_start_frame, ++ .decode_slice = v4l2_request_mpeg2_decode_slice, ++ .end_frame = v4l2_request_mpeg2_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsMPEG2), ++ .init = v4l2_request_mpeg2_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/v4l2_request_vp8.c b/libavcodec/v4l2_request_vp8.c +new file mode 100644 +index 0000000000..ea2c55fa2f +--- /dev/null ++++ b/libavcodec/v4l2_request_vp8.c +@@ -0,0 +1,181 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hwconfig.h" ++#include "v4l2_request.h" ++#include "vp8.h" ++#include "vp8-ctrls.h" ++ ++typedef struct V4L2RequestControlsVP8 { ++ struct v4l2_ctrl_vp8_frame_header ctrl; ++} V4L2RequestControlsVP8; ++ ++static int v4l2_request_vp8_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ ++ memset(&controls->ctrl, 0, sizeof(controls->ctrl)); ++ return ff_v4l2_request_reset_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f); ++} ++ ++static int v4l2_request_vp8_end_frame(AVCodecContext *avctx) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER, ++ .ptr = &controls->ctrl, ++ .size = sizeof(controls->ctrl), ++ }, ++ }; ++ ++ return ff_v4l2_request_decode_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, ++ control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_vp8_decode_slice(AVCodecContext *avctx, ++ const uint8_t *buffer, ++ uint32_t size) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ struct v4l2_ctrl_vp8_frame_header *hdr = &controls->ctrl; ++ const uint8_t *data = buffer + 3 + 7 * s->keyframe; ++ unsigned int i, j, k; ++ ++ hdr->version = s->profile & 0x3; ++ hdr->width = avctx->width; ++ hdr->height = avctx->height; ++ /* FIXME: set ->xx_scale */ ++ hdr->prob_skip_false = s->prob->mbskip; ++ hdr->prob_intra = s->prob->intra; ++ hdr->prob_gf = s->prob->golden; ++ hdr->prob_last = s->prob->last; ++ hdr->first_part_size = s->header_partition_size; ++ hdr->first_part_header_bits = (8 * (s->coder_state_at_header_end.input - data) - ++ s->coder_state_at_header_end.bit_count - 8); ++ hdr->num_dct_parts = s->num_coeff_partitions; ++ for (i = 0; i < 8; i++) ++ hdr->dct_part_sizes[i] = s->coeff_partition_size[i]; ++ ++ hdr->coder_state.range = s->coder_state_at_header_end.range; ++ hdr->coder_state.value = s->coder_state_at_header_end.value; ++ hdr->coder_state.bit_count = s->coder_state_at_header_end.bit_count; ++ if (s->framep[VP56_FRAME_PREVIOUS]) ++ hdr->last_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_PREVIOUS]->tf.f); ++ if (s->framep[VP56_FRAME_GOLDEN]) ++ hdr->golden_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN]->tf.f); ++ if (s->framep[VP56_FRAME_GOLDEN2]) ++ hdr->alt_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN2]->tf.f); ++ hdr->flags |= s->invisible ? 0 : V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME; ++ hdr->flags |= s->mbskip_enabled ? V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF : 0; ++ hdr->flags |= (s->profile & 0x4) ? V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL : 0; ++ hdr->flags |= s->keyframe ? V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME : 0; ++ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN : 0; ++ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN2] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT : 0; ++ hdr->segment_header.flags |= s->segmentation.enabled ? 
V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED : 0; ++ hdr->segment_header.flags |= s->segmentation.update_map ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP : 0; ++ hdr->segment_header.flags |= s->segmentation.update_feature_data ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA : 0; ++ hdr->segment_header.flags |= s->segmentation.absolute_vals ? 0 : V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE; ++ for (i = 0; i < 4; i++) { ++ hdr->segment_header.quant_update[i] = s->segmentation.base_quant[i]; ++ hdr->segment_header.lf_update[i] = s->segmentation.filter_level[i]; ++ } ++ ++ for (i = 0; i < 3; i++) ++ hdr->segment_header.segment_probs[i] = s->prob->segmentid[i]; ++ ++ hdr->lf_header.level = s->filter.level; ++ hdr->lf_header.sharpness_level = s->filter.sharpness; ++ hdr->lf_header.flags |= s->lf_delta.enabled ? V4L2_VP8_LF_HEADER_ADJ_ENABLE : 0; ++ hdr->lf_header.flags |= s->lf_delta.update ? V4L2_VP8_LF_HEADER_DELTA_UPDATE : 0; ++ hdr->lf_header.flags |= s->filter.simple ? V4L2_VP8_LF_FILTER_TYPE_SIMPLE : 0; ++ for (i = 0; i < 4; i++) { ++ hdr->lf_header.ref_frm_delta[i] = s->lf_delta.ref[i]; ++ hdr->lf_header.mb_mode_delta[i] = s->lf_delta.mode[i + MODE_I4x4]; ++ } ++ ++ // Probabilities ++ if (s->keyframe) { ++ static const uint8_t keyframe_y_mode_probs[4] = { ++ 145, 156, 163, 128 ++ }; ++ static const uint8_t keyframe_uv_mode_probs[3] = { ++ 142, 114, 183 ++ }; ++ ++ memcpy(hdr->entropy_header.y_mode_probs, keyframe_y_mode_probs, 4); ++ memcpy(hdr->entropy_header.uv_mode_probs, keyframe_uv_mode_probs, 3); ++ } else { ++ for (i = 0; i < 4; i++) ++ hdr->entropy_header.y_mode_probs[i] = s->prob->pred16x16[i]; ++ for (i = 0; i < 3; i++) ++ hdr->entropy_header.uv_mode_probs[i] = s->prob->pred8x8c[i]; ++ } ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < 19; j++) ++ hdr->entropy_header.mv_probs[i][j] = s->prob->mvc[i][j]; ++ ++ for (i = 0; i < 4; i++) { ++ for (j = 0; j < 8; j++) { ++ static const int coeff_bands_inverse[8] = { ++ 0, 1, 2, 3, 5, 6, 4, 15 ++ }; ++ int coeff_pos = coeff_bands_inverse[j]; ++ ++ for (k = 0; k < 3; k++) { ++ memcpy(hdr->entropy_header.coeff_probs[i][j][k], ++ s->prob->token[i][coeff_pos][k], 11); ++ } ++ } ++ } ++ ++ hdr->quant_header.y_ac_qi = s->quant.yac_qi; ++ hdr->quant_header.y_dc_delta = s->quant.ydc_delta; ++ hdr->quant_header.y2_dc_delta = s->quant.y2dc_delta; ++ hdr->quant_header.y2_ac_delta = s->quant.y2ac_delta; ++ hdr->quant_header.uv_dc_delta = s->quant.uvdc_delta; ++ hdr->quant_header.uv_ac_delta = s->quant.uvac_delta; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, buffer, size); ++} ++ ++static int v4l2_request_vp8_init(AVCodecContext *avctx) ++{ ++ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_VP8_FRAME, 2 * 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_vp8_v4l2request_hwaccel = { ++ .name = "vp8_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_VP8, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_vp8_start_frame, ++ .decode_slice = v4l2_request_vp8_decode_slice, ++ .end_frame = v4l2_request_vp8_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsVP8), ++ .init = v4l2_request_vp8_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/v4l2_request_vp9.c b/libavcodec/v4l2_request_vp9.c +new file mode 100644 +index 0000000000..2e10b7ad1a +--- /dev/null ++++ b/libavcodec/v4l2_request_vp9.c +@@ -0,0 +1,353 @@
++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hwconfig.h" ++#include "v4l2_request.h" ++#include "vp9dec.h" ++#include "vp9-ctrls.h" ++ ++typedef struct V4L2RequestControlsVP9 { ++ struct v4l2_ctrl_vp9_frame_decode_params decode_params; ++} V4L2RequestControlsVP9; ++ ++static const uint8_t ff_to_v4l2_intramode[] = { ++ [VERT_PRED] = V4L2_VP9_INTRA_PRED_MODE_V, ++ [HOR_PRED] = V4L2_VP9_INTRA_PRED_MODE_H, ++ [DC_PRED] = V4L2_VP9_INTRA_PRED_MODE_DC, ++ [DIAG_DOWN_LEFT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D45, ++ [DIAG_DOWN_RIGHT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D135, ++ [VERT_RIGHT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D117, ++ [HOR_DOWN_PRED] = V4L2_VP9_INTRA_PRED_MODE_D153, ++ [VERT_LEFT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D63, ++ [HOR_UP_PRED] = V4L2_VP9_INTRA_PRED_MODE_D207, ++ [TM_VP8_PRED] = V4L2_VP9_INTRA_PRED_MODE_TM, ++}; ++ ++static int v4l2_request_vp9_set_frame_ctx(AVCodecContext *avctx, unsigned int id) ++{ ++ VP9Context *s = avctx->priv_data; ++ struct v4l2_ctrl_vp9_frame_ctx fctx = {}; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(id), ++ .ptr = &fctx, ++ .size = sizeof(fctx), ++ }, ++ }; ++ ++ memcpy(fctx.probs.tx8, s->prob_ctx[id].p.tx8p, sizeof(s->prob_ctx[id].p.tx8p)); ++ memcpy(fctx.probs.tx16, s->prob_ctx[id].p.tx16p, sizeof(s->prob_ctx[id].p.tx16p)); ++ memcpy(fctx.probs.tx32, s->prob_ctx[id].p.tx32p, sizeof(s->prob_ctx[id].p.tx32p)); ++ memcpy(fctx.probs.coef, s->prob_ctx[id].coef, sizeof(s->prob_ctx[id].coef)); ++ memcpy(fctx.probs.skip, s->prob_ctx[id].p.skip, sizeof(s->prob_ctx[id].p.skip)); ++ memcpy(fctx.probs.inter_mode, s->prob_ctx[id].p.mv_mode, sizeof(s->prob_ctx[id].p.mv_mode)); ++ memcpy(fctx.probs.interp_filter, s->prob_ctx[id].p.filter, sizeof(s->prob_ctx[id].p.filter)); ++ memcpy(fctx.probs.is_inter, s->prob_ctx[id].p.intra, sizeof(s->prob_ctx[id].p.intra)); ++ memcpy(fctx.probs.comp_mode, s->prob_ctx[id].p.comp, sizeof(s->prob_ctx[id].p.comp)); ++ memcpy(fctx.probs.single_ref, s->prob_ctx[id].p.single_ref, sizeof(s->prob_ctx[id].p.single_ref)); ++ memcpy(fctx.probs.comp_ref, s->prob_ctx[id].p.comp_ref, sizeof(s->prob_ctx[id].p.comp_ref)); ++ memcpy(fctx.probs.y_mode, s->prob_ctx[id].p.y_mode, sizeof(s->prob_ctx[id].p.y_mode)); ++ for (unsigned i = 0; i < 10; i++) ++ memcpy(fctx.probs.uv_mode[ff_to_v4l2_intramode[i]], s->prob_ctx[id].p.uv_mode[i], sizeof(s->prob_ctx[id].p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(fctx.probs.partition[i * 4], s->prob_ctx[id].p.partition[3 - i], sizeof(s->prob_ctx[id].p.partition[0])); ++ memcpy(fctx.probs.mv.joint, s->prob_ctx[id].p.mv_joint, sizeof(s->prob_ctx[id].p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ fctx.probs.mv.sign[i] = s->prob_ctx[id].p.mv_comp[i].sign; ++ memcpy(fctx.probs.mv.class[i], 
s->prob_ctx[id].p.mv_comp[i].classes, sizeof(s->prob_ctx[id].p.mv_comp[0].classes)); ++ fctx.probs.mv.class0_bit[i] = s->prob_ctx[id].p.mv_comp[i].class0; ++ memcpy(fctx.probs.mv.bits[i], s->prob_ctx[id].p.mv_comp[i].bits, sizeof(s->prob_ctx[id].p.mv_comp[0].bits)); ++ memcpy(fctx.probs.mv.class0_fr[i], s->prob_ctx[id].p.mv_comp[i].class0_fp, sizeof(s->prob_ctx[id].p.mv_comp[0].class0_fp)); ++ memcpy(fctx.probs.mv.fr[i], s->prob_ctx[id].p.mv_comp[i].fp, sizeof(s->prob_ctx[id].p.mv_comp[0].fp)); ++ fctx.probs.mv.class0_hp[i] = s->prob_ctx[id].p.mv_comp[i].class0_hp; ++ fctx.probs.mv.hp[i] = s->prob_ctx[id].p.mv_comp[i].hp; ++ } ++ ++ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_vp9_get_frame_ctx(AVCodecContext *avctx, unsigned int id) ++{ ++ VP9Context *s = avctx->priv_data; ++ struct v4l2_ctrl_vp9_frame_ctx fctx = {}; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(id), ++ .ptr = &fctx, ++ .size = sizeof(fctx), ++ }, ++ }; ++ ++ int ret = ff_v4l2_request_get_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ memcpy(s->prob_ctx[id].p.tx8p, fctx.probs.tx8, sizeof(s->prob_ctx[id].p.tx8p)); ++ memcpy(s->prob_ctx[id].p.tx16p, fctx.probs.tx16, sizeof(s->prob_ctx[id].p.tx16p)); ++ memcpy(s->prob_ctx[id].p.tx32p, fctx.probs.tx32, sizeof(s->prob_ctx[id].p.tx32p)); ++ memcpy(s->prob_ctx[id].coef, fctx.probs.coef, sizeof(s->prob_ctx[id].coef)); ++ memcpy(s->prob_ctx[id].p.skip, fctx.probs.skip, sizeof(s->prob_ctx[id].p.skip)); ++ memcpy(s->prob_ctx[id].p.mv_mode, fctx.probs.inter_mode, sizeof(s->prob_ctx[id].p.mv_mode)); ++ memcpy(s->prob_ctx[id].p.filter, fctx.probs.interp_filter, sizeof(s->prob_ctx[id].p.filter)); ++ memcpy(s->prob_ctx[id].p.intra, fctx.probs.is_inter, sizeof(s->prob_ctx[id].p.intra)); ++ memcpy(s->prob_ctx[id].p.comp, fctx.probs.comp_mode, sizeof(s->prob_ctx[id].p.comp)); ++ memcpy(s->prob_ctx[id].p.single_ref, fctx.probs.single_ref, sizeof(s->prob_ctx[id].p.single_ref)); ++ memcpy(s->prob_ctx[id].p.comp_ref, fctx.probs.comp_ref, sizeof(s->prob_ctx[id].p.comp_ref)); ++ memcpy(s->prob_ctx[id].p.y_mode, fctx.probs.y_mode, sizeof(s->prob_ctx[id].p.y_mode)); ++ for (unsigned i = 0; i < 10; i++) ++ memcpy(s->prob_ctx[id].p.uv_mode[i], fctx.probs.uv_mode[ff_to_v4l2_intramode[i]], sizeof(s->prob_ctx[id].p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(s->prob_ctx[id].p.partition[3 - i], fctx.probs.partition[i * 4], sizeof(s->prob_ctx[id].p.partition[0])); ++ memcpy(s->prob_ctx[id].p.mv_joint, fctx.probs.mv.joint, sizeof(s->prob_ctx[id].p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ s->prob_ctx[id].p.mv_comp[i].sign = fctx.probs.mv.sign[i]; ++ memcpy(s->prob_ctx[id].p.mv_comp[i].classes, fctx.probs.mv.class[i], sizeof(s->prob_ctx[id].p.mv_comp[0].classes)); ++ s->prob_ctx[id].p.mv_comp[i].class0 = fctx.probs.mv.class0_bit[i]; ++ memcpy(s->prob_ctx[id].p.mv_comp[i].bits, fctx.probs.mv.bits[i], sizeof(s->prob_ctx[id].p.mv_comp[0].bits)); ++ memcpy(s->prob_ctx[id].p.mv_comp[i].class0_fp, fctx.probs.mv.class0_fr[i], sizeof(s->prob_ctx[id].p.mv_comp[0].class0_fp)); ++ memcpy(s->prob_ctx[id].p.mv_comp[i].fp, fctx.probs.mv.fr[i], sizeof(s->prob_ctx[id].p.mv_comp[0].fp)); ++ s->prob_ctx[id].p.mv_comp[i].class0_hp = fctx.probs.mv.class0_hp[i]; ++ s->prob_ctx[id].p.mv_comp[i].hp = fctx.probs.mv.hp[i]; ++ } ++ ++ return 0; ++} ++ ++static int v4l2_request_vp9_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ 
av_unused uint32_t size) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ V4L2RequestControlsVP9 *controls = f->hwaccel_picture_private; ++ struct v4l2_ctrl_vp9_frame_decode_params *dec_params = &controls->decode_params; ++ int ret; ++ ++ if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) { ++ for (unsigned i = 0; i < 4; i++) { ++ ret = v4l2_request_vp9_set_frame_ctx(avctx, i); ++ if (ret) ++ return ret; ++ } ++ } else if (s->s.h.intraonly && s->s.h.resetctx == 2) { ++ ret = v4l2_request_vp9_set_frame_ctx(avctx, s->s.h.framectxid); ++ if (ret) ++ return ret; ++ } ++ ++ if (s->s.h.keyframe) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_KEY_FRAME; ++ if (!s->s.h.invisible) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_SHOW_FRAME; ++ if (s->s.h.errorres) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT; ++ if (s->s.h.intraonly) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_INTRA_ONLY; ++ if (!s->s.h.keyframe && s->s.h.highprecisionmvs) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV; ++ if (s->s.h.refreshctx) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX; ++ if (s->s.h.parallelmode) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE; ++ if (s->ss_h) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING; ++ if (s->ss_v) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING; ++ if (avctx->color_range == AVCOL_RANGE_JPEG) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING; ++ ++ dec_params->compressed_header_size = s->s.h.compressed_header_size; ++ dec_params->uncompressed_header_size = s->s.h.uncompressed_header_size; ++ dec_params->profile = s->s.h.profile; ++ dec_params->reset_frame_context = s->s.h.resetctx > 0 ? 
s->s.h.resetctx - 1 : 0; ++ dec_params->frame_context_idx = s->s.h.framectxid; ++ dec_params->bit_depth = s->s.h.bpp; ++ ++ dec_params->interpolation_filter = s->s.h.filtermode ^ (s->s.h.filtermode <= 1); ++ dec_params->tile_cols_log2 = s->s.h.tiling.log2_tile_cols; ++ dec_params->tile_rows_log2 = s->s.h.tiling.log2_tile_rows; ++ dec_params->tx_mode = s->s.h.txfmmode; ++ dec_params->reference_mode = s->s.h.comppredmode; ++ dec_params->frame_width_minus_1 = s->w - 1; ++ dec_params->frame_height_minus_1 = s->h - 1; ++ //dec_params->render_width_minus_1 = avctx->width - 1; ++ //dec_params->render_height_minus_1 = avctx->height - 1; ++ ++ for (unsigned i = 0; i < 3; i++) { ++ const ThreadFrame *ref = &s->s.refs[s->s.h.refidx[i]]; ++ if (ref->f && ref->f->buf[0]) ++ dec_params->refs[i] = ff_v4l2_request_get_capture_timestamp(ref->f); ++ } ++ ++ if (s->s.h.lf_delta.enabled) ++ dec_params->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED; ++ if (s->s.h.lf_delta.updated) ++ dec_params->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE; ++ ++ dec_params->lf.level = s->s.h.filter.level; ++ dec_params->lf.sharpness = s->s.h.filter.sharpness; ++ for (unsigned i = 0; i < 4; i++) ++ dec_params->lf.ref_deltas[i] = s->s.h.lf_delta.ref[i]; ++ for (unsigned i = 0; i < 2; i++) ++ dec_params->lf.mode_deltas[i] = s->s.h.lf_delta.mode[i]; ++ for (unsigned i = 0; i < 8; i++) { ++ for (unsigned j = 0; j < 4; j++) ++ memcpy(dec_params->lf.level_lookup[i][j], s->s.h.segmentation.feat[i].lflvl[j], sizeof(dec_params->lf.level_lookup[0][0])); ++ } ++ ++ dec_params->quant.base_q_idx = s->s.h.yac_qi; ++ dec_params->quant.delta_q_y_dc = s->s.h.ydc_qdelta; ++ dec_params->quant.delta_q_uv_dc = s->s.h.uvdc_qdelta; ++ dec_params->quant.delta_q_uv_ac = s->s.h.uvac_qdelta; ++ ++ if (s->s.h.segmentation.enabled) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_ENABLED; ++ if (s->s.h.segmentation.update_map) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP; ++ if (s->s.h.segmentation.temporal) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE; ++ if (s->s.h.segmentation.update_data) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA; ++ if (s->s.h.segmentation.absolute_vals) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE; ++ ++ for (unsigned i = 0; i < 7; i++) ++ dec_params->seg.tree_probs[i] = s->s.h.segmentation.prob[i]; ++ ++ if (s->s.h.segmentation.temporal) { ++ for (unsigned i = 0; i < 3; i++) ++ dec_params->seg.pred_probs[i] = s->s.h.segmentation.pred_prob[i]; ++ } else { ++ memset(dec_params->seg.pred_probs, 255, sizeof(dec_params->seg.pred_probs)); ++ } ++ ++ for (unsigned i = 0; i < 8; i++) { ++ if (s->s.h.segmentation.feat[i].q_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_QP_DELTA; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_QP_DELTA] = s->s.h.segmentation.feat[i].q_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].lf_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_LF; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_LF] = s->s.h.segmentation.feat[i].lf_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].ref_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_REF_FRAME; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_REF_FRAME] = s->s.h.segmentation.feat[i].ref_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].skip_enabled) ++ dec_params->seg.feature_enabled[i] |= 1 << 
V4L2_VP9_SEGMENT_FEATURE_SKIP; ++ } ++ ++ memcpy(dec_params->probs.tx8, s->prob.p.tx8p, sizeof(s->prob.p.tx8p)); ++ memcpy(dec_params->probs.tx16, s->prob.p.tx16p, sizeof(s->prob.p.tx16p)); ++ memcpy(dec_params->probs.tx32, s->prob.p.tx32p, sizeof(s->prob.p.tx32p)); ++ for (unsigned i = 0; i < 4; i++) { ++ for (unsigned j = 0; j < 2; j++) { ++ for (unsigned k = 0; k < 2; k++) { ++ for (unsigned l = 0; l < 6; l++) { ++ for (unsigned m = 0; m < 6; m++) { ++ memcpy(dec_params->probs.coef[i][j][k][l][m], s->prob.coef[i][j][k][l][m], sizeof(dec_params->probs.coef[0][0][0][0][0])); ++ } ++ } ++ } ++ } ++ } ++ memcpy(dec_params->probs.skip, s->prob.p.skip, sizeof(s->prob.p.skip)); ++ memcpy(dec_params->probs.inter_mode, s->prob.p.mv_mode, sizeof(s->prob.p.mv_mode)); ++ memcpy(dec_params->probs.interp_filter, s->prob.p.filter, sizeof(s->prob.p.filter)); ++ memcpy(dec_params->probs.is_inter, s->prob.p.intra, sizeof(s->prob.p.intra)); ++ memcpy(dec_params->probs.comp_mode, s->prob.p.comp, sizeof(s->prob.p.comp)); ++ memcpy(dec_params->probs.single_ref, s->prob.p.single_ref, sizeof(s->prob.p.single_ref)); ++ memcpy(dec_params->probs.comp_ref, s->prob.p.comp_ref, sizeof(s->prob.p.comp_ref)); ++ memcpy(dec_params->probs.y_mode, s->prob.p.y_mode, sizeof(s->prob.p.y_mode)); ++ for (unsigned i = 0; i < 10; i++) ++ memcpy(dec_params->probs.uv_mode[ff_to_v4l2_intramode[i]], s->prob.p.uv_mode[i], sizeof(s->prob.p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(dec_params->probs.partition[i * 4], s->prob.p.partition[3 - i], sizeof(s->prob.p.partition[0])); ++ memcpy(dec_params->probs.mv.joint, s->prob.p.mv_joint, sizeof(s->prob.p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ dec_params->probs.mv.sign[i] = s->prob.p.mv_comp[i].sign; ++ memcpy(dec_params->probs.mv.class[i], s->prob.p.mv_comp[i].classes, sizeof(s->prob.p.mv_comp[0].classes)); ++ dec_params->probs.mv.class0_bit[i] = s->prob.p.mv_comp[i].class0; ++ memcpy(dec_params->probs.mv.bits[i], s->prob.p.mv_comp[i].bits, sizeof(s->prob.p.mv_comp[0].bits)); ++ memcpy(dec_params->probs.mv.class0_fr[i], s->prob.p.mv_comp[i].class0_fp, sizeof(s->prob.p.mv_comp[0].class0_fp)); ++ memcpy(dec_params->probs.mv.fr[i], s->prob.p.mv_comp[i].fp, sizeof(s->prob.p.mv_comp[0].fp)); ++ dec_params->probs.mv.class0_hp[i] = s->prob.p.mv_comp[i].class0_hp; ++ dec_params->probs.mv.hp[i] = s->prob.p.mv_comp[i].hp; ++ } ++ ++ return ff_v4l2_request_reset_frame(avctx, f->tf.f); ++} ++ ++static int v4l2_request_vp9_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, f->tf.f, buffer, size); ++} ++ ++static int v4l2_request_vp9_end_frame(AVCodecContext *avctx) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ V4L2RequestControlsVP9 *controls = f->hwaccel_picture_private; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_DECODE_PARAMS, ++ .ptr = &controls->decode_params, ++ .size = sizeof(controls->decode_params), ++ }, ++ }; ++ ++ ret = ff_v4l2_request_decode_frame(avctx, f->tf.f, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ if (!s->s.h.refreshctx) ++ return 0; ++ ++ return v4l2_request_vp9_get_frame_ctx(avctx, s->s.h.framectxid); ++} ++ ++static int v4l2_request_vp9_init(AVCodecContext *avctx) ++{ ++ // TODO: check V4L2_CID_MPEG_VIDEO_VP9_PROFILE ++ return 
ff_v4l2_request_init(avctx, V4L2_PIX_FMT_VP9_FRAME, 3 * 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_vp9_v4l2request_hwaccel = { ++ .name = "vp9_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_VP9, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_vp9_start_frame, ++ .decode_slice = v4l2_request_vp9_decode_slice, ++ .end_frame = v4l2_request_vp9_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsVP9), ++ .init = v4l2_request_vp9_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/vp8-ctrls.h b/libavcodec/vp8-ctrls.h +new file mode 100644 +index 0000000000..53cba826e4 +--- /dev/null ++++ b/libavcodec/vp8-ctrls.h +@@ -0,0 +1,112 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the VP8 state controls for use with stateless VP8 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _VP8_CTRLS_H_ ++#define _VP8_CTRLS_H_ ++ ++#include <linux/types.h> ++ ++#define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F') ++ ++#define V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER (V4L2_CID_MPEG_BASE + 2000) ++#define V4L2_CTRL_TYPE_VP8_FRAME_HEADER 0x301 ++ ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED 0x01 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP 0x02 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA 0x04 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE 0x08 ++ ++struct v4l2_vp8_segment_header { ++ __s8 quant_update[4]; ++ __s8 lf_update[4]; ++ __u8 segment_probs[3]; ++ __u8 padding; ++ __u32 flags; ++}; ++ ++#define V4L2_VP8_LF_HEADER_ADJ_ENABLE 0x01 ++#define V4L2_VP8_LF_HEADER_DELTA_UPDATE 0x02 ++#define V4L2_VP8_LF_FILTER_TYPE_SIMPLE 0x04 ++struct v4l2_vp8_loopfilter_header { ++ __s8 ref_frm_delta[4]; ++ __s8 mb_mode_delta[4]; ++ __u8 sharpness_level; ++ __u8 level; ++ __u16 padding; ++ __u32 flags; ++}; ++ ++struct v4l2_vp8_quantization_header { ++ __u8 y_ac_qi; ++ __s8 y_dc_delta; ++ __s8 y2_dc_delta; ++ __s8 y2_ac_delta; ++ __s8 uv_dc_delta; ++ __s8 uv_ac_delta; ++ __u16 padding; ++}; ++ ++struct v4l2_vp8_entropy_header { ++ __u8 coeff_probs[4][8][3][11]; ++ __u8 y_mode_probs[4]; ++ __u8 uv_mode_probs[3]; ++ __u8 mv_probs[2][19]; ++ __u8 padding[3]; ++}; ++ ++struct v4l2_vp8_entropy_coder_state { ++ __u8 range; ++ __u8 value; ++ __u8 bit_count; ++ __u8 padding; ++}; ++ ++#define V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME 0x01 ++#define V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL 0x02 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME 0x04 ++#define V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF 0x08 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN 0x10 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT 0x20 ++ ++#define VP8_FRAME_IS_KEY_FRAME(hdr) \ ++ (!!((hdr)->flags & V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME)) ++ ++struct v4l2_ctrl_vp8_frame_header { ++ struct v4l2_vp8_segment_header segment_header; ++ struct v4l2_vp8_loopfilter_header lf_header; ++ struct v4l2_vp8_quantization_header quant_header; ++ struct v4l2_vp8_entropy_header entropy_header; ++ struct v4l2_vp8_entropy_coder_state coder_state; ++ ++ __u16 width; ++ __u16 height; ++ ++ __u8 horizontal_scale; ++ __u8 vertical_scale; ++ ++ __u8 version; ++ __u8 prob_skip_false; ++ __u8 prob_intra; ++ __u8 prob_last; ++ __u8 prob_gf; ++ __u8
num_dct_parts; ++ ++ __u32 first_part_size; ++ __u32 first_part_header_bits; ++ __u32 dct_part_sizes[8]; ++ ++ __u64 last_frame_ts; ++ __u64 golden_frame_ts; ++ __u64 alt_frame_ts; ++ ++ __u64 flags; ++}; ++ ++#endif +diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c +index bab4223aca..0e1edb46fb 100644 +--- a/libavcodec/vp8.c ++++ b/libavcodec/vp8.c +@@ -175,6 +175,9 @@ static enum AVPixelFormat get_pixel_format(VP8Context *s) + #endif + #if CONFIG_VP8_NVDEC_HWACCEL + AV_PIX_FMT_CUDA, ++#endif ++#if CONFIG_VP8_V4L2REQUEST_HWACCEL ++ AV_PIX_FMT_DRM_PRIME, + #endif + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE, +@@ -198,7 +201,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7) + return ret; + } + +- if (!s->actually_webp && !is_vp7) { ++ if (!s->actually_webp && !is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) { + s->pix_fmt = get_pixel_format(s); + if (s->pix_fmt < 0) + return AVERROR(EINVAL); +@@ -2968,6 +2971,9 @@ AVCodec ff_vp8_decoder = { + #endif + #if CONFIG_VP8_NVDEC_HWACCEL + HWACCEL_NVDEC(vp8), ++#endif ++#if CONFIG_VP8_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(vp8), + #endif + NULL + }, +diff --git a/libavcodec/vp9-ctrls.h b/libavcodec/vp9-ctrls.h +new file mode 100644 +index 0000000000..0cdea8a18b +--- /dev/null ++++ b/libavcodec/vp9-ctrls.h +@@ -0,0 +1,485 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the VP9 state controls for use with stateless VP9 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _VP9_CTRLS_H_ ++#define _VP9_CTRLS_H_ ++ ++#include <linux/types.h> ++ ++#define V4L2_PIX_FMT_VP9_FRAME v4l2_fourcc('V', 'P', '9', 'F') ++ ++#define V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(i) (V4L2_CID_MPEG_BASE + 4000 + (i)) ++#define V4L2_CID_MPEG_VIDEO_VP9_FRAME_DECODE_PARAMS (V4L2_CID_MPEG_BASE + 4004) ++#define V4L2_CTRL_TYPE_VP9_FRAME_CONTEXT 0x400 ++#define V4L2_CTRL_TYPE_VP9_FRAME_DECODE_PARAMS 0x404 ++ ++/** ++ * enum v4l2_vp9_loop_filter_flags - VP9 loop filter flags ++ * ++ * @V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED: the filter level depends on ++ * the mode and reference frame used ++ * to predict a block ++ * @V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE: the bitstream contains additional ++ * syntax elements that specify which ++ * mode and reference frame deltas ++ * are to be updated ++ * ++ * Those are the flags you should pass to &v4l2_vp9_loop_filter.flags. See ++ * section '7.2.8 Loop filter semantics' of the VP9 specification for more ++ * details.
++ */ ++struct v4l2_vp9_loop_filter { ++ __u8 flags; ++ __u8 level; ++ __u8 sharpness; ++ __s8 ref_deltas[4]; ++ __s8 mode_deltas[2]; ++ __u8 level_lookup[8][4][2]; ++}; ++ ++/** ++ * struct v4l2_vp9_quantization - VP9 quantization parameters ++ * ++ * @base_q_idx: indicates the base frame qindex ++ * @delta_q_y_dc: indicates the Y DC quantizer relative to base_q_idx ++ * @delta_q_uv_dc: indicates the UV DC quantizer relative to base_q_idx ++ * @delta_q_uv_ac: indicates the UV AC quantizer relative to base_q_idx ++ * @padding: padding bytes to align things on 64 bits. Must be set to 0 ++ * ++ * Encodes the quantization parameters. See section '7.2.9 Quantization params ++ * syntax' of the VP9 specification for more details. ++ */ ++struct v4l2_vp9_quantization { ++ __u8 base_q_idx; ++ __s8 delta_q_y_dc; ++ __s8 delta_q_uv_dc; ++ __s8 delta_q_uv_ac; ++ __u8 padding[4]; ++}; ++ ++/** ++ * enum v4l2_vp9_segmentation_flags - VP9 segmentation flags ++ * ++ * @V4L2_VP9_SEGMENTATION_FLAG_ENABLED: indicates that this frame makes use of ++ * the segmentation tool ++ * @V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP: indicates that the segmentation map ++ * should be updated during the ++ * decoding of this frame ++ * @V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE: indicates that the updates to ++ * the segmentation map are coded ++ * relative to the existing ++ * segmentation map ++ * @V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA: indicates that new parameters are ++ * about to be specified for each ++ * segment ++ * @V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE: indicates that the ++ * segmentation parameters ++ * represent the actual values ++ * to be used ++ * ++ * Those are the flags you should pass to &v4l2_vp9_segmentation.flags. See ++ * section '7.2.10 Segmentation params syntax' of the VP9 specification for ++ * more details. ++ */ ++enum v4l2_vp9_segmentation_flags { ++ V4L2_VP9_SEGMENTATION_FLAG_ENABLED = 1 << 0, ++ V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP = 1 << 1, ++ V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE = 1 << 2, ++ V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA = 1 << 3, ++ V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE = 1 << 4, ++}; ++ ++#define V4L2_VP9_SEGMENT_FEATURE_ENABLED(id) (1 << (id)) ++#define V4L2_VP9_SEGMENT_FEATURE_ENABLED_MASK 0xf ++ ++/** ++ * enum v4l2_vp9_segment_feature - VP9 segment feature IDs ++ * ++ * @V4L2_VP9_SEGMENT_FEATURE_QP_DELTA: QP delta segment feature ++ * @V4L2_VP9_SEGMENT_FEATURE_LF: loop filter segment feature ++ * @V4L2_VP9_SEGMENT_FEATURE_REF_FRAME: reference frame segment feature ++ * @V4L2_VP9_SEGMENT_FEATURE_SKIP: skip segment feature ++ * @V4L2_VP9_SEGMENT_FEATURE_CNT: number of segment features ++ * ++ * Segment feature IDs. See section '7.2.10 Segmentation params syntax' of the ++ * VP9 specification for more details. ++ */ ++enum v4l2_vp9_segment_feature { ++ V4L2_VP9_SEGMENT_FEATURE_QP_DELTA, ++ V4L2_VP9_SEGMENT_FEATURE_LF, ++ V4L2_VP9_SEGMENT_FEATURE_REF_FRAME, ++ V4L2_VP9_SEGMENT_FEATURE_SKIP, ++ V4L2_VP9_SEGMENT_FEATURE_CNT, ++}; ++ ++/** ++ * struct v4l2_vp9_segmentation - VP9 segmentation parameters ++ * ++ * @flags: combination of V4L2_VP9_SEGMENTATION_FLAG_* flags ++ * @tree_probs: specifies the probability values to be used when ++ * decoding a Segment-ID. See '5.15. Segmentation map' ++ * section of the VP9 specification for more details. ++ * @pred_probs: specifies the probability values to be used when decoding a ++ * Predicted-Segment-ID. See '6.4.14. Get segment id syntax' ++ * section of :ref:`vp9` for more details.
++ * @padding: padding used to make things aligned on 64 bits. Shall be zero ++ * filled ++ * @feature_enabled: bitmask defining which features are enabled in each ++ * segment ++ * @feature_data: data attached to each feature. Data entry is only valid if ++ * the feature is enabled ++ * ++ * Encodes the segmentation parameters. See section '7.2.10 Segmentation ++ * params syntax' of the VP9 specification for more details. ++ */ ++struct v4l2_vp9_segmentation { ++ __u8 flags; ++ __u8 tree_probs[7]; ++ __u8 pred_probs[3]; ++ __u8 padding[5]; ++ __u8 feature_enabled[8]; ++ __s16 feature_data[8][4]; ++}; ++ ++/** ++ * enum v4l2_vp9_intra_prediction_mode - VP9 Intra prediction modes ++ * ++ * @V4L2_VP9_INTRA_PRED_MODE_DC: DC intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_V: vertical intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_H: horizontal intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D45: D45 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D135: D135 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D117: D117 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D153: D153 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D207: D207 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D63: D63 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_TM: True Motion intra prediction ++ * ++ * See section '7.4.5 Intra frame mode info semantics' for more details. ++ */ ++enum v4l2_vp9_intra_prediction_mode { ++ V4L2_VP9_INTRA_PRED_MODE_DC, ++ V4L2_VP9_INTRA_PRED_MODE_V, ++ V4L2_VP9_INTRA_PRED_MODE_H, ++ V4L2_VP9_INTRA_PRED_MODE_D45, ++ V4L2_VP9_INTRA_PRED_MODE_D135, ++ V4L2_VP9_INTRA_PRED_MODE_D117, ++ V4L2_VP9_INTRA_PRED_MODE_D153, ++ V4L2_VP9_INTRA_PRED_MODE_D207, ++ V4L2_VP9_INTRA_PRED_MODE_D63, ++ V4L2_VP9_INTRA_PRED_MODE_TM, ++}; ++ ++/** ++ * struct v4l2_vp9_mv_probabilities - VP9 Motion vector probabilities ++ * @joint: motion vector joint probabilities ++ * @sign: motion vector sign probabilities ++ * @class: motion vector class probabilities ++ * @class0_bit: motion vector class0 bit probabilities ++ * @bits: motion vector bits probabilities ++ * @class0_fr: motion vector class0 fractional bit probabilities ++ * @fr: motion vector fractional bit probabilities ++ * @class0_hp: motion vector class0 high precision fractional bit probabilities ++ * @hp: motion vector high precision fractional bit probabilities ++ */ ++struct v4l2_vp9_mv_probabilities { ++ __u8 joint[3]; ++ __u8 sign[2]; ++ __u8 class[2][10]; ++ __u8 class0_bit[2]; ++ __u8 bits[2][10]; ++ __u8 class0_fr[2][2][3]; ++ __u8 fr[2][3]; ++ __u8 class0_hp[2]; ++ __u8 hp[2]; ++}; ++ ++/** ++ * struct v4l2_vp9_probabilities - VP9 Probabilities ++ * ++ * @tx8: TX 8x8 probabilities ++ * @tx16: TX 16x16 probabilities ++ * @tx32: TX 32x32 probabilities ++ * @coef: coefficient probabilities ++ * @skip: skip probabilities ++ * @inter_mode: inter mode probabilities ++ * @interp_filter: interpolation filter probabilities ++ * @is_inter: is inter-block probabilities ++ * @comp_mode: compound prediction mode probabilities ++ * @single_ref: single ref probabilities ++ * @comp_ref: compound ref probabilities ++ * @y_mode: Y prediction mode probabilities ++ * @uv_mode: UV prediction mode probabilities ++ * @partition: partition probabilities ++ * @mv: motion vector probabilities ++ * ++ * Structure containing most VP9 probabilities. See the VP9 specification ++ * for more details.
++ */ ++struct v4l2_vp9_probabilities { ++ __u8 tx8[2][1]; ++ __u8 tx16[2][2]; ++ __u8 tx32[2][3]; ++ __u8 coef[4][2][2][6][6][3]; ++ __u8 skip[3]; ++ __u8 inter_mode[7][3]; ++ __u8 interp_filter[4][2]; ++ __u8 is_inter[4]; ++ __u8 comp_mode[5]; ++ __u8 single_ref[5][2]; ++ __u8 comp_ref[5]; ++ __u8 y_mode[4][9]; ++ __u8 uv_mode[10][9]; ++ __u8 partition[16][3]; ++ ++ struct v4l2_vp9_mv_probabilities mv; ++}; ++ ++/** ++ * enum v4l2_vp9_reset_frame_context - Valid values for ++ * &v4l2_ctrl_vp9_frame_decode_params->reset_frame_context ++ * ++ * @V4L2_VP9_RESET_FRAME_CTX_NONE: don't reset any frame context ++ * @V4L2_VP9_RESET_FRAME_CTX_SPEC: reset the frame context pointed by ++ * &v4l2_ctrl_vp9_frame_decode_params.frame_context_idx ++ * @V4L2_VP9_RESET_FRAME_CTX_ALL: reset all frame contexts ++ * ++ * See section '7.2 Uncompressed header semantics' of the VP9 specification ++ * for more details. ++ */ ++enum v4l2_vp9_reset_frame_context { ++ V4L2_VP9_RESET_FRAME_CTX_NONE, ++ V4L2_VP9_RESET_FRAME_CTX_SPEC, ++ V4L2_VP9_RESET_FRAME_CTX_ALL, ++}; ++ ++/** ++ * enum v4l2_vp9_interpolation_filter - VP9 interpolation filter types ++ * ++ * @V4L2_VP9_INTERP_FILTER_8TAP: eight tap filter ++ * @V4L2_VP9_INTERP_FILTER_8TAP_SMOOTH: eight tap smooth filter ++ * @V4L2_VP9_INTERP_FILTER_8TAP_SHARP: eight tap sharp filter ++ * @V4L2_VP9_INTERP_FILTER_BILINEAR: bilinear filter ++ * @V4L2_VP9_INTERP_FILTER_SWITCHABLE: filter selection is signaled at the ++ * block level ++ * ++ * See section '7.2.7 Interpolation filter semantics' of the VP9 specification ++ * for more details. ++ */ ++enum v4l2_vp9_interpolation_filter { ++ V4L2_VP9_INTERP_FILTER_8TAP, ++ V4L2_VP9_INTERP_FILTER_8TAP_SMOOTH, ++ V4L2_VP9_INTERP_FILTER_8TAP_SHARP, ++ V4L2_VP9_INTERP_FILTER_BILINEAR, ++ V4L2_VP9_INTERP_FILTER_SWITCHABLE, ++}; ++ ++/** ++ * enum v4l2_vp9_reference_mode - VP9 reference modes ++ * ++ * @V4L2_VP9_REF_MODE_SINGLE: indicates that all the inter blocks use only a ++ * single reference frame to generate motion ++ * compensated prediction ++ * @V4L2_VP9_REF_MODE_COMPOUND: requires all the inter blocks to use compound ++ * mode. Single reference frame prediction is not ++ * allowed ++ * @V4L2_VP9_REF_MODE_SELECT: allows each individual inter block to select ++ * between single and compound prediction modes ++ * ++ * See section '7.3.6 Frame reference mode semantics' of the VP9 specification ++ * for more details. ++ */ ++enum v4l2_vp9_reference_mode { ++ V4L2_VP9_REF_MODE_SINGLE, ++ V4L2_VP9_REF_MODE_COMPOUND, ++ V4L2_VP9_REF_MODE_SELECT, ++}; ++ ++/** ++ * enum v4l2_vp9_tx_mode - VP9 TX modes ++ * ++ * @V4L2_VP9_TX_MODE_ONLY_4X4: transform size is 4x4 ++ * @V4L2_VP9_TX_MODE_ALLOW_8X8: transform size can be up to 8x8 ++ * @V4L2_VP9_TX_MODE_ALLOW_16X16: transform size can be up to 16x16 ++ * @V4L2_VP9_TX_MODE_ALLOW_32X32: transform size can be up to 32x32 ++ * @V4L2_VP9_TX_MODE_SELECT: bitstream contains transform size for each block ++ * ++ * See section '7.3.1 Tx mode semantics' of the VP9 specification for more ++ * details.
++ */ ++enum v4l2_vp9_tx_mode { ++ V4L2_VP9_TX_MODE_ONLY_4X4, ++ V4L2_VP9_TX_MODE_ALLOW_8X8, ++ V4L2_VP9_TX_MODE_ALLOW_16X16, ++ V4L2_VP9_TX_MODE_ALLOW_32X32, ++ V4L2_VP9_TX_MODE_SELECT, ++}; ++ ++/** ++ * enum v4l2_vp9_ref_id - VP9 Reference frame IDs ++ * ++ * @V4L2_REF_ID_LAST: last reference frame ++ * @V4L2_REF_ID_GOLDEN: golden reference frame ++ * @V4L2_REF_ID_ALTREF: alternative reference frame ++ * @V4L2_REF_ID_CNT: number of reference frames ++ * ++ * See section '7.4.12 Ref frames semantics' of the VP9 specification for more ++ * details. ++ */ ++enum v4l2_vp9_ref_id { ++ V4L2_REF_ID_LAST, ++ V4L2_REF_ID_GOLDEN, ++ V4L2_REF_ID_ALTREF, ++ V4L2_REF_ID_CNT, ++}; ++ ++/** ++ * enum v4l2_vp9_frame_flags - VP9 frame flags ++ * @V4L2_VP9_FRAME_FLAG_KEY_FRAME: the frame is a key frame ++ * @V4L2_VP9_FRAME_FLAG_SHOW_FRAME: the frame should be displayed ++ * @V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT: the decoding should be error resilient ++ * @V4L2_VP9_FRAME_FLAG_INTRA_ONLY: the frame does not reference other frames ++ * @V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV: the frame might use high precision ++ * motion vectors ++ * @V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX: frame context should be updated ++ * after decoding ++ * @V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE: parallel decoding is used ++ * @V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING: horizontal subsampling is enabled ++ * @V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING: vertical subsampling is enabled ++ * @V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING: full UV range is used ++ * ++ * Check the VP9 specification for more details. ++ */ ++enum v4l2_vp9_frame_flags { ++ V4L2_VP9_FRAME_FLAG_KEY_FRAME = 1 << 0, ++ V4L2_VP9_FRAME_FLAG_SHOW_FRAME = 1 << 1, ++ V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT = 1 << 2, ++ V4L2_VP9_FRAME_FLAG_INTRA_ONLY = 1 << 3, ++ V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV = 1 << 4, ++ V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX = 1 << 5, ++ V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE = 1 << 6, ++ V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING = 1 << 7, ++ V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING = 1 << 8, ++ V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING = 1 << 9, ++}; ++ ++#define V4L2_VP9_PROFILE_MAX 3 ++ ++/** ++ * struct v4l2_ctrl_vp9_frame_decode_params - VP9 frame decoding control ++ * ++ * @flags: combination of V4L2_VP9_FRAME_FLAG_* flags ++ * @compressed_header_size: compressed header size in bytes ++ * @uncompressed_header_size: uncompressed header size in bytes ++ * @profile: VP9 profile. Can be 0, 1, 2 or 3 ++ * @reset_frame_context: specifies whether the frame context should be reset ++ * to default values. See &v4l2_vp9_reset_frame_context ++ * for more details ++ * @frame_context_idx: frame context that should be used/updated ++ * @bit_depth: bits per component. Can be 8, 10 or 12. Note that not all ++ * profiles support 10 and/or 12 bits depths ++ * @interpolation_filter: specifies the filter selection used for performing ++ * inter prediction. See &v4l2_vp9_interpolation_filter ++ * for more details ++ * @tile_cols_log2: specifies the base 2 logarithm of the width of each tile ++ * (where the width is measured in units of 8x8 blocks). ++ * Shall be less than or equal to 6 ++ * @tile_rows_log2: specifies the base 2 logarithm of the height of each tile ++ * (where the height is measured in units of 8x8 blocks) ++ * @tx_mode: specifies the TX mode. See &v4l2_vp9_tx_mode for more details ++ * @reference_mode: specifies the type of inter prediction to be used. See ++ * &v4l2_vp9_reference_mode for more details ++ * @padding: needed to make this struct 64 bit aligned.
Shall be filled with ++ * zeros ++ * @frame_width_minus_1: add 1 to it and you'll get the frame width expressed ++ * in pixels ++ * @frame_height_minus_1: add 1 to it and you'll get the frame height expressed ++ * in pixels ++ * @render_width_minus_1: add 1 to it and you'll get the expected render width ++ * expressed in pixels. This is not used during the ++ * decoding process but might be used by HW scalers to ++ * prepare a frame that's ready for scanout ++ * @render_height_minus_1: add 1 to it and you'll get the expected render height ++ * expressed in pixels. This is not used during the ++ * decoding process but might be used by HW scalers to ++ * prepare a frame that's ready for scanout ++ * @refs: array of reference frames. See &v4l2_vp9_ref_id for more details ++ * @lf: loop filter parameters. See &v4l2_vp9_loop_filter for more details ++ * @quant: quantization parameters. See &v4l2_vp9_quantization for more details ++ * @seg: segmentation parameters. See &v4l2_vp9_segmentation for more details ++ * @probs: probabilities. See &v4l2_vp9_probabilities for more details ++ */ ++struct v4l2_ctrl_vp9_frame_decode_params { ++ __u32 flags; ++ __u16 compressed_header_size; ++ __u16 uncompressed_header_size; ++ __u8 profile; ++ __u8 reset_frame_context; ++ __u8 frame_context_idx; ++ __u8 bit_depth; ++ __u8 interpolation_filter; ++ __u8 tile_cols_log2; ++ __u8 tile_rows_log2; ++ __u8 tx_mode; ++ __u8 reference_mode; ++ __u8 padding[6]; ++ __u16 frame_width_minus_1; ++ __u16 frame_height_minus_1; ++ __u16 render_width_minus_1; ++ __u16 render_height_minus_1; ++ __u64 refs[V4L2_REF_ID_CNT]; ++ struct v4l2_vp9_loop_filter lf; ++ struct v4l2_vp9_quantization quant; ++ struct v4l2_vp9_segmentation seg; ++ struct v4l2_vp9_probabilities probs; ++}; ++ ++#define V4L2_VP9_NUM_FRAME_CTX 4 ++ ++/** ++ * struct v4l2_ctrl_vp9_frame_ctx - VP9 frame context control ++ * ++ * @probs: VP9 probabilities ++ * ++ * This control is accessed in both directions. The user should initialize the ++ * 4 contexts with default values just after starting the stream. Then before ++ * decoding a frame it should query the current frame context (the one passed ++ * through &v4l2_ctrl_vp9_frame_decode_params.frame_context_idx) to initialize ++ * &v4l2_ctrl_vp9_frame_decode_params.probs. The probs are then adjusted based ++ * on the bitstream info and passed to the kernel. The codec should update ++ * the frame context after the frame has been decoded, so that next time ++ * userspace queries this context it contains the updated probabilities.
++ */ ++struct v4l2_ctrl_vp9_frame_ctx { ++ struct v4l2_vp9_probabilities probs; ++}; ++ ++#endif /* _VP9_CTRLS_H_ */ +diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c +index fd0bab14a2..434f905c62 100644 +--- a/libavcodec/vp9.c ++++ b/libavcodec/vp9.c +@@ -191,6 +191,7 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + \ + CONFIG_VP9_D3D11VA_HWACCEL * 2 + \ + CONFIG_VP9_NVDEC_HWACCEL + \ ++ CONFIG_VP9_V4L2REQUEST_HWACCEL + \ + CONFIG_VP9_VAAPI_HWACCEL + \ + CONFIG_VP9_VDPAU_HWACCEL) + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts; +@@ -223,6 +224,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #endif + #if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ *fmtp++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV420P12: +@@ -231,6 +235,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #endif + #if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ *fmtp++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + } +@@ -700,7 +707,8 @@ static int decode_frame_header(AVCodecContext *avctx, + get_bits(&s->gb, 8) : 255; + } + +- if (get_bits1(&s->gb)) { ++ s->s.h.segmentation.update_data = get_bits1(&s->gb); ++ if (s->s.h.segmentation.update_data) { + s->s.h.segmentation.absolute_vals = get_bits1(&s->gb); + for (i = 0; i < 8; i++) { + if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb))) +@@ -1909,6 +1917,9 @@ AVCodec ff_vp9_decoder = { + #endif + #if CONFIG_VP9_VDPAU_HWACCEL + HWACCEL_VDPAU(vp9), ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(vp9), + #endif + NULL + }, +diff --git a/libavcodec/vp9shared.h b/libavcodec/vp9shared.h +index 54726df742..fee3568736 100644 +--- a/libavcodec/vp9shared.h ++++ b/libavcodec/vp9shared.h +@@ -131,6 +131,7 @@ typedef struct VP9BitstreamHeader { + uint8_t temporal; + uint8_t absolute_vals; + uint8_t update_map; ++ uint8_t update_data; + uint8_t prob[7]; + uint8_t pred_prob[3]; + struct { +diff --git a/libavdevice/Makefile b/libavdevice/Makefile +index 6ea62b914e..19f7f5353c 100644 +--- a/libavdevice/Makefile ++++ b/libavdevice/Makefile +@@ -45,6 +45,8 @@ OBJS-$(CONFIG_SNDIO_INDEV) += sndio_dec.o sndio.o + OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o + OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o + OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o ++OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o ++OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o + OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o + OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o + OBJS-$(CONFIG_XV_OUTDEV) += xv.o +diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c +index 8633433254..1df47be492 100644 +--- a/libavdevice/alldevices.c ++++ b/libavdevice/alldevices.c +@@ -52,6 +52,8 @@ extern AVOutputFormat ff_sndio_muxer; + extern AVInputFormat ff_v4l2_demuxer; + extern AVOutputFormat ff_v4l2_muxer; + extern AVInputFormat ff_vfwcap_demuxer; ++extern AVOutputFormat ff_vout_drm_muxer; ++extern AVOutputFormat ff_vout_rpi_muxer; + extern AVInputFormat ff_xcbgrab_demuxer; + extern AVOutputFormat ff_xv_muxer; + +diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c +new file mode 100644 +index 0000000000..8f93619651 +--- /dev/null ++++ b/libavdevice/drm_vout.c +@@ -0,0 +1,608 @@ ++/* ++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++// *** This module is a work in progress and its utility is strictly ++// limited to testing. ++// Amongst other issues it doesn't wait for the pic to be displayed before ++// returning the buffer so flickering does occur. ++ ++#include "libavutil/opt.h" ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/imgutils.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include "pthread.h" ++#include <semaphore.h> ++#include <unistd.h> ++ ++#include "drm_fourcc.h" ++#include <drm.h> ++#include <drm_mode.h> ++#include <xf86drm.h> ++#include <xf86drmMode.h> ++ ++#include "libavutil/rpi_sand_fns.h" ++ ++#define TRACE_ALL 0 ++ ++#define NUM_BUFFERS 4 ++#define RPI_DISPLAY_ALL 0 ++ ++#define DRM_MODULE "vc4" ++ ++#define ERRSTR strerror(errno) ++ ++struct drm_setup { ++ int conId; ++ uint32_t crtcId; ++ int crtcIdx; ++ uint32_t planeId; ++ unsigned int out_fourcc; ++ struct { ++ int x, y, width, height; ++ } compose; ++}; ++ ++typedef struct drm_aux_s { ++ int fd; ++ uint32_t bo_handles[4]; ++ unsigned int fb_handle; ++} drm_aux_t; ++ ++typedef struct drm_display_env_s ++{ ++ AVClass *class; ++ ++ int drm_fd; ++ uint32_t con_id; ++ struct drm_setup setup; ++ enum AVPixelFormat avfmt; ++ ++ drm_aux_t aux[32]; ++ ++ pthread_t q_thread; ++ pthread_mutex_t q_lock; ++ sem_t q_sem; ++ int q_terminate; ++ AVFrame * q_this; ++ AVFrame * q_next; ++ ++} drm_display_env_t; ++ ++ ++static int drm_vout_write_trailer(AVFormatContext *s) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ return 0; ++} ++ ++static int drm_vout_write_header(AVFormatContext *s) ++{ ++ const AVCodecParameters * const par = s->streams[0]->codecpar; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++ ++static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * const frame) ++{ ++ int ret = 0; ++ ++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; ++ drm_aux_t * da = NULL; ++ unsigned int i; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ ++ for (i = 0; i != 32; ++i) { ++ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { ++ da = de->aux + i; ++ break; ++ } ++ } ++ ++ if (da == NULL) { ++ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ ++ if (da->fd == -1) { ++ uint32_t pitches[4] = {0}; ++ uint32_t offsets[4] = {0}; ++ uint64_t modifiers[4] = {0}; ++ uint32_t bo_plane_handles[4] = {0}; ++ int i, j, n; ++ ++ for (i = 0; i <
desc->nb_objects; ++i) { ++ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) { ++ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle failed: %s\n", ERRSTR); ++ return -1; ++ } ++ } ++ ++ n = 0; ++ for (i = 0; i < desc->nb_layers; ++i) { ++ for (j = 0; j < desc->layers[i].nb_planes; ++j) { ++ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; ++ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; ++ pitches[n] = p->pitch; ++ offsets[n] = p->offset; ++ modifiers[n] = obj->format_modifier; ++ bo_plane_handles[n] = da->bo_handles[p->object_index]; ++ ++n; ++ } ++ } ++ ++#if 0 ++ av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," ++ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, ++ bo_plane_handles[0], ++ bo_plane_handles[1], ++ bo_plane_handles[2], ++ bo_plane_handles[3], ++ pitches[0], ++ pitches[1], ++ pitches[2], ++ pitches[3], ++ offsets[0], ++ offsets[1], ++ offsets[2], ++ offsets[3], ++ (long long)modifiers[0], ++ (long long)modifiers[1], ++ (long long)modifiers[2], ++ (long long)modifiers[3] ++ ); ++#endif ++ ++ if (drmModeAddFB2WithModifiers(de->drm_fd, ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, bo_plane_handles, ++ pitches, offsets, modifiers, ++ &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) { ++ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ da->fd = desc->objects[0].fd; ++ } ++ ++ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId, ++ da->fb_handle, 0, ++ de->setup.compose.x, de->setup.compose.y, ++ de->setup.compose.width, ++ de->setup.compose.height, ++ 0, 0, ++ av_frame_cropped_width(frame) << 16, ++ av_frame_cropped_height(frame) << 16); ++ ++ if (ret != 0) { ++ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR); ++ } ++ ++ return ret; ++} ++ ++static void * display_thread(void * v) ++{ ++ AVFormatContext * const s = v; ++ drm_display_env_t * const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ ++ for (;;) { ++ AVFrame * frame; ++ ++ while (sem_wait(&de->q_sem) != 0) { ++ av_assert0(errno == EINTR); ++ } ++ ++ if (de->q_terminate) ++ break; ++ ++ pthread_mutex_lock(&de->q_lock); ++ frame = de->q_next; ++ de->q_next = NULL; ++ pthread_mutex_unlock(&de->q_lock); ++ ++ do_display(s, de, frame); ++ ++ av_frame_free(&de->q_this); ++ de->q_this = frame; ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++#endif ++ ++ return NULL; ++} ++ ++static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ const AVFrame * const src_frame = (AVFrame *)pkt->data; ++ AVFrame * frame; ++ drm_display_env_t * const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { ++ frame = av_frame_alloc(); ++ av_frame_ref(frame, src_frame); ++ } ++ else if (src_frame->format == AV_PIX_FMT_VAAPI) { ++ frame = av_frame_alloc(); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (av_hwframe_map(frame, src_frame, 0) != 0) ++ { ++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRIME\n", src_frame->format); ++ av_frame_free(&frame); ++ return AVERROR(EINVAL); ++ } ++ } ++ else { ++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRIME\n", src_frame->format); ++ return
AVERROR(EINVAL); ++ } ++ ++ ++ pthread_mutex_lock(&de->q_lock); ++ { ++ AVFrame * const t = de->q_next; ++ de->q_next = frame; ++ frame = t; ++ } ++ pthread_mutex_unlock(&de->q_lock); ++ ++ if (frame == NULL) ++ sem_post(&de->q_sem); ++ else ++ av_frame_free(&frame); ++ ++ return 0; ++} ++ ++static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++#endif ++ ++ /* drm_vout_write_header() should have accepted only supported formats */ ++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) ++ return 0; ++ ++ return 0; ++} ++ ++static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId) ++{ ++ int ret = -1; ++ int i; ++ drmModeRes *res = drmModeGetResources(drmfd); ++ drmModeConnector *c; ++ ++ if(!res) ++ { ++ printf( "drmModeGetResources failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ if (res->count_crtcs <= 0) ++ { ++ printf( "drm: no crtcs\n"); ++ goto fail_res; ++ } ++ ++ if (!s->conId) { ++ fprintf(stderr, ++ "No connector ID specified. Choosing default from list:\n"); ++ ++ for (i = 0; i < res->count_connectors; i++) { ++ drmModeConnector *con = ++ drmModeGetConnector(drmfd, res->connectors[i]); ++ drmModeEncoder *enc = NULL; ++ drmModeCrtc *crtc = NULL; ++ ++ if (con->encoder_id) { ++ enc = drmModeGetEncoder(drmfd, con->encoder_id); ++ if (enc->crtc_id) { ++ crtc = drmModeGetCrtc(drmfd, enc->crtc_id); ++ } ++ } ++ ++ if (!s->conId && crtc) { ++ s->conId = con->connector_id; ++ s->crtcId = crtc->crtc_id; ++ } ++ ++ av_log(avctx, AV_LOG_INFO, "Connector %d (crtc %d): type %d, %dx%d%s\n", ++ con->connector_id, ++ crtc ? crtc->crtc_id : 0, ++ con->connector_type, ++ crtc ? crtc->width : 0, ++ crtc ? crtc->height : 0, ++ (s->conId == (int)con->connector_id ?
++ " (chosen)" : "")); ++ } ++ ++ if (!s->conId) { ++ av_log(avctx, AV_LOG_ERROR, ++ "No suitable enabled connector found.\n"); ++ return -1;; ++ } ++ } ++ ++ s->crtcIdx = -1; ++ ++ for (i = 0; i < res->count_crtcs; ++i) { ++ if (s->crtcId == res->crtcs[i]) { ++ s->crtcIdx = i; ++ break; ++ } ++ } ++ ++ if (s->crtcIdx == -1) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId); ++ goto fail_res; ++ } ++ ++ if (res->count_connectors <= 0) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n"); ++ goto fail_res; ++ } ++ ++ c = drmModeGetConnector(drmfd, s->conId); ++ if (!c) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR); ++ goto fail_res; ++ } ++ ++ if (!c->count_modes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n"); ++ goto fail_conn; ++ } ++ ++ { ++ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId); ++ s->compose.x = crtc->x; ++ s->compose.y = crtc->y; ++ s->compose.width = crtc->width; ++ s->compose.height = crtc->height; ++ drmModeFreeCrtc(crtc); ++ } ++ ++ if (pConId) ++ *pConId = c->connector_id; ++ ret = 0; ++ ++fail_conn: ++ drmModeFreeConnector(c); ++ ++fail_res: ++ drmModeFreeResources(res); ++ ++ return ret; ++} ++ ++static int find_plane(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s) ++{ ++ drmModePlaneResPtr planes; ++ drmModePlanePtr plane; ++ unsigned int i; ++ unsigned int j; ++ int ret = 0; ++ ++ planes = drmModeGetPlaneResources(drmfd); ++ if (!planes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ for (i = 0; i < planes->count_planes; ++i) { ++ plane = drmModeGetPlane(drmfd, planes->planes[i]); ++ if (!planes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR); ++ break; ++ } ++ ++ if (!(plane->possible_crtcs & (1 << s->crtcIdx))) { ++ drmModeFreePlane(plane); ++ continue; ++ } ++ ++ for (j = 0; j < plane->count_formats; ++j) { ++ if (plane->formats[j] == s->out_fourcc) ++ break; ++ } ++ ++ if (j == plane->count_formats) { ++ drmModeFreePlane(plane); ++ continue; ++ } ++ ++ s->planeId = plane->plane_id; ++ drmModeFreePlane(plane); ++ break; ++ } ++ ++ if (i == planes->count_planes) ++ ret = -1; ++ ++ drmModeFreePlaneResources(planes); ++ return ret; ++} ++ ++// deinit is called if init fails so no need to clean up explicity here ++static int drm_vout_init(struct AVFormatContext * s) ++{ ++ drm_display_env_t * const de = s->priv_data; ++ unsigned int i; ++ ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++ ++ de->drm_fd = -1; ++ de->con_id = 0; ++ de->setup = (struct drm_setup){0}; ++ ++ de->setup.out_fourcc = DRM_FORMAT_NV12; // **** Need some sort of select ++ ++ for (i = 0; i != 32; ++i) { ++ de->aux[i].fd = -1; ++ } ++ ++ if ((de->drm_fd = drmOpen(DRM_MODULE, NULL)) < 0) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s\n", DRM_MODULE); ++ return -1; ++ } ++ ++ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) ++ { ++ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n"); ++ return -1; ++ } ++ ++ if (find_plane(s, de->drm_fd, &de->setup) != 0) ++ { ++ av_log(s, AV_LOG_ERROR, "failed to find compatible plane\n"); ++ return -1; ++ } ++ ++ de->q_terminate = 0; ++ pthread_mutex_init(&de->q_lock, NULL); ++ sem_init(&de->q_sem, 0, 0); ++ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0); ++ ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++ ++ return 0; ++} ++ ++static void drm_vout_deinit(struct AVFormatContext * s) ++{ ++ 
drm_display_env_t * const de = s->priv_data; ++ ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++ ++ de->q_terminate = 1; ++ sem_post(&de->q_sem); ++ pthread_join(de->q_thread, NULL); ++ sem_destroy(&de->q_sem); ++ pthread_mutex_destroy(&de->q_lock); ++ ++ av_frame_free(&de->q_next); ++ av_frame_free(&de->q_this); ++ ++ if (de->drm_fd >= 0) { ++ close(de->drm_fd); ++ de->drm_fd = -1; ++ } ++ ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++} ++ ++ ++#define OFFSET(x) offsetof(drm_display_env_t, x) ++static const AVOption options[] = { ++#if 0 ++ { "display_name", "set display name", OFFSET(display_name), AV_OPT_TYPE_STRING, {.str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_id", "set existing window id", OFFSET(window_id), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_title", "set window title", OFFSET(window_title), AV_OPT_TYPE_STRING, {.str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++#endif ++ { NULL } ++ ++}; ++ ++static const AVClass drm_vout_class = { ++ .class_name = "drm vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_drm_muxer = { ++ .name = "vout_drm", ++ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"), ++ .priv_data_size = sizeof(drm_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = drm_vout_write_header, ++ .write_packet = drm_vout_write_packet, ++ .write_uncoded_frame = drm_vout_write_frame, ++ .write_trailer = drm_vout_write_trailer, ++ .control_message = drm_vout_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &drm_vout_class, ++ .init = drm_vout_init, ++ .deinit = drm_vout_deinit, ++}; +diff --git a/libavdevice/rpi_vout.c b/libavdevice/rpi_vout.c +new file mode 100644 +index 0000000000..60fe8a7075 +--- /dev/null ++++ b/libavdevice/rpi_vout.c +@@ -0,0 +1,534 @@ ++/* ++ * Copyright (c) 2013 Jeff Moguillansky ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
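The drm_vout_write_packet()/display_thread() pair above is a single-slot, latest-frame-wins mailbox: the writer swaps the new frame into q_next and posts the semaphore only when the slot was previously empty, a superseded frame is simply dropped, and the display thread keeps the previously shown frame alive in q_this because the DRM plane is still scanning it out until the next framebuffer replaces it. A minimal standalone sketch of the same hand-off pattern (the names here are illustrative, not taken from the patch):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stddef.h>

    typedef struct mailbox {
        pthread_mutex_t lock;
        sem_t sem;
        void *next;   /* most recent undisplayed item, or NULL */
    } mailbox_t;

    /* Producer: overwrite the slot; wake the consumer only on empty -> full */
    static void mailbox_put(mailbox_t *mb, void *item, void (*drop)(void *))
    {
        void *old;
        pthread_mutex_lock(&mb->lock);
        old = mb->next;
        mb->next = item;
        pthread_mutex_unlock(&mb->lock);
        if (old == NULL)
            sem_post(&mb->sem);  /* exactly one post per empty->full edge */
        else
            drop(old);           /* slot was full: the older item loses */
    }

    /* Consumer: wait for a fill, then empty the slot */
    static void *mailbox_get(mailbox_t *mb)
    {
        void *item;
        while (sem_wait(&mb->sem) != 0)
            ;                    /* retry on EINTR, as the patch does */
        pthread_mutex_lock(&mb->lock);
        item = mb->next;
        mb->next = NULL;
        pthread_mutex_unlock(&mb->lock);
        return item;
    }

Because the slot holds at most one item, the semaphore count can never run ahead of the takes, which is why the patch gets away with a bare semaphore rather than a counted queue.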
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * XVideo output device ++ * ++ * TODO: ++ * - add support to more formats ++ */ ++ ++#include "libavutil/opt.h" ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/imgutils.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include ++ ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#pragma GCC diagnostic pop ++#include "libavutil/rpi_sand_fns.h" ++#include "libavcodec/rpi_zc.h" ++ ++#define TRACE_ALL 0 ++ ++#define RPI_DISPLAY_ALL 0 ++#define DISPLAY_PORT_DEPTH 4 ++ ++typedef struct rpi_display_env_s ++{ ++ AVClass *class; ++ ++ MMAL_COMPONENT_T* display; ++ MMAL_COMPONENT_T* isp; ++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup ++ MMAL_CONNECTION_T * conn; ++ ++ MMAL_POOL_T *rpi_pool; ++ volatile int rpi_display_count; ++ ++ MMAL_FOURCC_T req_fmt; ++ MMAL_VIDEO_FORMAT_T req_vfmt; ++ ++ AVZcEnvPtr zc; ++ ++ int window_width, window_height; ++ int window_x, window_y; ++ int layer, fullscreen; ++} rpi_display_env_t; ++ ++ ++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { ++ mmal_buffer_header_release(buffer); ++} ++ ++static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) { ++ mmal_buffer_header_release(buffer); ++} ++ ++ ++static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt) ++{ ++ switch (fmt) { ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ return MMAL_ENCODING_YUVUV128; ++ case AV_PIX_FMT_RPI4_10: ++ return MMAL_ENCODING_YUV10_COL; ++ case AV_PIX_FMT_SAND64_10: ++ return MMAL_ENCODING_YUVUV64_10; ++ case AV_PIX_FMT_SAND64_16: ++ return MMAL_ENCODING_YUVUV64_16; ++ case AV_PIX_FMT_YUV420P: ++ return MMAL_ENCODING_I420; ++ ++ default: ++ break; ++ } ++ return 0; ++} ++ ++ ++static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt, ++ const AVFrame * const frame, const AVRpiZcRefPtr fr_ref) ++{ ++ MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video; ++ const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref); ++ if (av_rpi_is_sand_format(geo->format)) { ++ // Sand formats are a bit "special" ++ // stride1 implicit in format ++ // width = stride2 ++ vfmt->width = geo->stripe_is_yc ? 
++ geo->height_y + geo->height_c : geo->height_y; ++// es->height = geo->video_height; //*** When we get the FLAG this will change ++ vfmt->height = geo->height_y; ++ es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE; ++ } ++ else { ++ vfmt->width = geo->stride_y / geo->bytes_per_pel; ++ vfmt->height = geo->height_y; ++ es_fmt->flags = 0; ++ } ++ ++ es_fmt->type = MMAL_ES_TYPE_VIDEO; ++ es_fmt->encoding = mmfmt_from_avfmt(geo->format); ++ es_fmt->encoding_variant = 0; ++ es_fmt->bitrate = 0; ++ ++ vfmt->crop.x = frame->crop_left; ++ vfmt->crop.y = frame->crop_top; ++ vfmt->crop.width = av_frame_cropped_width(frame); ++ vfmt->crop.height = av_frame_cropped_height(frame); ++ ++ vfmt->frame_rate.den = 0; // Don't think I know it here ++ vfmt->frame_rate.num = 0; ++ ++ vfmt->par.den = frame->sample_aspect_ratio.den; ++ vfmt->par.num = frame->sample_aspect_ratio.num; ++ ++ vfmt->color_space = 0; // Unknown currently ++} ++ ++static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata) ++{ ++ rpi_display_env_t * const de = userdata; ++ if (buf->user_data != NULL) { ++ av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data); ++ buf->user_data = NULL; ++ } ++ atomic_fetch_add(&de->rpi_display_count, -1); ++ return MMAL_FALSE; ++} ++ ++static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt) ++{ ++ return avfmt == AV_PIX_FMT_SAND64_10; ++} ++ ++static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de) ++{ ++ if (de->isp != NULL) ++ { ++ if (de->isp->input[0]->is_enabled) ++ mmal_port_disable(de->isp->input[0]); ++ if (de->isp->control->is_enabled) ++ mmal_port_disable(de->isp->control); ++ } ++ if (de->conn != NULL) { ++ mmal_connection_destroy(de->conn); ++ de->conn = NULL; ++ } ++ if (de->isp != NULL) { ++ mmal_component_destroy(de->isp); ++ de->isp = NULL; ++ } ++} ++ ++static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) ++{ ++ MMAL_BUFFER_HEADER_T* buf = NULL; ++ AVRpiZcRefPtr fr_buf = NULL; ++ ++ if (de == NULL) ++ return; ++ ++ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { ++ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); ++ return; ++ } ++ ++ if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) { ++ return; ++ } ++ ++ buf = mmal_queue_get(de->rpi_pool->queue); ++ if (!buf) { ++ // Running too fast so drop the frame (unexpected) ++ goto fail; ++ } ++ ++ buf->cmd = 0; ++ buf->offset = 0; ++ buf->flags = 0; ++ mmal_buffer_header_reset(buf); ++ ++ atomic_fetch_add(&de->rpi_display_count, 1); // Deced on release ++ mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de); ++ ++ buf->user_data = fr_buf; ++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++ ++#if RPI_DISPLAY_ALL ++ while (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { ++ usleep(5000); ++ } ++#endif ++ ++ { ++ MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}}; ++ MMAL_ES_FORMAT_T new_es = {.es = &new_ess}; ++ MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video; ++ ++ video_format_from_zc_frame(&new_es, fr, fr_buf); ++ if (de->req_fmt != new_es.encoding || ++ de->req_vfmt.width != new_vfmt->width || ++ de->req_vfmt.height != new_vfmt->height || ++ de->req_vfmt.crop.x != new_vfmt->crop.x || ++ de->req_vfmt.crop.y != new_vfmt->crop.y || ++ de->req_vfmt.crop.width != new_vfmt->crop.width 
|| ++ de->req_vfmt.crop.height != new_vfmt->crop.height) { ++ // Something has changed ++ ++ // If we have an ISP tear it down ++ isp_remove(s, de); ++ de->port_in = de->display->input[0]; ++ ++ // If we still need an ISP create it now ++ if (avfmt_needs_isp(fr->format)) ++ { ++ if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "ISP creation failed\n"); ++ goto fail; ++ } ++ de->port_in = de->isp->input[0]; ++ } ++ ++ mmal_format_copy(de->port_in->format, &new_es); ++ ++ if (mmal_port_format_commit(de->port_in)) { ++ av_log(s, AV_LOG_ERROR, "Failed to commit input format\n"); ++ goto fail; ++ } ++ ++ // If we have an ISP then we must want to use it ++ if (de->isp != NULL) { ++ MMAL_PORT_T * const port_out = de->isp->output[0]; ++ MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video; ++ MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video; ++ ++ port_out->format->type = MMAL_ES_TYPE_VIDEO; ++ port_out->format->encoding = MMAL_ENCODING_YUVUV128; ++ port_out->format->encoding_variant = 0; ++ port_out->format->bitrate = 0; ++ port_out->format->flags = 0; ++ port_out->format->extradata = NULL; ++ port_out->format->extradata_size = 0; ++ ++ vfmt_out->width = (vfmt_in->crop.width + 31) & ~31; ++ vfmt_out->height = (vfmt_in->crop.height + 15) & ~15; ++ vfmt_out->crop.x = 0; ++ vfmt_out->crop.y = 0; ++ vfmt_out->crop.width = vfmt_in->crop.width; ++ vfmt_out->crop.height = vfmt_in->crop.height; ++ vfmt_out->frame_rate = vfmt_in->frame_rate; ++ vfmt_out->par = vfmt_in->par; ++ vfmt_out->color_space = vfmt_in->color_space; ++ ++ if (mmal_port_format_commit(port_out)) { ++ av_log(s, AV_LOG_ERROR, "Failed to commit output format\n"); ++ goto fail; ++ } ++ ++ if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to create connection\n"); ++ goto fail; ++ } ++ if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to enable connection\n"); ++ goto fail; ++ } ++ mmal_port_enable(de->isp->control,display_cb_control); ++ mmal_component_enable(de->isp); ++ } ++ ++ // Number of slots in my port Q ++ de->port_in->buffer_num = DISPLAY_PORT_DEPTH; ++ // Size to keep it happy - isn't used for anything other than error checking ++ de->port_in->buffer_size = buf->alloc_size; ++ if (!de->port_in->is_enabled) ++ { ++ mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? 
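The ISP output dimensions set just above use the standard power-of-two alignment idiom: (x + (2^k - 1)) & ~(2^k - 1) is the smallest multiple of 2^k that is >= x, so the ISP output buffer is 32-pixel aligned in width and 16-pixel aligned in height while the crop rectangle preserves the real picture size. A quick self-checking example of the arithmetic:

    #include <assert.h>

    /* Round x up to a multiple of 2^k, where m = 2^k - 1 */
    static unsigned int round_up(unsigned int x, unsigned int m)
    {
        return (x + m) & ~m;
    }

    int main(void)
    {
        assert(round_up(1920, 31) == 1920);  /* already 32-aligned */
        assert(round_up(1080, 15) == 1088);  /* 1080 -> next multiple of 16 */
        assert(round_up(1081, 15) == 1088);
        return 0;
    }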
++ if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to enable input port\n"); ++ goto fail; ++ } ++ } ++ ++ de->req_fmt = new_es.encoding; ++ de->req_vfmt = *new_vfmt; ++ } ++ } ++ ++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); ++ goto fail; ++ } ++ return; ++ ++fail: ++ // If we have a buf then fr_buf is held by that ++ if (buf != NULL) ++ mmal_buffer_header_release(buf); ++ else if (fr_buf != NULL) ++ av_rpi_zc_unref(fr_buf); ++} ++ ++ ++static int xv_write_trailer(AVFormatContext *s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ if (de->port_in != NULL && de->port_in->is_enabled) { ++ mmal_port_disable(de->port_in); ++ } ++ ++ // The above disable should kick out all buffers - check that ++ if (atomic_load(&de->rpi_display_count) != 0) { ++ av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count)); ++ } ++ ++ isp_remove(s, de); ++ if (de->rpi_pool != NULL) { ++ mmal_pool_destroy(de->rpi_pool); ++ de->rpi_pool = NULL; ++ } ++ if (de->display != NULL) { ++ mmal_component_destroy(de->display); ++ de->display = NULL; ++ } ++ ++ return 0; ++} ++ ++static int xv_write_header(AVFormatContext *s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ const AVCodecParameters * const par = s->streams[0]->codecpar; ++ const unsigned int w = de->window_width ? de->window_width : par->width; ++ const unsigned int h = de->window_height ? de->window_height : par->height; ++ const unsigned int x = de->window_x; ++ const unsigned int y = de->window_y; ++ const int layer = de->layer ? de->layer : 2; ++ const MMAL_BOOL_T fullscreen = de->fullscreen; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ { ++ MMAL_DISPLAYREGION_T region = ++ { ++ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, ++ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | ++ MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA, ++ .layer = layer, ++ .fullscreen = fullscreen, ++ .dest_rect = {x, y, w, h}, ++ .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS, ++ }; ++ ++ bcm_host_init(); // Needs to be done by someone... 
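Note that display_frame() above applies backpressure by dropping rather than blocking: if DISPLAY_PORT_DEPTH - 1 buffers are already queued to the renderer it logs "Frame dropped" and returns, with the in-flight count carried in an atomic that the MMAL pre-release callback decrements. Reduced to its essentials, the admission-control shape is roughly this (illustrative names, not from the patch):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define MAX_IN_FLIGHT 3   /* DISPLAY_PORT_DEPTH - 1 in the patch */

    static atomic_int in_flight;

    /* Called before queueing a buffer; false means "drop this frame" */
    static bool submit_admit(void)
    {
        if (atomic_load(&in_flight) >= MAX_IN_FLIGHT)
            return false;                 /* display is behind: drop, don't block */
        atomic_fetch_add(&in_flight, 1);  /* matched by -1 in the release callback */
        return true;
    }

    /* Buffer pre-release callback: one decrement per buffer returned */
    static void on_buffer_released(void)
    {
        atomic_fetch_add(&in_flight, -1);
    }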
++ ++ if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to create display component\n"); ++ goto fail; ++ } ++ de->port_in = de->display->input[0]; ++ ++ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); ++ ++ if (mmal_component_enable(de->display) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to enable display component\n"); ++ goto fail; ++ } ++ if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n"); ++ goto fail; ++ } ++ ++ if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to create pool\n"); ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ xv_write_trailer(s); ++ return AVERROR_UNKNOWN; ++} ++ ++static int xv_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ AVFrame * const frame = (AVFrame *)pkt->data; ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ display_frame(s, s->priv_data, frame); ++ return 0; ++} ++ ++static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++#endif ++ ++ /* xv_write_header() should have accepted only supported formats */ ++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) ++ return 0; ++// return write_picture(s, (*frame)->data, (*frame)->linesize); ++ ++ display_frame(s, s->priv_data, *ppframe); ++ return 0; ++} ++ ++static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++// deinit is called if init fails so no need to clean up explicity here ++static int rpi_vout_init(struct AVFormatContext * s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ ++ // Get a ZC context in case we need one - has little overhead if unused ++ if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL) ++ return 1; ++ ++ return 0; ++} ++ ++static void rpi_vout_deinit(struct AVFormatContext * s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ ++ av_rpi_zc_int_env_freep(&de->zc); ++} ++ ++ ++#define OFFSET(x) offsetof(rpi_display_env_t, x) ++static const AVOption options[] = { ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { NULL } ++ ++}; ++ ++static const AVClass xv_class = { ++ .class_name = "rpi vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_rpi_muxer = { ++ .name = "vout_rpi", ++ .long_name = NULL_IF_CONFIG_SMALL("Rpi 
(mmal) video output device"), ++ .priv_data_size = sizeof(rpi_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = xv_write_header, ++ .write_packet = xv_write_packet, ++ .write_uncoded_frame = xv_write_frame, ++ .write_trailer = xv_write_trailer, ++ .control_message = xv_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &xv_class, ++ .init = rpi_vout_init, ++ .deinit = rpi_vout_deinit, ++}; +diff --git a/libavfilter/Makefile b/libavfilter/Makefile +index 5123540653..17ccea3150 100644 +--- a/libavfilter/Makefile ++++ b/libavfilter/Makefile +@@ -434,6 +434,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) += vf_transpose_opencl.o opencl.o o + OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o + OBJS-$(CONFIG_TRIM_FILTER) += trim.o + OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o ++OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o + OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o + OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ + opencl/unsharp.o +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 1183e40267..2f569057dd 100644 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -414,6 +414,7 @@ extern AVFilter ff_vf_transpose_opencl; + extern AVFilter ff_vf_transpose_vaapi; + extern AVFilter ff_vf_trim; + extern AVFilter ff_vf_unpremultiply; ++extern AVFilter ff_vf_unsand; + extern AVFilter ff_vf_unsharp; + extern AVFilter ff_vf_unsharp_opencl; + extern AVFilter ff_vf_untile; +diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c +index 2fe4f0b0f9..5a8e6b3f24 100644 +--- a/libavfilter/avfiltergraph.c ++++ b/libavfilter/avfiltergraph.c +@@ -32,6 +32,9 @@ + #include "libavutil/internal.h" + #include "libavutil/opt.h" + #include "libavutil/pixdesc.h" ++#if CONFIG_UNSAND_FILTER ++#include "libavutil/rpi_sand_fns.h" ++#endif + + #define FF_INTERNAL_FIELDS 1 + #include "framequeue.h" +@@ -429,6 +432,19 @@ static int can_merge_formats(AVFilterFormats *a_arg, + } + } + ++#if CONFIG_UNSAND_FILTER ++static int has_sand_format(const AVFilterFormats * const ff) ++{ ++ int i; ++ for (i = 0; i != ff->nb_formats; ++i) { ++ if (av_rpi_is_sand_format(ff->formats[i])) { ++ return 1; ++ } ++ } ++ return 0; ++} ++#endif ++ + /** + * Perform one round of query_formats() and merging formats lists on the + * filter graph. +@@ -469,6 +485,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + for (j = 0; j < filter->nb_inputs; j++) { + AVFilterLink *link = filter->inputs[j]; + int convert_needed = 0; ++ unsigned int extra_convert_tried = 0; + + if (!link) + continue; +@@ -516,11 +533,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + ) + #undef MERGE_DISPATCH + +- if (convert_needed) { ++ while (convert_needed) { + AVFilterContext *convert; + const AVFilter *filter; + AVFilterLink *inlink, *outlink; + char inst_name[30]; ++ int can_retry = 0; ++ ++ convert_needed = 0; + + if (graph->disable_auto_convert) { + av_log(log_ctx, AV_LOG_ERROR, +@@ -533,19 +553,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + /* couldn't merge format lists. 
auto-insert conversion filter */ + switch (link->type) { + case AVMEDIA_TYPE_VIDEO: +- if (!(filter = avfilter_get_by_name("scale"))) { +- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " +- "not present, cannot convert pixel formats.\n"); +- return AVERROR(EINVAL); +- } +- +- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", +- scaler_count++); ++#if CONFIG_UNSAND_FILTER ++ // Only try each extra conversion once ++ // The unsand output pad should never trigger has_sand_format ++ // but it is better to be safe ++ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) { ++ if (!(filter = avfilter_get_by_name("unsand"))) { ++ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter " ++ "not present, cannot convert pixel formats.\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d", ++ scaler_count++); ++ ++ if ((ret = avfilter_graph_create_filter(&convert, filter, ++ inst_name, "", NULL, ++ graph)) < 0) ++ return ret; + +- if ((ret = avfilter_graph_create_filter(&convert, filter, +- inst_name, graph->scale_sws_opts, NULL, +- graph)) < 0) +- return ret; ++ extra_convert_tried |= 1; ++ can_retry = 1; ++ } ++ else ++#endif ++ { ++ if (!(filter = avfilter_get_by_name("scale"))) { ++ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " ++ "not present, cannot convert pixel formats.\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", ++ scaler_count++); ++ ++ if ((ret = avfilter_graph_create_filter(&convert, filter, ++ inst_name, graph->scale_sws_opts, NULL, ++ graph)) < 0) ++ return ret; ++ } + break; + case AVMEDIA_TYPE_AUDIO: + if (!(filter = avfilter_get_by_name("aresample"))) { +@@ -587,9 +633,19 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + av_assert0(outlink-> in_channel_layouts->refcount > 0); + av_assert0(outlink->out_channel_layouts->refcount > 0); + } +- if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type) || +- !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) ++ // If we have added an extra filter we must merge the input ++ // side but we can have another go at the output ++ if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type)) ++ ret = AVERROR(ENOSYS); ++ else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) ++ { ++ if (can_retry) { ++ link = outlink; ++ convert_needed = 1; ++ continue; ++ } + ret = AVERROR(ENOSYS); ++ } + if (inlink->type == AVMEDIA_TYPE_AUDIO && + (!ff_merge_samplerates(inlink->in_samplerates, + inlink->out_samplerates) || +diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c +index bf30f54177..eb5dfa22f8 100644 +--- a/libavfilter/buffersrc.c ++++ b/libavfilter/buffersrc.c +@@ -210,7 +210,7 @@ static int av_buffersrc_add_frame_internal(AVFilterContext *ctx, + + switch (ctx->outputs[0]->type) { + case AVMEDIA_TYPE_VIDEO: +- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, ++ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), + frame->format, frame->pts); + break; + case AVMEDIA_TYPE_AUDIO: +diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c +new file mode 100644 +index 0000000000..fbea56dd09 +--- /dev/null ++++ b/libavfilter/vf_unsand.c +@@ -0,0 +1,234 @@ ++/* ++ * Copyright (c) 2007 Bobby Bingham ++ * ++ * This file is part of FFmpeg. 
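The avfiltergraph.c change above turns the one-shot `if (convert_needed)` into a retry loop: when the two ends of a link cannot agree on a pixel format and the input side offers a sand format, an "unsand" instance is spliced in once (tracked by extra_convert_tried), the input side is merged, and negotiation is re-run on the new output link before falling back to the generic scaler. A toy model of that decision, with format lists reduced to int arrays purely for illustration:

    #include <stdio.h>
    #include <stdbool.h>

    enum { YUV420P, YUV420P10, SAND128, SAND64_10 };

    static bool is_sand(int f) { return f == SAND128 || f == SAND64_10; }
    static int unsand(int f)
    {
        return f == SAND128 ? YUV420P : f == SAND64_10 ? YUV420P10 : f;
    }

    static bool intersects(const int *a, int na, const int *b, int nb)
    {
        for (int i = 0; i < na; i++)
            for (int j = 0; j < nb; j++)
                if (a[i] == b[j])
                    return true;
        return false;
    }

    int main(void)
    {
        int src[] = { SAND128 };   /* decoder offers only sand output */
        int dst[] = { YUV420P };   /* sink accepts only planar */
        int nsrc = 1;

        if (!intersects(src, nsrc, dst, 1)) {
            bool sand = false;
            for (int i = 0; i < nsrc; i++)
                sand |= is_sand(src[i]);
            if (sand) {            /* one-shot "unsand" insertion, then retry */
                for (int i = 0; i < nsrc; i++)
                    src[i] = unsand(src[i]);
            }
        }
        printf("merge %s\n", intersects(src, nsrc, dst, 1) ? "succeeds" : "fails");
        return 0;
    }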
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * format and noformat video filters ++ */ ++ ++#include ++ ++#include "libavutil/internal.h" ++#include "libavutil/mem.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/opt.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++typedef struct UnsandContext { ++ const AVClass *class; ++} UnsandContext; ++ ++static av_cold void uninit(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++} ++ ++static av_cold int init(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ ++ return 0; ++} ++ ++ ++static int filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterLink * const outlink = link->dst->outputs[0]; ++ AVFrame *out = NULL; ++ int rv = 0; ++ ++ if (outlink->format == in->format) { ++ // If nothing to do then do nothing ++ out = in; ++ } ++ else ++ { ++ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) ++ { ++ rv = AVERROR(ENOMEM); ++ goto fail; ++ } ++ if (av_rpi_sand_to_planar_frame(out, in) != 0) ++ { ++ rv = -1; ++ goto fail; ++ } ++ ++ av_frame_free(&in); ++ } ++ ++ return ff_filter_frame(outlink, out); ++ ++fail: ++ av_frame_free(&out); ++ av_frame_free(&in); ++ return rv; ++} ++ ++#if 0 ++static void dump_fmts(const AVFilterFormats * fmts) ++{ ++ int i; ++ if (fmts== NULL) { ++ printf("NULL\n"); ++ return; ++ } ++ for (i = 0; i < fmts->nb_formats; ++i) { ++ printf(" %d", fmts->formats[i]); ++ } ++ printf("\n"); ++} ++#endif ++ ++static int query_formats(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ int ret; ++ ++ // If we aren't connected at both ends then just do nothing ++ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL) ++ return 0; ++ ++// printf("Unsand: %s in: ", __func__); ++// dump_fmts(ctx->inputs[0]->in_formats); ++// printf("Unsand: %s out: ", __func__); ++// dump_fmts(ctx->outputs[0]->out_formats); ++ ++ // Our output formats depend on our input formats and we can't/don't ++ // want to convert between bit depths so we need to wait for the source ++ // to have an opinion before we do ++ if (ctx->inputs[0]->in_formats == NULL) ++ return AVERROR(EAGAIN); ++ ++ // Accept anything ++ if (ctx->inputs[0]->out_formats == NULL && ++ (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0) ++ return ret; ++ ++ // Filter out sand formats ++ ++ // Generate a container if we don't already have one ++ if (ctx->outputs[0]->in_formats == NULL) ++ { ++ // Somewhat rubbish way of ensuring we have a good structure ++ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; ++ AVFilterFormats *formats = ff_make_format_list(out_fmts); ++ ++ if (formats 
== NULL)
++            return AVERROR(ENOMEM);
++        if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0)
++            return ret;
++    }
++
++    // Replace old format list with new filtered list derived from what our
++    // input says it can do
++    {
++        const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats;
++        AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats;
++        enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
++        int i;
++        int n = 0;
++        int seen_420p = 0;
++        int seen_420p10 = 0;
++
++        for (i = 0; i < src_ff->nb_formats; ++i) {
++            const enum AVPixelFormat f = src_ff->formats[i];
++
++            switch (f) {
++                case AV_PIX_FMT_YUV420P:
++                case AV_PIX_FMT_SAND128:
++                case AV_PIX_FMT_RPI4_8:
++                    if (!seen_420p) {
++                        seen_420p = 1;
++                        dst_fmts[n++] = AV_PIX_FMT_YUV420P;
++                    }
++                    break;
++                case AV_PIX_FMT_SAND64_10:
++                case AV_PIX_FMT_YUV420P10:
++                case AV_PIX_FMT_RPI4_10:
++                    if (!seen_420p10) {
++                        seen_420p10 = 1;
++                        dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
++                    }
++                    break;
++                default:
++                    dst_fmts[n++] = f;
++                    break;
++            }
++        }
++
++        av_freep(&dst_ff->formats);
++        dst_ff->formats = dst_fmts;
++        dst_ff->nb_formats = n;
++    }
++
++//    printf("Unsand: %s calc: ", __func__);
++//    dump_fmts(ctx->outputs[0]->in_formats);
++
++    return 0;
++}
++
++
++#define OFFSET(x) offsetof(UnsandContext, x)
++static const AVOption unsand_options[] = {
++    { NULL }
++};
++
++
++AVFILTER_DEFINE_CLASS(unsand);
++
++static const AVFilterPad avfilter_vf_unsand_inputs[] = {
++    {
++        .name         = "default",
++        .type         = AVMEDIA_TYPE_VIDEO,
++        .filter_frame = filter_frame,
++    },
++    { NULL }
++};
++
++static const AVFilterPad avfilter_vf_unsand_outputs[] = {
++    {
++        .name = "default",
++        .type = AVMEDIA_TYPE_VIDEO
++    },
++    { NULL }
++};
++
++AVFilter ff_vf_unsand = {
++    .name          = "unsand",
++    .description   = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
++
++    .init          = init,
++    .uninit        = uninit,
++
++    .query_formats = query_formats,
++
++    .priv_size     = sizeof(UnsandContext),
++    .priv_class    = &unsand_class,
++
++    .inputs        = avfilter_vf_unsand_inputs,
++    .outputs       = avfilter_vf_unsand_outputs,
++};
++
+diff --git a/libavformat/utils.c b/libavformat/utils.c
+index 667249362c..436b98c4ff 100644
+--- a/libavformat/utils.c
++++ b/libavformat/utils.c
+@@ -3044,6 +3044,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
+     return 1;
+ }
+
++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
++// This should be quite general purpose but avoid possible conflicts
++// by limiting usage to cases where we know it works.
++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
++{
++    // Only try fallback if we know it is supported (HEVC only)
++    const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
++        avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
++    int err;
++
++    // Failed to find fallback or we are already at the fallback
++    if (new_codec == NULL || new_codec == old_codec)
++    {
++        return AVERROR_DECODER_NOT_FOUND;
++    }
++
++    // * This may be dodgy - header says to not use this fn,
++    //   especially if we are going to reopen the context...
++    //   (but it does seem to work for our cases)
++    if (avcodec_is_open(avctx)) {
++        avcodec_close(avctx);
++    }
++
++    if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
++    {
++        return err;
++    }
++
++    return 0;
++}
++#else
++#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
++#endif
++
+ /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
+ static int try_decode_frame(AVFormatContext *s, AVStream *st,
+                             const AVPacket *avpkt, AVDictionary **options)
+@@ -3078,7 +3112,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st,
+         av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
+         if (s->codec_whitelist)
+             av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
+-        ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
++        if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
++        {
++            // Try fallback if it looks worth a try
++            ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
++        }
+         if (!options)
+             av_dict_free(&thread_opt);
+         if (ret < 0) {
+@@ -3109,6 +3147,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st,
+     if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+         avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
+         ret = avcodec_send_packet(avctx, &pkt);
++
++        // If we are going to want to fall back we should know here
++        if (ret == AVERROR_DECODER_NOT_FOUND) {
++            if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
++                break;
++            continue;
++        }
++
+         if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
+             break;
+         if (ret >= 0)
+@@ -3719,9 +3765,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
+         // Try to just open decoders, in case this is enough to get parameters.
+         if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
+             if (codec && !avctx->codec)
+-                if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
+-                    av_log(ic, AV_LOG_WARNING,
+-                           "Failed to open codec in %s\n",__FUNCTION__);
++            {
++                int err;
++
++                if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
++                {
++                    if (err == AVERROR_DECODER_NOT_FOUND) {
++                        err = try_fallback_decoder(avctx, codec, options ?
&options[i] : &thread_opt); ++ } ++ if (err < 0) { ++ av_log(ic, AV_LOG_WARNING, ++ "Failed to open codec in %s\n",__FUNCTION__); ++ } ++ } ++ } + } + if (!options) + av_dict_free(&thread_opt); +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 9b08372eb2..b0b5be0fa6 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -68,6 +68,7 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -86,6 +87,7 @@ HEADERS = adler32.h \ + tx.h \ + + HEADERS-$(CONFIG_LZO) += lzo.h ++HEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h + + ARCH_HEADERS = bswap.h \ + intmath.h \ +@@ -180,10 +182,12 @@ OBJS-$(CONFIG_LZO) += lzo.o + OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o + OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o + OBJS-$(CONFIG_QSV) += hwcontext_qsv.o ++OBJS-$(CONFIG_SAND) += rpi_sand_fns.o + OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o + OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o + OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o + OBJS-$(CONFIG_VULKAN) += hwcontext_vulkan.o ++OBJS-$(CONFIG_RPI) += rpi_sand_fns.o + + OBJS += $(COMPAT_OBJS:%=../compat/%) + +diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 +--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ + + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +new file mode 100644 +index 0000000000..750af9064f +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,69 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
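The utils.c hooks above give stream probing a second chance when a decoder open or avcodec_send_packet() fails with AVERROR_DECODER_NOT_FOUND, which is how the hevc_rpi decoder signals a stream it cannot accelerate: the context is reopened with the plain software HEVC decoder. Condensed, the calling pattern is roughly the sketch below; avcodec_find_decoder_by_id_and_fmt() is not stock FFmpeg API but a helper this patch series adds elsewhere, so this only builds against the patched tree:

    #include <libavcodec/avcodec.h>

    static int open_with_fallback(AVCodecContext *avctx, const AVCodec *codec,
                                  AVDictionary **opts)
    {
        int ret = avcodec_open2(avctx, codec, opts);
        if (ret != AVERROR_DECODER_NOT_FOUND)
            return ret;                    /* success, or an unrelated failure */

        /* The accelerated decoder bowed out: find the generic decoder for the
         * same codec id and retry with the same options. */
        codec = avcodec_find_decoder_by_id_and_fmt(codec->id, AV_PIX_FMT_NONE);
        if (!codec)
            return AVERROR_DECODER_NOT_FOUND;
        if (avcodec_is_open(avctx))
            avcodec_close(avctx);          /* "dodgy but works here", per the patch */
        return avcodec_open2(avctx, codec, opts);
    }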
++ ++Authors: John Cox ++*/ ++ ++#include "libavutil/arm/asm.S" ++ ++@ void rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, [r0] ++@ const uint8_t * src1, [r1] ++@ const uint8_t * src2, [r2] ++@ unsigned int lines); [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ +diff --git a/libavutil/buffer.c b/libavutil/buffer.c +index 38a554208a..b0fedabc3e 100644 +--- a/libavutil/buffer.c ++++ b/libavutil/buffer.c +@@ -273,6 +273,19 @@ static void buffer_pool_free(AVBufferPool *pool) + av_freep(&pool); + } + ++void av_buffer_pool_flush(AVBufferPool *pool) ++{ ++ ff_mutex_lock(&pool->mutex); ++ while (pool->pool) { ++ BufferPoolEntry *buf = pool->pool; ++ pool->pool = buf->next; ++ ++ buf->free(buf->opaque, buf->data); ++ av_freep(&buf); ++ } ++ ff_mutex_unlock(&pool->mutex); ++} ++ + void av_buffer_pool_uninit(AVBufferPool **ppool) + { + AVBufferPool *pool; +diff --git a/libavutil/buffer.h b/libavutil/buffer.h +index c0f3f6cc9a..998beec9ac 100644 +--- a/libavutil/buffer.h ++++ b/libavutil/buffer.h +@@ -267,6 +267,11 @@ AVBufferPool *av_buffer_pool_init2(int size, void *opaque, + AVBufferRef* (*alloc)(void *opaque, int size), + void (*pool_free)(void *opaque)); + ++/** ++ * Free all available buffers in a buffer pool. ++ */ ++ void av_buffer_pool_flush(AVBufferPool *pool); ++ + /** + * Mark the pool as being available for freeing. It will actually be freed only + * once all the allocated buffers associated with the pool are released. 
Thus it +diff --git a/libavutil/frame.c b/libavutil/frame.c +index 2e952edd29..96e8bf5b3e 100644 +--- a/libavutil/frame.c ++++ b/libavutil/frame.c +@@ -16,6 +16,8 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "config.h" ++ + #include "channel_layout.h" + #include "avassert.h" + #include "buffer.h" +@@ -26,6 +28,9 @@ + #include "mem.h" + #include "samplefmt.h" + #include "hwcontext.h" ++#if CONFIG_SAND ++#include "rpi_sand_fns.h" ++#endif + + #if FF_API_FRAME_GET_SET + MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) +@@ -902,6 +907,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) + (frame->crop_top + frame->crop_bottom) >= frame->height) + return AVERROR(ERANGE); + ++#if CONFIG_SAND ++ // Sand cannot be cropped - do not try ++ if (av_rpi_is_sand_format(frame->format)) ++ return 0; ++#endif ++ + desc = av_pix_fmt_desc_get(frame->format); + if (!desc) + return AVERROR_BUG; +diff --git a/libavutil/frame.h b/libavutil/frame.h +index fc67db0f6c..b1a7eb4858 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -968,6 +968,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); + */ + const char *av_frame_side_data_name(enum AVFrameSideDataType type); + ++ ++static inline int av_frame_cropped_width(const AVFrame * const frame) ++{ ++ return frame->width - (frame->crop_left + frame->crop_right); ++} ++static inline int av_frame_cropped_height(const AVFrame * const frame) ++{ ++ return frame->height - (frame->crop_top + frame->crop_bottom); ++} ++ + /** + * @} + */ +diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c +index 32cbde82eb..9ba8b7b2dd 100644 +--- a/libavutil/hwcontext_drm.c ++++ b/libavutil/hwcontext_drm.c +@@ -21,6 +21,7 @@ + #include + + #include ++#include + #include + + #include "avassert.h" +@@ -28,6 +29,7 @@ + #include "hwcontext_drm.h" + #include "hwcontext_internal.h" + #include "imgutils.h" ++#include "libavutil/rpi_sand_fns.h" + + + static void drm_device_free(AVHWDeviceContext *hwdev) +@@ -43,6 +45,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, + AVDRMDeviceContext *hwctx = hwdev->hwctx; + drmVersionPtr version; + ++ if (device == NULL) { ++ hwctx->fd = -1; ++ return 0; ++ } ++ + hwctx->fd = open(device, O_RDWR); + if (hwctx->fd < 0) + return AVERROR(errno); +@@ -120,6 +127,9 @@ static int drm_map_frame(AVHWFramesContext *hwfc, + if (flags & AV_HWFRAME_MAP_WRITE) + mmap_prot |= PROT_WRITE; + ++ if (dst->format == AV_PIX_FMT_NONE) ++ dst->format = hwfc->sw_format; ++ + av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES); + for (i = 0; i < desc->nb_objects; i++) { + addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED, +@@ -151,6 +161,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc, + + dst->width = src->width; + dst->height = src->height; ++ dst->crop_top = src->crop_top; ++ dst->crop_bottom = src->crop_bottom; ++ dst->crop_left = src->crop_left; ++ dst->crop_right = src->crop_right; ++ ++#if CONFIG_SAND ++ // Rework for sand frames ++ if (av_rpi_is_sand_frame(dst)) { ++ // As it stands the sand formats hold stride2 in linesize[3] ++ // linesize[0] & [1] contain stride1 which is always 128 for everything we do ++ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1] ++ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); ++ dst->linesize[0] = 128; ++ dst->linesize[1] = 128; ++ // *** Are we sure src->height is actually what we want ??? 
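The linesize convention noted in the hwcontext_drm.c hunk above (linesize[0] and [1] carry stride1, always 128 here; linesize[3] carries stride2) is all that is needed to address a sand plane: pixels live in vertical stripes stride1 bytes wide and stride1 * stride2 bytes long, laid out left to right. The byte-offset formula used throughout the sand copy routines, pulled out as a small helper for clarity (the real accessors live in rpi_sand_fns):

    #include <stddef.h>

    /* Byte offset of luma byte (x, y) in a sand plane. */
    static inline size_t sand_offset(unsigned int x, unsigned int y,
                                     unsigned int stride1, unsigned int stride2)
    {
        const unsigned int mask = stride1 - 1;  /* stride1 is a power of two */
        return (size_t)(x & ~mask) * stride2    /* whole stripes to the left */
             + (x & mask)                       /* column within this stripe */
             + (size_t)y * stride1;             /* row within this stripe */
    }

This matches the `src + (x & mask) + y * stride1 + (x & ~mask) * stride2` expressions in av_rpi_sand_to_planar_y below.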
++ } ++#endif + + err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, + &drm_unmap_frame, map); +@@ -178,7 +205,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, + if (!pix_fmts) + return AVERROR(ENOMEM); + +- pix_fmts[0] = ctx->sw_format; ++ // **** Offer native sand too ???? ++ pix_fmts[0] = ++#if CONFIG_SAND ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? ++ AV_PIX_FMT_YUV420P : ++ ctx->sw_format == AV_PIX_FMT_RPI4_10 ? ++ AV_PIX_FMT_YUV420P10LE : ++#endif ++ ctx->sw_format; + pix_fmts[1] = AV_PIX_FMT_NONE; + + *formats = pix_fmts; +@@ -197,18 +232,82 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, + map = av_frame_alloc(); + if (!map) + return AVERROR(ENOMEM); +- map->format = dst->format; + ++ // Map to default ++ map->format = AV_PIX_FMT_NONE; + err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); + if (err) + goto fail; + +- map->width = dst->width; +- map->height = dst->height; ++#if 0 ++ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, ++ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, ++ map->width, map->height, ++ map->linesize[0], ++ map->linesize[1], ++ map->linesize[2], ++ map->linesize[3], ++ dst->width, dst->height, ++ dst->linesize[0], ++ dst->linesize[1], ++ dst->linesize[2]); ++#endif ++#if CONFIG_SAND ++ if (av_rpi_is_sand_frame(map)) { ++ unsigned int stride2 = map->linesize[3]; ++ const unsigned int w = FFMIN(dst->width, av_frame_cropped_width(map)); ++ const unsigned int h = FFMIN(dst->height, av_frame_cropped_height(map)); ++ ++ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ map->data[0], ++ 128, stride2, ++ map->crop_left, map->crop_top, ++ w, h); ++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ map->data[1], ++ 128, stride2, ++ map->crop_left / 2, map->crop_top / 2, ++ w / 2, h / 2); ++ } ++ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { ++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], ++ map->data[0], ++ 128, stride2, ++ map->crop_left, map->crop_top, ++ w, h); // *** ??? crop ++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ map->data[1], ++ 128, stride2, ++ map->crop_left / 2, map->crop_top / 2, ++ w / 2, h / 2); ++ } ++ else ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); ++ err = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ dst->width = w; ++ dst->height = h; ++ } ++ else ++#endif ++ { ++ // Kludge mapped h/w s.t. 
frame_copy works ++ map->width = dst->width; ++ map->height = dst->height; ++ err = av_frame_copy(dst, map); ++ } + +- err = av_frame_copy(dst, map); + if (err) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); + goto fail; ++ } + + err = 0; + fail: +@@ -223,7 +322,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, + int err; + + if (src->width > hwfc->width || src->height > hwfc->height) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height); + return AVERROR(EINVAL); ++ } + + map = av_frame_alloc(); + if (!map) +diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +index 9d61c52567..4e36a110c1 100644 +--- a/libavutil/pixdesc.c ++++ b/libavutil/pixdesc.c +@@ -2073,6 +2073,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .name = "cuda", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, ++ [AV_PIX_FMT_RPI] = { ++ .name = "rpi", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, ++ [AV_PIX_FMT_RPI4_10] = { ++ .name = "rpi", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, ++ [AV_PIX_FMT_RPI4_8] = { ++ .name = "rpi", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, + [AV_PIX_FMT_AYUV64LE] = { + .name = "ayuv64le", + .nb_components = 4, +@@ -2371,6 +2383,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .name = "vulkan", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, ++ [AV_PIX_FMT_SAND128] = { ++ .name = "sand128", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ ++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ ++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ ++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ ++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index 1c625cfc8a..3400390a77 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -234,6 +234,11 @@ enum AVPixelFormat { + */ + AV_PIX_FMT_CUDA, + ++ /** ++ * HW acceleration through RPI. ++ */ ++ AV_PIX_FMT_RPI, ++ + AV_PIX_FMT_0RGB, ///< packed RGB 8:8:8, 32bpp, XRGBXRGB... X=unused/undefined + AV_PIX_FMT_RGB0, ///< packed RGB 8:8:8, 32bpp, RGBXRGBX... X=unused/undefined + AV_PIX_FMT_0BGR, ///< packed BGR 8:8:8, 32bpp, XBGRXBGR... 
X=unused/undefined +@@ -357,6 +362,12 @@ enum AVPixelFormat { + + AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian + AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian ++// RPI - not on ifdef so can be got at by calling progs ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_RPI4_8, ++ AV_PIX_FMT_RPI4_10, + + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions + }; +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..3133fe41ac +--- /dev/null ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,211 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
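As the pixfmt.h comment above says, the sand formats are added unconditionally so calling programs can reference them. Code that must special-case sand buffers therefore tests pixel-format membership; the patches do this with av_rpi_is_sand_format() from rpi_sand_fns.h, whose effect is approximately the sketch below (an approximation only, and it compiles only against the patched headers that define these enum values):

    #include <libavutil/pixfmt.h>

    /* Approximate stand-in for av_rpi_is_sand_format(); the authoritative
     * version lives in rpi_sand_fns.h, which is not shown in this hunk. */
    static inline int is_sand_fmt(enum AVPixelFormat fmt)
    {
        return fmt == AV_PIX_FMT_SAND128   || fmt == AV_PIX_FMT_SAND64_10 ||
               fmt == AV_PIX_FMT_SAND64_16 || fmt == AV_PIX_FMT_RPI4_8    ||
               fmt == AV_PIX_FMT_RPI4_10;
    }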
++ ++Authors: John Cox ++*/ ++ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); ++ } ++ } ++} ++ ++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ for (unsigned int k = 
0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++} ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++} ++ ++ ++#undef pixel ++#undef STRCAT ++#undef FUNC ++ +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +new file mode 100644 +index 0000000000..7cb40c0de0 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.c +@@ -0,0 +1,335 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "config.h"
++#include <stdint.h>
++#include <string.h>
++#include "rpi_sand_fns.h"
++#include "avassert.h"
++#include "frame.h"
++
++#define PW 1
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#define PW 2
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#if ARCH_ARM && HAVE_NEON
++void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines);
++#endif
++
++#if 1
++// Simple round
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++    const unsigned int rnd = (1 << shr) >> 1;
++    const uint16_t * src = (const uint16_t *)_src;
++
++    for (; n != 0; --n) {
++        *dst++ = (*src++ + rnd) >> shr;
++    }
++}
++#else
++// Dithered variation
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++    unsigned int rnd = (1 << shr) >> 1;
++    const unsigned int mask = ((1 << shr) - 1);
++    const uint16_t * src = (const uint16_t *)_src;
++
++    for (; n != 0; --n) {
++        rnd = *src++ + (rnd & mask);
++        *dst++ = rnd >> shr;
++    }
++}
++#endif
++
++// Fetches a single patch - offscreen fixup not done here
++// w <= stride1
++// unclipped
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h)
++{
++    const unsigned int x0 = (_x / 3) * 4;            // Byte offset of the word
++    const unsigned int xskip0 = _x - (x0 >> 2) * 3;
++    const unsigned int x1 = ((_x + _w) / 3) * 4;
++    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
++    const unsigned int mask = stride1 - 1;
++    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
++
++    if (x0 == x1) {
++        // *******************
++        // Partial single word xfer
++        return;
++    }
++
++    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++    {
++        unsigned int x = x0;
++        const uint32_t * p = (const uint32_t *)p0;
++        uint16_t * d = (uint16_t *)dst;
++
++        if (xskip0 != 0) {
++            const uint32_t p3 = *p++;
++
++            if (xskip0 == 1)
++                *d++ = (p3 >> 10) & 0x3ff;
++            *d++ = (p3 >> 20) & 0x3ff;
++
++            if (((x += 4) & mask) == 0)
++                p += slice_inc;
++        }
++
++        while (x != x1) {
++            const uint32_t p3 = *p++;
++            *d++ = p3 & 0x3ff;
++            *d++ = (p3 >> 10) & 0x3ff;
++            *d++ = (p3 >> 20) & 0x3ff;
++
++            if (((x += 4) & mask) == 0)
++                p += slice_inc;
++        }
++
++        if (xrem1 != 0) {
++            const uint32_t p3 = *p;
++
++            *d++ = p3 & 0x3ff;
++            if (xrem1 == 2)
++                *d++ = (p3 >> 10) & 0x3ff;
++        }
++    }
++}
++
++
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++                             uint8_t * dst_v, const unsigned int dst_stride_v,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h)
++{
++    
const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 3) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 8; ++ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint16_t * du = (uint16_t *)dst_u; ++ uint16_t * dv = (uint16_t *)dst_v; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ if (xskip0 == 1) ++ { ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = (p3b >> 0) & 0x3ff; ++ } ++ *du++ = (p3b >> 10) & 0x3ff; ++ *dv++ = (p3b >> 20) & 0x3ff; ++ ++ if (((x += 8) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ *du++ = p3a & 0x3ff; ++ *dv++ = (p3a >> 10) & 0x3ff; ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = p3b & 0x3ff; ++ *du++ = (p3b >> 10) & 0x3ff; ++ *dv++ = (p3b >> 20) & 0x3ff; ++ ++ if (((x += 8) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ *du++ = p3a & 0x3ff; ++ *dv++ = (p3a >> 10) & 0x3ff; ++ if (xrem1 == 2) ++ { ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = p3b & 0x3ff; ++ } ++ } ++ } ++} ++ ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr) ++{ ++ const unsigned int n = dst_stride1 / 2; ++ unsigned int j; ++ ++ // This is true for our current layouts ++ av_assert0(dst_stride1 == src_stride1); ++ ++ // As we have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if ARCH_ARM && HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ } ++ } ++} ++ ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) ++{ ++ const int w = av_frame_cropped_width(src); ++ const int h = 
av_frame_cropped_height(src); ++ const int x = src->crop_left; ++ const int y = src->crop_top; ++ ++ // We will crop as part of the conversion ++ dst->crop_top = 0; ++ dst->crop_left = 0; ++ dst->crop_bottom = 0; ++ dst->crop_right = 0; ++ ++ switch (src->format){ ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x*2, y, w*2, h); ++ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y/2, w, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_RPI4_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ default: ++ return -1; ++ } ++ ++ return av_frame_copy_props(dst, src); ++} +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..634b55e800 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,183 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr); ++ ++ ++// dst must contain required pixel format & allocated data buffers ++// Cropping on the src buffer will be honoured and dst crop will be set to zero ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); ++ ++ ++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) ++{ ++#ifdef RPI_ZC_SAND128_ONLY ++ // If we 
are sure we only support 128 byte sand formats replace the
++    // var with a constant which should allow for better optimisation
++    return 128;
++#else
++    return frame->linesize[0];
++#endif
++}
++
++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
++{
++    return frame->linesize[3];
++}
++
++
++static inline int av_rpi_is_sand_format(const int format)
++{
++    return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10);
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++    return av_rpi_is_sand_format(frame->format);
++}
++
++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
++{
++    return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8);
++}
++
++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
++{
++    return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
++{
++    return (frame->format == AV_PIX_FMT_RPI4_10);
++}
++
++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
++{
++    return av_rpi_is_sand8_frame(frame) ? 0 : 1;
++}
++
++// If x is measured in bytes (not pixels) then this works for sand64_16 as
++// well as sand128 - but in the general case we work that out
++
++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
++{
++    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++    const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
++    const unsigned int x1 = x & (stride1 - 1);
++    const unsigned int x2 = x ^ x1;
++
++    return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++    const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
++    const unsigned int x1 = x & (stride1 - 1);
++    const unsigned int x2 = x ^ x1;
++
++    return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
++}
++
++#endif
++
+diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
+new file mode 100644
+index 0000000000..7f16dff6a2
+--- /dev/null
++++ b/pi-util/BUILD.txt
+@@ -0,0 +1,29 @@
++Building Pi FFmpeg
++==================
++
++Configuration:
++=============
++
++These instructions work for cross compiles from Ubuntu 16.04 & Ubuntu
++18.04. I would expect most other Linux environments to work but I haven't
++tried them.
++
++pi-util/conf_pi2.sh
++
++contains suitable options to build the code for Pi2/3. It expects to find
++git clones of
++
++https://github.com/raspberrypi/tools
++https://github.com/raspberrypi/firmware
++
++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a
++lot of history you don't want.
++
++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be
++rebuilt. Otherwise the prebuilt .c & .h files will be used.
++Likewise ../local/bin/vasmvidcore_std will enable a VPU code rebuild.
++
++pi-util/conf_pi1.sh should configure for Pi1. Beware that as of this time
++H265 QPU acceleration is broken on Pi1 and so it is disabled.
++
++
+diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt
+new file mode 100644
+index 0000000000..fcce72226a
+--- /dev/null
++++ b/pi-util/NOTES.txt
+@@ -0,0 +1,69 @@
++Notes on the hevc_rpi decoder & associated support code
++-------------------------------------------------------
++
++There are 3 main parts to the existing code:
++
++1) The decoder - this is all in libavcodec as rpi_hevc*.
++
++2) A few filters to deal with Sand frames and a small patch to
++automatically select the sand->i420 converter when required.
++
++3) A kludge in ffmpeg.c to display the decoded video. This could & should
++be converted into a proper ffmpeg display module.
++
++
++Decoder
++-------
++
++The decoder is a modified version of the existing ffmpeg hevc decoder.
++Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder.
++More complex bitstreams can be up to ~200% faster but particularly easy
++streams can cut its advantage down to ~50%. This means that a Pi3+ can
++display nearly all 8-bit 1080p30 streams and with some overclocking it can
++display most lower bitrate 10-bit 1080p30 streams - this latter case is
++not helped by the requirement to downsample to 8-bit before display on a
++Pi.
++
++It has had co-processor offload added for inter-pred and large block
++residual transform. Various parts have had optimized ARM NEON assembler
++added and the existing ARM asm sections have been profiled and
++re-optimized for A53. The main C code has been substantially reworked at
++its lower levels in an attempt to optimize it and minimize memory
++bandwidth. To some extent code paths that deal with frame types that it
++doesn't support have been pruned.
++
++It outputs frames in Broadcom Sand format. This is a somewhat annoying
++layout that doesn't fit into ffmpeg's standard frame descriptions. It has
++vertical stripes of 128 horizontal pixels (64 in 10-bit forms) with Y for
++the stripe followed by interleaved U & V, that is then followed by the Y
++for the next stripe, etc. The final stripe is always padded to
++stripe-width (see the addressing sketch below). This is used in an attempt
++to help with cache locality and cut down on the number of DRAM bank
++switches. It is annoying to use for inter-pred with conventional
++processing but the way the Pi QPU (which is used for inter-pred) works
++means that it has negligible downsides here and the improved memory
++performance exceeds the overhead of the increased complexity in the rest
++of the code.
++
++Frames must be allocated out of GPU memory (as otherwise they can't be
++accessed by the co-processors). Utility functions (in rpi_zc.c) have been
++written to make this easier. As the frames are already in GPU memory they
++can be displayed by the Pi h/w without any further copying.
++
++
++Known non-features
++------------------
++
++Frame allocation should probably be done in some other way in order to fit
++into the standard framework better.
++
++Sand frames are currently declared as software frames; there is an
++argument that they should be hardware frames but they aren't really.
++
++There must be a better way of auto-selecting the hevc_rpi decoder over the
++normal s/w hevc decoder, but I became confused by the existing h/w
++acceleration framework and what I wanted to do didn't seem to fit in
++neatly.
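++
++As a concrete illustration of the Sand layout described above, here is a
++minimal standalone sketch of the 8-bit luma addressing. It mirrors
++av_rpi_sand_frame_off_y in libavutil/rpi_sand_fns.h; the function name and
++the stride values in main() are made up for illustration only:
++
++    #include <stdio.h>
++
++    // stride1 = stripe width in bytes (128 for SAND128), a power of two
++    // stride2 = stripe height in rows
++    static unsigned int sand8_off_y(unsigned int stride1, unsigned int stride2,
++                                    unsigned int x, unsigned int y)
++    {
++        const unsigned int x1 = x & (stride1 - 1); // offset within this stripe
++        const unsigned int x2 = x ^ x1;            // bytes in whole stripes to the left
++        return x1 + stride1 * y + stride2 * x2;    // a stripe occupies stride1*stride2 bytes
++    }
++
++    int main(void)
++    {
++        // Luma pixel (200, 10) in a frame with 128-byte stripes of 1088 rows
++        // lands in the second stripe (x2 = 128), 72 bytes in (x1 = 72).
++        printf("%u\n", sand8_off_y(128, 1088, 200, 10));
++        return 0;
++    }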
++ ++Display should be a proper device rather than a kludge in ffmpeg.c ++ ++ +diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv +new file mode 100644 +index 0000000000..4efd5d1c67 +--- /dev/null ++++ b/pi-util/conf_h265.2016.csv +@@ -0,0 +1,195 @@ ++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8 ++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8 ++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8 ++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8 ++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8 ++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8 ++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8 ++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8 ++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8 ++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8 ++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8 ++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8 ++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8 ++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10 ++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8 ++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8 ++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8 ++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8 ++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8 ++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8 ++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8 ++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8 ++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8 ++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8 ++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8 ++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10 ++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8 ++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8 
++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8 ++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8 ++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8 ++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8 ++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8 ++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8 ++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8 ++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8 ++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8 ++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8 ++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8 ++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8 ++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8 ++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8 ++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8 ++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8 ++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8 ++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8 ++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8 ++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8 ++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8 ++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8 ++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8 ++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8 ++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8 ++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8 ++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8 ++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8 ++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8 ++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8 ++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8 ++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8 ++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8 ++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8 ++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8 ++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8 ++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8 ++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8 ++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8 ++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8 ++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8 ++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8 ++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8 ++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8 ++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8 
++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8 ++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8 ++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8 ++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8 ++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8 ++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8 ++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8 ++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8 ++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8 ++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8 ++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8 ++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8 ++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8 ++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8 ++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8 ++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8 ++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8 ++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8 ++1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8 ++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8 ++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8 ++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8 ++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8 ++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8 ++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8 ++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8 ++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8 ++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8 ++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8 ++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8 ++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8 ++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8 ++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8 ++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8 ++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10 ++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8 ++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8 ++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8 ++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10 ++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8 ++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8 ++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10 ++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8 
++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8 ++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0 ++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8 ++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8 ++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10 ++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8 ++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8 ++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0 ++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 ++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10 ++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0 ++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0 
++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0 ++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8 ++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8 ++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8 ++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0 ++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0 ++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0 ++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0 ++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0 ++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0 ++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0 ++0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8 ++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10 ++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10 ++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8 ++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8 ++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8 ++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8 ++1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8 ++1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8 ++1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8 +diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv +new file mode 100644 +index 0000000000..6082641271 +--- /dev/null ++++ b/pi-util/conf_h265.2016_HEVC_v1.csv +@@ -0,0 +1,147 @@ ++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 
++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? ++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv +new file mode 100644 +index 0000000000..fc14f2a3c2 +--- /dev/null ++++ b/pi-util/conf_h265.csv +@@ -0,0 +1,144 @@ ++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 ++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 ++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5 ++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5 ++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5 ++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5 ++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 ++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 ++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
+new file mode 100644
+index 0000000000..285bc1b99c
+--- /dev/null
++++ b/pi-util/conf_native.sh
+@@ -0,0 +1,41 @@
++echo "Configure for native build"
++
++RPI_OPT_VC=/opt/vc
++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
++RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4"
++#RPI_KEEPS="-save-temps=obj"
++RPI_KEEPS=""
++
++USR_PREFIX=`pwd`/install
++LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf
++INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf
++
++./configure \
++ --prefix=$USR_PREFIX\
++ --libdir=$LIB_PREFIX\
++ --incdir=$INC_PREFIX\
++ --arch=armv6t2\
++ --cpu=cortex-a7\
++ --disable-stripping\
++ --disable-thumb\
++ --enable-mmal\
++ --enable-rpi\
++ --enable-v4l2-request\
++ --enable-libdrm\
++ --enable-libudev\
++ --enable-vout-drm\
++ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
++ --extra-ldflags="$RPI_LIBDIRS"\
++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
++
++# --enable-shared\
++
++# --enable-decoder=hevc_rpi\
++# --enable-extra-warnings\
++# --arch=armv71\
++# --enable-shared\
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh
+new file mode 100755
+index 0000000000..400e7adcbf
+--- /dev/null
++++ b/pi-util/conf_pi1.sh
+@@ -0,0 +1,31 @@
++echo "Configure for Pi1"
++
++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
++RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc
++
++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
++#RPI_KEEPS="-save-temps=obj"
++RPI_KEEPS=""
++
++./configure --enable-cross-compile\
++ --cpu=arm1176jzf-s\
++ --arch=arm\
++ --disable-neon\
++ --target-os=linux\
++ --disable-stripping\
++ --enable-mmal\
++ --enable-shared\
++ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_INCLUDES"\
++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
++
++
++# --enable-extra-warnings\
++# --arch=armv71\
++# --enable-shared\
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh
+new file mode 100755
+index 0000000000..e44c6857eb
+--- /dev/null
++++ b/pi-util/conf_pi2.sh
+@@ -0,0 +1,41 @@
++echo "Configure for Pi2/3"
++
++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
++RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc
++
++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
++RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4"
++#RPI_KEEPS="-save-temps=obj"
++RPI_KEEPS=""
++
++USR_PREFIX=`pwd`/install
++LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf
++INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf
++
++./configure --enable-cross-compile\
++ --prefix=$USR_PREFIX\
++ --libdir=$LIB_PREFIX\
++ --incdir=$INC_PREFIX\
++ --arch=armv6t2\
++ --cpu=cortex-a7\
++ --target-os=linux\
++ --disable-stripping\
++ --disable-thumb\
++ --enable-mmal\
++ --enable-rpi\
++ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
++
++# --enable-shared\
++
++# --enable-decoder=hevc_rpi\
++# --enable-extra-warnings\
++# --arch=armv71\
++# --enable-shared\
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
+new file mode 100755
+index 0000000000..2e59e6ceb5
+--- /dev/null
++++ b/pi-util/ffconf.py
+@@ -0,0 +1,216 @@
++#!/usr/bin/env python
++
++import string
++import os
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++ffmpeg_exec = "./ffmpeg"
++
++CODEC_HEVC_RPI = 1
++HWACCEL_RPI = 2
++HWACCEL_DRM = 3
++HWACCEL_VAAPI = 4
++
++def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec):
++    hwaccel = ""
++    if dectype == HWACCEL_RPI:
++        hwaccel = "rpi"
++    elif dectype == HWACCEL_DRM:
++        hwaccel = "drm"
++    elif dectype == HWACCEL_VAAPI:
++        hwaccel = "vaapi"
++
++    pix_fmt = []
++    if pix == "8":
++        pix_fmt = ["-pix_fmt", "yuv420p"]
++    elif pix == "10":
++        pix_fmt = ["-pix_fmt", "yuv420p10le"]
++    elif pix == "12":
++        pix_fmt = ["-pix_fmt", "yuv420p12le"]
++
++    tmp_root = "/tmp"
++
++    names = srcname.split('/')
++    while len(names) > 1:
++        tmp_root = os.path.join(tmp_root, names[0])
++        del names[0]
++    name = names[0]
++
++    if not os.path.exists(tmp_root):
++        os.makedirs(tmp_root)
++
++    dec_file = os.path.join(tmp_root, name + ".dec.md5")
++    try:
++        os.remove(dec_file)
++    except:
++        pass
++
++    flog = open(os.path.join(tmp_root, name + ".log"), "wt")
++
++    ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file]
++
++    # Unaligned needed for cropping conformance
++    if hwaccel:
++        rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT)
++    else:
++        rstr = subprocess.call(
++            [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
++            stdout=flog, stderr=subprocess.STDOUT)
++
++    try:
++        m1 = None
++        m2 = None
++        with open(os.path.join(fileroot, md5_file)) as f:
++            for line in f:
++                m1 = re.search("[0-9a-f]{32}", line.lower())
++                if m1:
++                    break
++
++        with open(dec_file) as f:
++            m2 = re.search("[0-9a-f]{32}", f.readline())
++    except:
++        pass
++
++    if m1 and m2 and m1.group() == m2.group():
++        print >> flog, "Match: " + m1.group()
++        rv = 0
++    elif not m1:
++        print >> flog, "****** Cannot find m1"
++        rv = 3
++    elif not m2:
++        print >> flog, "****** Cannot find m2"
++        rv = 2
++    else:
++        print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
++        rv = 1
++    flog.close()
++    return rv
++
++def scandir(root):
++    aconf = []
++    ents = os.listdir(root)
++    ents.sort(key=str.lower)
++    for name in ents:
++        test_path = os.path.join(root, name)
++        if S_ISDIR(os.stat(test_path).st_mode):
++            files = os.listdir(test_path)
++            es_file = "?"
++            md5_file = "?"
++            for f in files:
++                (base, ext) = os.path.splitext(f)
++                if base[0] == '.':
++                    pass
++                elif ext == ".bit" or ext == ".bin":
++                    es_file = f
++                elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
++                    if md5_file == "?":
++                        md5_file = f
++                    elif base[-3:] == "yuv":
++                        md5_file = f
++            aconf.append((1, name, es_file, md5_file))
++    return aconf
++
++def runtest(name, tests):
++    if not tests:
++        return True
++    for t in tests:
++        if name[0:len(t)] == t or name.find("/" + t) != -1:
++            return True
++    return False
++
++def doconf(csva, tests, test_root, vcodec, dectype):
++    unx_failures = []
++    unx_success = []
++    failures = 0
++    successes = 0
++    for a in csva:
++        exp_test = int(a[0])
++        if (exp_test and runtest(a[1], tests)):
++            name = a[1]
++            print "==== ", name,
++            sys.stdout.flush()
++
++            rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec)
++            if (rv == 0):
++                successes += 1
++            else:
++                failures += 1
++
++            if (rv == 0):
++                if exp_test == 2:
++                    print ": * OK *"
++                    unx_success.append(name)
++                else:
++                    print ": ok"
++            elif exp_test == 2 and rv == 1:
++                print ": fail"
++            elif exp_test == 3 and rv == 2:
++                # Call an expected "crash" an abort
++                print ": abort"
++            else:
++                unx_failures.append(name)
++                if rv == 1:
++                    print ": * FAIL *"
++                elif (rv == 2) :
++                    print ": * CRASH *"
++                elif (rv == 3) :
++                    print ": * MD5 MISSING *"
++                else :
++                    print ": * BANG *"
++
++    if unx_failures or unx_success:
++        print "Unexpected Failures:", unx_failures
++        print "Unexpected Success: ", unx_success
++    else:
++        print "All tests normal:", successes, "ok,", failures, "failed"
++
++
++class ConfCSVDialect(csv.Dialect):
++    delimiter = ','
++    doublequote = True
++    lineterminator = '\n'
++    quotechar='"'
++    quoting = csv.QUOTE_MINIMAL
++    skipinitialspace = True
++    strict = True
++
++if __name__ == '__main__':
++
++    argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
++    argp.add_argument("tests", nargs='*')
++    argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line")
++    argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line")
++    argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line")
++    argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
++    argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
++    argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
++    argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
++    args = argp.parse_args()
++
++    if args.csvgen:
++        csv.writer(sys.stdout).writerows(scandir(args.test_root))
++        exit(0)
++
++    with open(args.csv, 'rt') as csvfile:
++        csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
++
++    dectype = CODEC_HEVC_RPI
++    if os.path.exists("/dev/rpivid-hevcmem"):
++        dectype = HWACCEL_RPI
++    if args.drm or os.path.exists("/sys/module/rpivid_hevc"):
++        dectype = HWACCEL_DRM
++
++    if args.pi4:
++        dectype = HWACCEL_RPI
++    elif args.drm:
++        dectype = HWACCEL_DRM
++    elif args.vaapi:
++        dectype = HWACCEL_VAAPI
++
++    doconf(csva, args.tests, args.test_root, args.vcodec, dectype)
++
+diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
+new file mode 100755
+index 0000000000..2fabe98c32
+--- /dev/null
++++ b/pi-util/ffperf.py
+@@ -0,0 +1,127 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:
++    close_threshold = 0.01
++
++    def __init__(self, stats_dict=None):
++        if stats_dict != None:
++            self.name = stats_dict["name"]
++            self.elapsed = float(stats_dict["elapsed"])
++            self.user = float(stats_dict["user"])
++            self.sys = float(stats_dict["sys"])
++
++    def times_str(self):
++        ctime = self.sys + self.user
++        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
++
++    def dict(self):
++        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++    def is_close(self, other):
++        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++    def __lt__(self, other):
++        return self.elapsed < other.elapsed
++    def __gt__(self, other):
++        return self.elapsed > other.elapsed
++
++    def time_file(name, prefix):
++        stats = tstats()
++        stats.name = name
++        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++        cproc = subprocess.Popen(["./ffmpeg",
++                                  "-hwaccel", "rpi",
++                                  "-t", "30", "-i", prefix + name,
++                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
++        pinfo = os.wait4(cproc.pid, 0)
++        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++        stats.elapsed = end_time - start_time
++        stats.user = pinfo[2].ru_utime
++        stats.sys = pinfo[2].ru_stime
++        return stats
++
++
++def common_prefix(s1, s2):
++    for i in range(min(len(s1),len(s2))):
++        if s1[i] != s2[i]:
++            return s1[:i]
++    return s1[:i+1]
++
++def main():
++    global flog
++
++    argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
++To blank the screen before starting use "xdg-screensaver activate"
++(For some reason this doesn't seem to work from within python).
++""") ++ ++ argp.add_argument("streams", nargs='*') ++ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") ++ argp.add_argument("--csv_in", help="CSV input filename") ++ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") ++ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count") ++ ++ args = argp.parse_args() ++ ++ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) ++ csv_out.writeheader() ++ ++ stats_in = {} ++ if args.csv_in != None: ++ with open(args.csv_in, 'r', newline='') as f_in: ++ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} ++ ++ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") ++ ++ streams = args.streams ++ if not streams: ++ if not stats_in: ++ print ("No source streams specified") ++ return 1 ++ prefix = "" if args.prefix == None else args.prefix ++ streams = [k for k in stats_in] ++ elif args.prefix != None: ++ prefix = args.prefix ++ else: ++ prefix = streams[0] ++ for f in streams[1:]: ++ prefix = common_prefix(prefix, f) ++ pp = prefix.rpartition(os.sep) ++ prefix = pp[0] + pp[1] ++ streams = [s[len(prefix):] for s in streams] ++ ++ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): ++ print ("====", f) ++ ++ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) ++ for i in range(args.repeat): ++ t = tstats.time_file(f, prefix) ++ print ("...", t.times_str()) ++ if t0 > t: ++ t0 = t ++ ++ if t0.name in stats_in: ++ pstat = stats_in[t0.name] ++ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) ++ ++ csv_out.writerow(t0.dict()) ++ ++ print () ++ ++ return 0 ++ ++ ++if __name__ == '__main__': ++ exit(main()) ++ +diff --git a/pi-util/genpatch.sh b/pi-util/genpatch.sh +new file mode 100644 +index 0000000000..0948a68a7a +--- /dev/null ++++ b/pi-util/genpatch.sh +@@ -0,0 +1,35 @@ ++set -e ++ ++NOPATCH= ++if [ "$1" == "--notag" ]; then ++ shift ++ NOPATCH=1 ++fi ++ ++if [ "$1" == "" ]; then ++ echo Usage: $0 [--notag] \ ++ echo e.g.: $0 mmal_4 ++ exit 1 ++fi ++ ++VERSION=`cat RELEASE` ++if [ "$VERSION" == "" ]; then ++ echo Can\'t find version RELEASE ++ exit 1 ++fi ++ ++PATCHFILE=../ffmpeg-$VERSION-$1.patch ++ ++if [ $NOPATCH ]; then ++ echo Not tagged ++else ++ # Only continue if we are all comitted ++ git diff --name-status --exit-code ++ ++ PATCHTAG=pi/$VERSION/$1 ++ echo Tagging: $PATCHTAG ++ ++ git tag $PATCHTAG ++fi ++echo Generating patch: $PATCHFILE ++git diff n$VERSION -- > $PATCHFILE +diff --git a/pi-util/make_array.py b/pi-util/make_array.py +new file mode 100755 +index 0000000000..67b22d2d51 +--- /dev/null ++++ b/pi-util/make_array.py +@@ -0,0 +1,23 @@ ++#!/usr/bin/env python ++ ++# Usage ++# make_array file.bin ++# Produces file.h with array of bytes. 
++#
++import sys
++for file in sys.argv[1:]:
++    prefix,suffix = file.split('.')
++    assert suffix=='bin'
++    name=prefix.split('/')[-1]
++    print 'Converting',file
++    with open(prefix+'.h','wb') as out:
++        print >>out, 'static const unsigned char',name,'[] = {'
++        with open(file,'rb') as fd:
++            i = 0
++            for byte in fd.read():
++                print >>out, '0x%02x, ' % ord(byte),
++                i = i + 1
++                if i % 8 == 0:
++                    print >>out, ' // %04x' % (i - 8)
++        print >>out,'};'
++
+diff --git a/pi-util/mkinst.sh b/pi-util/mkinst.sh
+new file mode 100755
+index 0000000000..271a39e846
+--- /dev/null
++++ b/pi-util/mkinst.sh
+@@ -0,0 +1,5 @@
++set -e
++
++make install
++
++cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
+diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py
+new file mode 100755
+index 0000000000..e44cfa0c3c
+--- /dev/null
++++ b/pi-util/perfcmp.py
+@@ -0,0 +1,101 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:
++    close_threshold = 0.01
++
++    def __init__(self, stats_dict=None):
++        if stats_dict != None:
++            self.name = stats_dict["name"]
++            self.elapsed = float(stats_dict["elapsed"])
++            self.user = float(stats_dict["user"])
++            self.sys = float(stats_dict["sys"])
++
++    def times_str(self):
++        ctime = self.sys + self.user
++        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
++
++    def dict(self):
++        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++    def is_close(self, other):
++        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++    def __lt__(self, other):
++        return self.elapsed < other.elapsed
++    def __gt__(self, other):
++        return self.elapsed > other.elapsed
++
++    def time_file(name, prefix):
++        stats = tstats()
++        stats.name = name
++        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
++                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
++        pinfo = os.wait4(cproc.pid, 0)
++        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++        stats.elapsed = end_time - start_time
++        stats.user = pinfo[2].ru_utime
++        stats.sys = pinfo[2].ru_stime
++        return stats
++
++
++def common_prefix(s1, s2):
++    for i in range(min(len(s1),len(s2))):
++        if s1[i] != s2[i]:
++            return s1[:i]
++    return s1[:i+1]
++
++def main():
++    argp = argparse.ArgumentParser(description="FFmpeg performance compare")
++
++    argp.add_argument("stream0", help="CSV to compare")
++    argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
++
++    args = argp.parse_args()
++
++    with open(args.stream0, 'r', newline='') as f_in:
++        stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++    with open(args.stream1, 'r', newline='') as f_in:
++        stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++
++    print (args.stream0, "<<-->>", args.stream1)
++    print ()
++
++    for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
++        if not (f in stats0) :
++            print (" XX :", f)
++            continue
++        if not (f in stats1) :
++            print (" XX :", f)
++            continue
++
++        s0 = stats0[f]
++        s1 = stats1[f]
++
++        pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
++        thresh = 0.3
++        tc = 6
++
++        nchar = min(tc - 1, int(abs(pcent) / thresh))
++        cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
++
++        print ("%6.2f %s%6.2f (%+5.2f) : %s" %
++               (s0.elapsed, cc, s1.elapsed, pcent, f))
++
++    return 0
++
++
++if __name__ == '__main__':
++    exit(main())
++
+diff --git a/pi-util/qem.sh b/pi-util/qem.sh
+new file mode 100755
+index 0000000000..a4dbb6eacd
+--- /dev/null
++++ b/pi-util/qem.sh
+@@ -0,0 +1,9 @@
++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
++QASM=python\ ../local/bin/qasm.py
++SRC_FILE=libavcodec/rpi_hevc_shader.qasm
++DST_BASE=shader
++
++cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
++
+diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
+new file mode 100755
+index 0000000000..5935a11ca5
+--- /dev/null
++++ b/pi-util/v3dusage.py
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python
++
++import sys
++import argparse
++import re
++
++def do_logparse(logname):
++
++    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
++    rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
++    rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
++    rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
++
++    ttotal = {'idle':0.0}
++    tstart = {}
++    qctotal = {}
++    qtstotal = {}
++    l2hits = {}
++    l2total = {}
++    time0 = None
++    idle_start = None
++    qpu_op_no = 0
++    op_count = 0
++
++    with open(logname, "rt") as infile:
++        for line in infile:
++            match = rmatch.match(line)
++            if match:
++#               print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
++                time = float(match.group(1))
++                unit = match.group(3)
++                opstart = not match.group(2)
++                optype = match.group(7)
++                hascb = match.group(8) != "0"
++
++                if unit == 'qpu1':
++                    unit = unit + "." + str(qpu_op_no)
++                    if not opstart:
++                        if hascb or optype == 'EXECUTE_SYNC':
++                            qpu_op_no = 0
++                        else:
++                            qpu_op_no += 1
++
++                # Ignore sync type
++                if optype == 'EXECUTE_SYNC':
++                    continue
++
++                if not time0:
++                    time0 = time
++
++                if opstart:
++                    tstart[unit] = time;
++                elif unit in tstart:
++                    op_count += 1
++                    if not unit in ttotal:
++                        ttotal[unit] = 0.0
++                    ttotal[unit] += time - tstart[unit]
++                    del tstart[unit]
++
++                if not idle_start and not tstart:
++                    idle_start = time
++                elif idle_start and tstart:
++                    ttotal['idle'] += time - idle_start
++                    idle_start = None
++
++            match = rqcycle.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in qctotal:
++                    qctotal[unit] = 0
++                qctotal[unit] += int(match.group(2))
++
++            match = rqtscycle.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in qtstotal:
++                    qtstotal[unit] = 0
++                qtstotal[unit] += int(match.group(2))
++
++            match = rl2hits.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in l2total:
++                    l2total[unit] = 0
++                    l2hits[unit] = 0
++                l2total[unit] += int(match.group(3))
++                if match.group(2) == "hits":
++                    l2hits[unit] += int(match.group(3))
++
++
++    if not time0:
++        print "No v3d profile records found"
++    else:
++        tlogged = time - time0
++
++        print "Logged time:", tlogged, " Op count:", op_count
++        for unit in sorted(ttotal):
++            print '%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++        print
++        for unit in sorted(qctotal):
++            if not unit in qtstotal:
++                qtstotal[unit] = 0;
++            print '%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
++            if unit in l2total:
++                print ' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
++
++
++
++if __name__ == '__main__':
++    argp = argparse.ArgumentParser(
++        formatter_class=argparse.RawDescriptionHelpFormatter,
++        description="QPU/VPU perf summary from VC logging",
++        epilog = """
++Will also summarise TMU stalls if logging requests set in qpu noflush param
++in the profiled code.
++
++Example use:
++  vcgencmd set_logging level=0xc0
++
++  sudo vcdbg log msg >& t.log
++  v3dusage.py t.log
++""")
++
++    argp.add_argument("logfile")
++    args = argp.parse_args()
++
++    do_logparse(args.logfile)
++

From 8dbeb6faecb064fad4e623d30145ba86613e4f5c Mon Sep 17 00:00:00 2001
From: Matthias Reichl
Date: Thu, 9 Jul 2020 13:35:00 +0200
Subject: [PATCH 08/10] ffmpeg: update to 4.3-Matrix-Alpha1

Signed-off-by: Matthias Reichl
---
 packages/multimedia/ffmpeg/package.mk | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index 80127527c0..416e68121a 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -3,9 +3,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="ffmpeg"
-# Current branch is: release/4.2-kodi
-PKG_VERSION="4.2.2-Matrix-Alpha1"
-PKG_SHA256="0dba571f9809588cfbdc29d6a551dab4cd5736701653d9263847c9ac67bcde86"
+PKG_VERSION="4.3-Matrix-Alpha1"
+PKG_SHA256="8e159cdf1dfd3de2ac838fdaecb1fbb315e47f2ee2c542cd9d6efeb545c2b916"
 PKG_LICENSE="LGPLv2.1+"
 PKG_SITE="https://ffmpeg.org"
 PKG_URL="https://github.com/xbmc/FFmpeg/archive/${PKG_VERSION}.tar.gz"

From 014f61eed09d4809243afb93b927139f173d5482 Mon Sep 17 00:00:00 2001
From: Matthias Reichl
Date: Thu, 9 Jul 2020 13:47:47 +0200
Subject: [PATCH 09/10] ffmpeg: reorganize patchset and configure option handling

Apply v4l2-drmprime and v4l2-request patches for all projects/devices
except RPi4 and use configure options to enable/disable the required
features.

The RPi4 patch already includes v4l2-request patches, so only add the
v4l2-drmprime patch in addition to that - this is needed for H264
hardware decoding.

RPi4 configure options have been adapted to the updated RPi patch.

Misc LibreELEC patches are now in the "libreelec" patch dir and
included for all projects/devices.
Signed-off-by: Matthias Reichl
---
 packages/multimedia/ffmpeg/package.mk | 51 +++++++++++++++++----------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index 416e68121a..2ca7ddfa96 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -17,25 +17,43 @@ get_graphicdrivers
 
 PKG_FFMPEG_HWACCEL="--enable-hwaccels"
 
+PKG_FFMPEG_RPI="--disable-mmal"
+
+if [ "${PROJECT}" = "RPi" -a "${DEVICE}" = "RPi4" ]; then
+  PKG_PATCH_DIRS="rpi v4l2-drmprime"
+  PKG_FFMPEG_RPI+=" --disable-rpi --enable-sand"
+else
+  PKG_PATCH_DIRS="v4l2-request v4l2-drmprime"
+fi
+
+PKG_PATCH_DIRS+=" libreelec"
+
 if [ "${V4L2_SUPPORT}" = "yes" ]; then
   PKG_DEPENDS_TARGET+=" libdrm"
   PKG_NEED_UNPACK+=" $(get_pkg_directory libdrm)"
   PKG_FFMPEG_V4L2="--enable-v4l2_m2m --enable-libdrm"
-  if [ "${PROJECT}" = "RPi" ]; then
-    PKG_FFMPEG_RPI="--disable-rpi --disable-mmal"
-    if [ "${DEVICE}" = "RPi4" ]; then
-      PKG_DEPENDS_TARGET+=" systemd"
-      PKG_NEED_UNPACK+=" $(get_pkg_directory systemd)"
-      PKG_FFMPEG_V4L2+=" --enable-libudev \
-                         --enable-v4l2-request"
-      PKG_FFMPEG_HWACCEL="--disable-hwaccel=h264_v4l2request \
-                          --disable-hwaccel=mpeg2_v4l2request \
-                          --disable-hwaccel=vp8_v4l2request"
-    fi
+  if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" ]; then
+    PKG_V4L2_REQUEST="yes"
+  elif [ "${PROJECT}" = "RPi" -a "${DEVICE}" = "RPi4" ]; then
+    PKG_V4L2_REQUEST="yes"
+    PKG_FFMPEG_HWACCEL="--disable-hwaccel=h264_v4l2request \
+                        --disable-hwaccel=mpeg2_v4l2request \
+                        --disable-hwaccel=vp8_v4l2request \
+                        --disable-hwaccel=vp9_v4l2request"
+  else
+    PKG_V4L2_REQUEST="no"
+  fi
+
+  if [ "${PKG_V4L2_REQUEST}" = "yes" ]; then
+    PKG_DEPENDS_TARGET+=" systemd"
+    PKG_NEED_UNPACK+=" $(get_pkg_directory systemd)"
+    PKG_FFMPEG_V4L2+=" --enable-libudev --enable-v4l2-request"
+  else
+    PKG_FFMPEG_V4L2+=" --disable-libudev --disable-v4l2-request"
   fi
 else
-  PKG_FFMPEG_V4L2="--disable-v4l2_m2m"
+  PKG_FFMPEG_V4L2="--disable-v4l2_m2m --disable-libudev --disable-v4l2-request"
 fi
 
 if [ "${VAAPI_SUPPORT}" = "yes" ]; then
@@ -54,12 +72,6 @@ else
   PKG_FFMPEG_VDPAU="--disable-vdpau"
 fi
 
-if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" ]; then
-  PKG_DEPENDS_TARGET+=" libdrm systemd" # systemd is needed for libudev
-  PKG_NEED_UNPACK+=" $(get_pkg_directory libdrm) $(get_pkg_directory systemd)"
-  PKG_FFMPEG_V4L2_REQUEST="--enable-v4l2-request --enable-libudev --enable-libdrm"
-fi
-
 if build_with_debug; then
   PKG_FFMPEG_DEBUG="--enable-debug --disable-stripping"
 else
@@ -80,6 +92,8 @@ if target_has_feature "(neon|sse)"; then
   PKG_DEPENDS_TARGET+=" dav1d"
   PKG_NEED_UNPACK+=" $(get_pkg_directory dav1d)"
   PKG_FFMPEG_AV1="--enable-libdav1d"
+else
+  PKG_FFMPEG_AV1="--disable-libdav1d"
 fi
 
 pre_configure_target() {
@@ -141,7 +155,6 @@
     ${PKG_FFMPEG_VAAPI} \
     ${PKG_FFMPEG_VDPAU} \
    ${PKG_FFMPEG_RPI} \
-    ${PKG_FFMPEG_V4L2_REQUEST} \
     --enable-runtime-cpudetect \
     --disable-hardcoded-tables \
     --disable-encoders \

From 777b3ecced1752357cfcb7eff8aaa2dcb5f09c8d Mon Sep 17 00:00:00 2001
From: Matthias Reichl
Date: Thu, 16 Jul 2020 11:17:47 +0200
Subject: [PATCH 10/10] ffmpeg: update to 4.3.1-Matrix-Alpha1-1

Signed-off-by: Matthias Reichl
---
 packages/multimedia/ffmpeg/package.mk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index 2ca7ddfa96..a5069bf263 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="ffmpeg"
-PKG_VERSION="4.3-Matrix-Alpha1"
-PKG_SHA256="8e159cdf1dfd3de2ac838fdaecb1fbb315e47f2ee2c542cd9d6efeb545c2b916"
+PKG_VERSION="4.3.1-Matrix-Alpha1-1"
+PKG_SHA256="a7d956dbbe3c2036a8a78976efaf43792e1c7c152a04182024f231f4ee2e7d7e"
 PKG_LICENSE="LGPLv2.1+"
 PKG_SITE="https://ffmpeg.org"
 PKG_URL="https://github.com/xbmc/FFmpeg/archive/${PKG_VERSION}.tar.gz"
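The option handling introduced in PATCH 09 can be exercised outside the build
system to check which patch dirs and configure flags a given project/device
ends up with. The sketch below mirrors just that selection logic from the
patched package.mk; it is an illustrative standalone script, not part of the
series. The select_ffmpeg_opts function and the example device names
(e.g. "A64") are inventions of this sketch, and the real dependency handling
(PKG_DEPENDS_TARGET, PKG_NEED_UNPACK, get_pkg_directory) is deliberately
omitted.

#!/bin/bash
# Illustrative sketch only: replays the PKG_PATCH_DIRS / v4l2-request
# decisions from PATCH 09 for a given PROJECT/DEVICE/V4L2_SUPPORT triple.
select_ffmpeg_opts() {
  local PROJECT="$1" DEVICE="$2" V4L2_SUPPORT="$3"
  local PKG_PATCH_DIRS PKG_FFMPEG_V4L2 PKG_V4L2_REQUEST
  local PKG_FFMPEG_RPI="--disable-mmal"

  # RPi4 uses the combined "rpi" patch (which already carries v4l2-request);
  # everything else gets the standalone v4l2-request patches.
  if [ "${PROJECT}" = "RPi" -a "${DEVICE}" = "RPi4" ]; then
    PKG_PATCH_DIRS="rpi v4l2-drmprime"
    PKG_FFMPEG_RPI+=" --disable-rpi --enable-sand"
  else
    PKG_PATCH_DIRS="v4l2-request v4l2-drmprime"
  fi
  PKG_PATCH_DIRS+=" libreelec"

  if [ "${V4L2_SUPPORT}" = "yes" ]; then
    PKG_FFMPEG_V4L2="--enable-v4l2_m2m --enable-libdrm"
    if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" ]; then
      PKG_V4L2_REQUEST="yes"
    elif [ "${PROJECT}" = "RPi" -a "${DEVICE}" = "RPi4" ]; then
      PKG_V4L2_REQUEST="yes"
    else
      PKG_V4L2_REQUEST="no"
    fi
    if [ "${PKG_V4L2_REQUEST}" = "yes" ]; then
      PKG_FFMPEG_V4L2+=" --enable-libudev --enable-v4l2-request"
    else
      PKG_FFMPEG_V4L2+=" --disable-libudev --disable-v4l2-request"
    fi
  else
    PKG_FFMPEG_V4L2="--disable-v4l2_m2m --disable-libudev --disable-v4l2-request"
  fi

  echo "${PROJECT}/${DEVICE}: patch dirs: ${PKG_PATCH_DIRS}"
  echo "  flags: ${PKG_FFMPEG_RPI} ${PKG_FFMPEG_V4L2}"
}

# Example: an RPi4 and an Allwinner device both enable v4l2-request,
# but only the RPi4 applies the combined "rpi" patch dir.
select_ffmpeg_opts RPi RPi4 yes
select_ffmpeg_opts Allwinner A64 yes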