mirror of
https://github.com/LibreELEC/LibreELEC.tv.git
synced 2025-08-05 08:57:50 +00:00
Merge pull request #5093 from HiassofT/le10-ffmpeg-rpi-5
ffmpeg: update rpi patch
This commit is contained in:
commit
4f6669c37a
@ -47543,7 +47543,7 @@ index 22a9532444..5588e4a460 100644
|
|||||||
/**
|
/**
|
||||||
* Enqueues a buffer to a V4L2Context from an AVFrame
|
* Enqueues a buffer to a V4L2Context from an AVFrame
|
||||||
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
|
diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
|
||||||
index e48b3a8ccf..b994e39ad6 100644
|
index e48b3a8ccf..092b750dc4 100644
|
||||||
--- a/libavcodec/v4l2_m2m.c
|
--- a/libavcodec/v4l2_m2m.c
|
||||||
+++ b/libavcodec/v4l2_m2m.c
|
+++ b/libavcodec/v4l2_m2m.c
|
||||||
@@ -328,7 +328,10 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
|
@@ -328,7 +328,10 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
|
||||||
@ -47577,7 +47577,7 @@ index e48b3a8ccf..b994e39ad6 100644
|
|||||||
+ av_packet_unref(&s->buf_pkt);
|
+ av_packet_unref(&s->buf_pkt);
|
||||||
+
|
+
|
||||||
+ if (s->fd >= 0) {
|
+ if (s->fd >= 0) {
|
||||||
+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
|
+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
|
||||||
+ if (ret)
|
+ if (ret)
|
||||||
+ av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
|
+ av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name);
|
||||||
+
|
+
|
||||||
@ -47664,7 +47664,7 @@ index 456281f48c..b08a5b38ac 100644
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
|
diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
|
||||||
index 3e17e0fcac..e774532e36 100644
|
index 3e17e0fcac..a02012bf44 100644
|
||||||
--- a/libavcodec/v4l2_m2m_dec.c
|
--- a/libavcodec/v4l2_m2m_dec.c
|
||||||
+++ b/libavcodec/v4l2_m2m_dec.c
|
+++ b/libavcodec/v4l2_m2m_dec.c
|
||||||
@@ -23,6 +23,10 @@
|
@@ -23,6 +23,10 @@
|
||||||
@ -47958,10 +47958,7 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
+ return ret;
|
+ return ret;
|
||||||
+ }
|
+ }
|
||||||
+ }
|
+ }
|
||||||
|
+
|
||||||
- /* cant recover */
|
|
||||||
- if (ret == AVERROR(ENOMEM))
|
|
||||||
- return ret;
|
|
||||||
+ // Start if we haven't
|
+ // Start if we haven't
|
||||||
+ {
|
+ {
|
||||||
+ const int ret2 = v4l2_try_start(avctx);
|
+ const int ret2 = v4l2_try_start(avctx);
|
||||||
@ -47971,7 +47968,9 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
+ }
|
+ }
|
||||||
+ }
|
+ }
|
||||||
|
|
||||||
- return 0;
|
- /* cant recover */
|
||||||
|
- if (ret == AVERROR(ENOMEM))
|
||||||
|
- return ret;
|
||||||
+ return ret;
|
+ return ret;
|
||||||
+}
|
+}
|
||||||
+
|
+
|
||||||
@ -48018,7 +48017,8 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
+ // Go again if we got a frame that we need to discard
|
+ // Go again if we got a frame that we need to discard
|
||||||
+ } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame));
|
+ } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame));
|
||||||
+ }
|
+ }
|
||||||
+
|
|
||||||
|
- return 0;
|
||||||
+ // Continue trying to enqueue packets if either
|
+ // Continue trying to enqueue packets if either
|
||||||
+ // (a) we succeeded last time OR
|
+ // (a) we succeeded last time OR
|
||||||
+ // (b) enqueue failed due to input Q full AND there is now room
|
+ // (b) enqueue failed due to input Q full AND there is now room
|
||||||
@ -48052,8 +48052,8 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
+ src_rv < 0 ? src_rv :
|
+ src_rv < 0 ? src_rv :
|
||||||
+ dst_rv < 0 ? dst_rv :
|
+ dst_rv < 0 ? dst_rv :
|
||||||
+ AVERROR(EAGAIN);
|
+ AVERROR(EAGAIN);
|
||||||
+}
|
}
|
||||||
+
|
|
||||||
+#if 0
|
+#if 0
|
||||||
+#include <time.h>
|
+#include <time.h>
|
||||||
+static int64_t us_time(void)
|
+static int64_t us_time(void)
|
||||||
@ -48061,8 +48061,8 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
+ struct timespec ts;
|
+ struct timespec ts;
|
||||||
+ clock_gettime(CLOCK_MONOTONIC, &ts);
|
+ clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||||
+ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
|
+ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
|
||||||
}
|
+}
|
||||||
|
+
|
||||||
+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
|
+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
|
||||||
+{
|
+{
|
||||||
+ int ret;
|
+ int ret;
|
||||||
@ -48125,10 +48125,14 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -223,10 +519,59 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
|
@@ -223,10 +519,58 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
|
||||||
|
|
||||||
static av_cold int v4l2_decode_close(AVCodecContext *avctx)
|
static av_cold int v4l2_decode_close(AVCodecContext *avctx)
|
||||||
{
|
{
|
||||||
|
- V4L2m2mPriv *priv = avctx->priv_data;
|
||||||
|
- V4L2m2mContext *s = priv->context;
|
||||||
|
- av_packet_unref(&s->buf_pkt);
|
||||||
|
- return ff_v4l2_m2m_codec_end(priv);
|
||||||
+ int rv;
|
+ int rv;
|
||||||
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
|
+ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
|
||||||
+ rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
|
+ rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
|
||||||
@ -48146,19 +48150,17 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
+ // possibly limited h/w resources and fails on a Pi for this reason unless
|
+ // possibly limited h/w resources and fails on a Pi for this reason unless
|
||||||
+ // more GPU mem is allocated than is the default.
|
+ // more GPU mem is allocated than is the default.
|
||||||
+
|
+
|
||||||
V4L2m2mPriv *priv = avctx->priv_data;
|
+ V4L2m2mPriv * const priv = avctx->priv_data;
|
||||||
- V4L2m2mContext *s = priv->context;
|
+ V4L2m2mContext * const s = priv->context;
|
||||||
- av_packet_unref(&s->buf_pkt);
|
+ V4L2Context * const output = &s->output;
|
||||||
- return ff_v4l2_m2m_codec_end(priv);
|
+ V4L2Context * const capture = &s->capture;
|
||||||
+ V4L2m2mContext* s = priv->context;
|
|
||||||
+ V4L2Context* output = &s->output;
|
|
||||||
+ V4L2Context* capture = &s->capture;
|
|
||||||
+ int ret, i;
|
+ int ret, i;
|
||||||
+
|
+
|
||||||
+ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
|
+ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
|
||||||
+
|
+
|
||||||
+ if (!output->streamon)
|
+ // Reflushing everything is benign, quick and avoids having to worry about
|
||||||
+ goto done;
|
+ // states like EOS processing so don't try to optimize out (having got it
|
||||||
|
+ // wrong once)
|
||||||
+
|
+
|
||||||
+ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
|
+ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
|
||||||
+ if (ret < 0)
|
+ if (ret < 0)
|
||||||
@ -48182,13 +48184,11 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
+ capture->done = 0;
|
+ capture->done = 0;
|
||||||
+
|
+
|
||||||
+ // Stream on will occur when we actually submit a new frame
|
+ // Stream on will occur when we actually submit a new frame
|
||||||
+
|
|
||||||
+done:
|
|
||||||
+ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
|
+ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define OFFSET(x) offsetof(V4L2m2mPriv, x)
|
#define OFFSET(x) offsetof(V4L2m2mPriv, x)
|
||||||
@@ -235,10 +580,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
|
@@ -235,10 +579,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
|
||||||
static const AVOption options[] = {
|
static const AVOption options[] = {
|
||||||
V4L_M2M_DEFAULT_OPTS,
|
V4L_M2M_DEFAULT_OPTS,
|
||||||
{ "num_capture_buffers", "Number of buffers in the capture context",
|
{ "num_capture_buffers", "Number of buffers in the capture context",
|
||||||
@ -48206,7 +48206,7 @@ index 3e17e0fcac..e774532e36 100644
|
|||||||
#define M2MDEC_CLASS(NAME) \
|
#define M2MDEC_CLASS(NAME) \
|
||||||
static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
|
static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
|
||||||
.class_name = #NAME "_v4l2m2m_decoder", \
|
.class_name = #NAME "_v4l2m2m_decoder", \
|
||||||
@@ -259,9 +610,14 @@ static const AVOption options[] = {
|
@@ -259,9 +609,14 @@ static const AVOption options[] = {
|
||||||
.init = v4l2_decode_init, \
|
.init = v4l2_decode_init, \
|
||||||
.receive_frame = v4l2_receive_frame, \
|
.receive_frame = v4l2_receive_frame, \
|
||||||
.close = v4l2_decode_close, \
|
.close = v4l2_decode_close, \
|
||||||
@ -54715,10 +54715,10 @@ index 5613813ba8..ab8bcfcf34 100644
|
|||||||
+
|
+
|
||||||
diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
|
diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
|
||||||
new file mode 100644
|
new file mode 100644
|
||||||
index 0000000000..641242dd8f
|
index 0000000000..1981e7d46f
|
||||||
--- /dev/null
|
--- /dev/null
|
||||||
+++ b/libavutil/aarch64/rpi_sand_neon.S
|
+++ b/libavutil/aarch64/rpi_sand_neon.S
|
||||||
@@ -0,0 +1,239 @@
|
@@ -0,0 +1,498 @@
|
||||||
+/*
|
+/*
|
||||||
+Copyright (c) 2021 Michael Eiler
|
+Copyright (c) 2021 Michael Eiler
|
||||||
+
|
+
|
||||||
@ -54838,7 +54838,7 @@ index 0000000000..641242dd8f
|
|||||||
+ add w11, w11, #128
|
+ add w11, w11, #128
|
||||||
+ // increment the row counter
|
+ // increment the row counter
|
||||||
+ add w12, w12, #1
|
+ add w12, w12, #1
|
||||||
+
|
+
|
||||||
+ // process the next row if we haven't finished yet
|
+ // process the next row if we haven't finished yet
|
||||||
+ cmp w15, w12
|
+ cmp w15, w12
|
||||||
+ bgt row_loop
|
+ bgt row_loop
|
||||||
@ -54957,13 +54957,272 @@ index 0000000000..641242dd8f
|
|||||||
+ ret
|
+ ret
|
||||||
+endfunc
|
+endfunc
|
||||||
+
|
+
|
||||||
|
+//void ff_rpi_sand30_lines_to_planar_y16(
|
||||||
|
+// uint8_t * dest, // [x0]
|
||||||
|
+// unsigned int dst_stride, // [w1] -> assumed to be equal to _w
|
||||||
|
+// const uint8_t * src, // [x2]
|
||||||
|
+// unsigned int src_stride1, // [w3] -> 128
|
||||||
|
+// unsigned int src_stride2, // [w4]
|
||||||
|
+// unsigned int _x, // [w5]
|
||||||
|
+// unsigned int y, // [w6]
|
||||||
|
+// unsigned int _w, // [w7]
|
||||||
|
+// unsigned int h); // [sp, #0]
|
||||||
|
+
|
||||||
|
+function ff_rpi_sand30_lines_to_planar_y16, export=1
|
||||||
|
+ str x19, [sp, #-8]
|
||||||
|
+ str x20, [sp, #-16]
|
||||||
|
+ str x21, [sp, #-24]
|
||||||
|
+ str x22, [sp, #-32]
|
||||||
|
+ str x23, [sp, #-40]
|
||||||
|
+
|
||||||
|
+ // w6 = argument h
|
||||||
|
+ ldr w6, [sp, #0]
|
||||||
|
+
|
||||||
|
+ // slice_inc = ((stride2 - 1) * stride1)
|
||||||
|
+ mov w5, w4
|
||||||
|
+ sub w5, w5, #1
|
||||||
|
+ lsl w5, w5, #7
|
||||||
|
+
|
||||||
|
+ // total number of bytes per row = (width / 3) * 4
|
||||||
|
+ mov w8, w7
|
||||||
|
+ mov w9, #3
|
||||||
|
+ udiv w8, w8, w9
|
||||||
|
+ lsl w8, w8, #2
|
||||||
|
+
|
||||||
|
+ // number of full 128 byte blocks to be processed
|
||||||
|
+ mov w9, #96
|
||||||
|
+ udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96
|
||||||
|
+
|
||||||
|
+ // w10 = number of full integers to process (4 bytes)
|
||||||
|
+ // w11 = remaning zero to two 10bit values still to copy over
|
||||||
|
+ mov w12, #96
|
||||||
|
+ mul w12, w9, w12
|
||||||
|
+ sub w12, w7, w12 // width - blocks*96 = remaining points per row
|
||||||
|
+ mov w11, #3
|
||||||
|
+ udiv w10, w12, w11 // full integers to process = w12 / 3
|
||||||
|
+ mul w11, w10, w11 // #integers *3
|
||||||
|
+ sub w11, w12, w11 // remaining 0-2 points = remaining points - integers*3
|
||||||
|
+
|
||||||
|
+ // increase w9 by one if w10+w11 is not zero, and decrease the row count by one
|
||||||
|
+ // this is to efficiently copy incomplete blocks at the end of the rows
|
||||||
|
+ // the last row is handled explicitly to avoid writing out of bounds
|
||||||
|
+ add w22, w10, w11
|
||||||
|
+ cmp w22, #0
|
||||||
|
+ cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise
|
||||||
|
+ add w9, w9, w22
|
||||||
|
+ sub w6, w6, #1
|
||||||
|
+
|
||||||
|
+ // store the number of bytes in w20 which we copy too much for every row
|
||||||
|
+ // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values)
|
||||||
|
+ mov w20, #3
|
||||||
|
+ mul w21, w10, w20
|
||||||
|
+ mov w20, #96
|
||||||
|
+ sub w20, w20, w21 // w20 = 96 - #integers*3
|
||||||
|
+ sub w20, w20, w11 // w20 = 96 - #integers*3 - rem. points
|
||||||
|
+ cmp w20, #96
|
||||||
|
+ cset w21, eq
|
||||||
|
+ mov w23, #96
|
||||||
|
+ mul w23, w23, w21 // 0 or 1 * 96
|
||||||
|
+ sub w20, w20, w23 // = w20 mod 96
|
||||||
|
+ lsl w20, w20, #1 // convert to bytes (*2 since we store 16bits per value)
|
||||||
|
+
|
||||||
|
+ mov w23, #0 // flag to check whether the last line had already been processed
|
||||||
|
+
|
||||||
|
+ // bitmask to clear the uppper 6bits of the result values
|
||||||
|
+ mov x19, #0x03ff03ff03ff03ff
|
||||||
|
+ dup v22.2d, x19
|
||||||
|
+
|
||||||
|
+ // row counter = 0
|
||||||
|
+ eor w12, w12, w12
|
||||||
|
+row_loop_y16:
|
||||||
|
+ cmp w12, w6 // jump to row_loop_y16_fin if we processed all rows
|
||||||
|
+ bge row_loop_y16_fin
|
||||||
|
+
|
||||||
|
+ mov x13, x2 // row src
|
||||||
|
+ eor w14, w14, w14 // full block counter
|
||||||
|
+block_loop_y16:
|
||||||
|
+ cmp w14, w9
|
||||||
|
+ bge block_loop_y16_fin
|
||||||
|
+
|
||||||
|
+ // load 64 bytes
|
||||||
|
+ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
|
||||||
|
+
|
||||||
|
+ // process v0 and v1
|
||||||
|
+ xtn v16.4h, v0.4s
|
||||||
|
+ ushr v0.4s, v0.4s, #10
|
||||||
|
+ xtn v17.4h, v0.4s
|
||||||
|
+ ushr v0.4s, v0.4s, #10
|
||||||
|
+ xtn v18.4h, v0.4s
|
||||||
|
+
|
||||||
|
+ xtn2 v16.8h, v1.4s
|
||||||
|
+ and v16.16b, v16.16b, v22.16b
|
||||||
|
+ ushr v1.4s, v1.4s, #10
|
||||||
|
+ xtn2 v17.8h, v1.4s
|
||||||
|
+ and v17.16b, v17.16b, v22.16b
|
||||||
|
+ ushr v1.4s, v1.4s, #10
|
||||||
|
+ xtn2 v18.8h, v1.4s
|
||||||
|
+ and v18.16b, v18.16b, v22.16b
|
||||||
|
+
|
||||||
|
+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
|
||||||
|
+
|
||||||
|
+ // process v2 and v3
|
||||||
|
+ xtn v23.4h, v2.4s
|
||||||
|
+ ushr v2.4s, v2.4s, #10
|
||||||
|
+ xtn v24.4h, v2.4s
|
||||||
|
+ ushr v2.4s, v2.4s, #10
|
||||||
|
+ xtn v25.4h, v2.4s
|
||||||
|
+
|
||||||
|
+ xtn2 v23.8h, v3.4s
|
||||||
|
+ and v23.16b, v23.16b, v22.16b
|
||||||
|
+ ushr v3.4s, v3.4s, #10
|
||||||
|
+ xtn2 v24.8h, v3.4s
|
||||||
|
+ and v24.16b, v24.16b, v22.16b
|
||||||
|
+ ushr v3.4s, v3.4s, #10
|
||||||
|
+ xtn2 v25.8h, v3.4s
|
||||||
|
+ and v25.16b, v25.16b, v22.16b
|
||||||
|
+
|
||||||
|
+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
|
||||||
|
+
|
||||||
|
+ // load the second half of the block -> 64 bytes into registers v4-v7
|
||||||
|
+ ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x13], #64
|
||||||
|
+
|
||||||
|
+ // process v4 and v5
|
||||||
|
+ xtn v16.4h, v4.4s
|
||||||
|
+ ushr v4.4s, v4.4s, #10
|
||||||
|
+ xtn v17.4h, v4.4s
|
||||||
|
+ ushr v4.4s, v4.4s, #10
|
||||||
|
+ xtn v18.4h, v4.4s
|
||||||
|
+
|
||||||
|
+ xtn2 v16.8h, v5.4s
|
||||||
|
+ and v16.16b, v16.16b, v22.16b
|
||||||
|
+ ushr v5.4s, v5.4s, #10
|
||||||
|
+ xtn2 v17.8h, v5.4s
|
||||||
|
+ and v17.16b, v17.16b, v22.16b
|
||||||
|
+ ushr v5.4s, v5.4s, #10
|
||||||
|
+ xtn2 v18.8h, v5.4s
|
||||||
|
+ and v18.16b, v18.16b, v22.16b
|
||||||
|
+
|
||||||
|
+ st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
|
||||||
|
+
|
||||||
|
+ // v6 and v7
|
||||||
|
+ xtn v23.4h, v6.4s
|
||||||
|
+ ushr v6.4s, v6.4s, #10
|
||||||
|
+ xtn v24.4h, v6.4s
|
||||||
|
+ ushr v6.4s, v6.4s, #10
|
||||||
|
+ xtn v25.4h, v6.4s
|
||||||
|
+
|
||||||
|
+ xtn2 v23.8h, v7.4s
|
||||||
|
+ and v23.16b, v23.16b, v22.16b
|
||||||
|
+ ushr v7.4s, v7.4s, #10
|
||||||
|
+ xtn2 v24.8h, v7.4s
|
||||||
|
+ and v24.16b, v24.16b, v22.16b
|
||||||
|
+ ushr v7.4s, v7.4s, #10
|
||||||
|
+ xtn2 v25.8h, v7.4s
|
||||||
|
+ and v25.16b, v25.16b, v22.16b
|
||||||
|
+
|
||||||
|
+ st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
|
||||||
|
+
|
||||||
|
+ add x13, x13, x5 // row src += slice_inc
|
||||||
|
+ add w14, w14, #1
|
||||||
|
+ b block_loop_y16
|
||||||
|
+block_loop_y16_fin:
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+ add x2, x2, #128 // src += stride1 (start of the next row)
|
||||||
|
+ sub x0, x0, x20 // subtract the bytes we copied too much from dst
|
||||||
|
+ add w12, w12, #1
|
||||||
|
+ b row_loop_y16
|
||||||
|
+row_loop_y16_fin:
|
||||||
|
+
|
||||||
|
+ // check whether we have incomplete blocks at the end of every row
|
||||||
|
+ // in that case decrease row block count by one
|
||||||
|
+ // change height back to it's original value (meaning increase it by 1)
|
||||||
|
+ // and jump back to another iteration of row_loop_y16
|
||||||
|
+
|
||||||
|
+ cmp w23, #1
|
||||||
|
+ beq row_loop_y16_fin2 // don't continue here if we already processed the last row
|
||||||
|
+ add w6, w6, #1 // increase height to the original value
|
||||||
|
+ sub w9, w9, w22 // block count - 1 or 0, depending on the remaining bytes count
|
||||||
|
+ mov w23, #1
|
||||||
|
+ b row_loop_y16
|
||||||
|
+row_loop_y16_fin2:
|
||||||
|
+
|
||||||
|
+ add x0, x0, x20 // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference
|
||||||
|
+
|
||||||
|
+ // now we've got to handle the last block in the last row
|
||||||
|
+ eor w12, w12, w12 // w12 = 0 = counter
|
||||||
|
+integer_loop_y16:
|
||||||
|
+ cmp w12, w10
|
||||||
|
+ bge integer_loop_y16_fin
|
||||||
|
+ ldr w14, [x2], #4
|
||||||
|
+ and w15, w14, #0x3ff
|
||||||
|
+ strh w15, [x0], #2
|
||||||
|
+ lsr w14, w14, #10
|
||||||
|
+ and w15, w14, #0x3ff
|
||||||
|
+ strh w15, [x0], #2
|
||||||
|
+ lsr w14, w14, #10
|
||||||
|
+ and w15, w14, #0x3ff
|
||||||
|
+ strh w15, [x0], #2
|
||||||
|
+ add w12, w12, #1
|
||||||
|
+ b integer_loop_y16
|
||||||
|
+integer_loop_y16_fin:
|
||||||
|
+
|
||||||
|
+final_values_y16:
|
||||||
|
+ // remaining point count = w11
|
||||||
|
+ ldr w14, [x2], #4
|
||||||
|
+ cmp w11, #0
|
||||||
|
+ beq final_values_y16_fin
|
||||||
|
+ and w15, w14, #0x3ff
|
||||||
|
+ strh w15, [x0], #2
|
||||||
|
+ cmp w11, #1
|
||||||
|
+ beq final_values_y16_fin
|
||||||
|
+ lsr w14, w14, #10
|
||||||
|
+ and w15, w14, #0x3ff
|
||||||
|
+ strh w15, [x0], #2
|
||||||
|
+final_values_y16_fin:
|
||||||
|
+
|
||||||
|
+ ldr x23, [sp, #-40]
|
||||||
|
+ ldr x22, [sp, #-32]
|
||||||
|
+ ldr x21, [sp, #-24]
|
||||||
|
+ ldr x20, [sp, #-16]
|
||||||
|
+ ldr x19, [sp, #-8]
|
||||||
|
+
|
||||||
|
+ ret
|
||||||
|
+endfunc
|
||||||
|
+
|
||||||
|
+//void ff_rpi_sand30_lines_to_planar_c16(
|
||||||
|
+// uint8_t * dst_u,
|
||||||
|
+// unsigned int dst_stride_u,
|
||||||
|
+// uint8_t * dst_v,
|
||||||
|
+// unsigned int dst_stride_v,
|
||||||
|
+// const uint8_t * src,
|
||||||
|
+// unsigned int stride1,
|
||||||
|
+// unsigned int stride2,
|
||||||
|
+// unsigned int _x,
|
||||||
|
+// unsigned int y,
|
||||||
|
+// unsigned int _w,
|
||||||
|
+// unsigned int h);
|
||||||
|
+
|
||||||
|
+//void ff_rpi_sand30_lines_to_planar_p010(
|
||||||
|
+// uint8_t * dest,
|
||||||
|
+// unsigned int dst_stride,
|
||||||
|
+// const uint8_t * src,
|
||||||
|
+// unsigned int src_stride1,
|
||||||
|
+// unsigned int src_stride2,
|
||||||
|
+// unsigned int _x,
|
||||||
|
+// unsigned int y,
|
||||||
|
+// unsigned int _w,
|
||||||
|
+// unsigned int h);
|
||||||
|
+
|
||||||
+
|
+
|
||||||
diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
|
diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
|
||||||
new file mode 100644
|
new file mode 100644
|
||||||
index 0000000000..2894ce5aa3
|
index 0000000000..d820057624
|
||||||
--- /dev/null
|
--- /dev/null
|
||||||
+++ b/libavutil/aarch64/rpi_sand_neon.h
|
+++ b/libavutil/aarch64/rpi_sand_neon.h
|
||||||
@@ -0,0 +1,47 @@
|
@@ -0,0 +1,51 @@
|
||||||
+/*
|
+/*
|
||||||
+Copyright (c) 2021 Michael Eiler
|
+Copyright (c) 2021 Michael Eiler
|
||||||
+
|
+
|
||||||
@ -55007,6 +55266,10 @@ index 0000000000..2894ce5aa3
|
|||||||
+ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
|
+ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
|
||||||
+ unsigned int _w, unsigned int h);
|
+ unsigned int _w, unsigned int h);
|
||||||
+
|
+
|
||||||
|
+void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
|
||||||
|
+ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
|
||||||
|
+ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
|
||||||
|
+
|
||||||
+#ifdef __cplusplus
|
+#ifdef __cplusplus
|
||||||
+}
|
+}
|
||||||
+#endif
|
+#endif
|
||||||
@ -56495,7 +56758,7 @@ index 0000000000..0324f6826d
|
|||||||
+
|
+
|
||||||
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
|
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
|
||||||
new file mode 100644
|
new file mode 100644
|
||||||
index 0000000000..038c306877
|
index 0000000000..64c34ced56
|
||||||
--- /dev/null
|
--- /dev/null
|
||||||
+++ b/libavutil/rpi_sand_fns.c
|
+++ b/libavutil/rpi_sand_fns.c
|
||||||
@@ -0,0 +1,357 @@
|
@@ -0,0 +1,357 @@
|
||||||
@ -56598,7 +56861,7 @@ index 0000000000..038c306877
|
|||||||
+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
|
+ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
|
||||||
+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
|
+ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
|
||||||
+
|
+
|
||||||
+#if HAVE_SAND_ASM
|
+#if HAVE_SAND_ASM || HAVE_SAND_ASM64
|
||||||
+ if (_x == 0) {
|
+ if (_x == 0) {
|
||||||
+ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
|
+ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
|
||||||
+ return;
|
+ return;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user