From 847cb16c0f54083a2120846891443b8e5a142fd9 Mon Sep 17 00:00:00 2001
From: Matthias Reichl <hias@horus.com>
Date: Tue, 16 Feb 2021 16:47:04 +0100
Subject: [PATCH] ffmpeg: update rpi patch

Patch created using revisions 922f5ee..3497613
from branch dev/4.3.1/drm_prime_1 of https://github.com/jc-kynesim/rpi-ffmpeg
---
 .../ffmpeg/patches/rpi/ffmpeg-001-rpi.patch   | 278 ++++++++++++++++--
 1 file changed, 253 insertions(+), 25 deletions(-)

diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
index 0bb3199ffe..b54dc78905 100644
--- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
+++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
@@ -47232,7 +47232,7 @@ index 8dbc7fc104..46ca85ce65 100644
   * Extracts the data from an AVFrame to a V4L2Buffer
   *
 diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 29b144ed73..97956eeb2b 100644
+index 29b144ed73..a8590d0ea1 100644
 --- a/libavcodec/v4l2_context.c
 +++ b/libavcodec/v4l2_context.c
 @@ -173,7 +173,8 @@ static int v4l2_handle_event(V4L2Context *ctx)
@@ -47245,6 +47245,24 @@ index 29b144ed73..97956eeb2b 100644
          return 0;
      }
  
+@@ -196,15 +197,15 @@ static int v4l2_handle_event(V4L2Context *ctx)
+     if (full_reinit) {
+         s->output.height = v4l2_get_height(&out_fmt);
+         s->output.width = v4l2_get_width(&out_fmt);
+-        s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
+     }
++    s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
+ 
+     reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
+     if (reinit) {
+         s->capture.height = v4l2_get_height(&cap_fmt);
+         s->capture.width = v4l2_get_width(&cap_fmt);
+-        s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
+     }
++    s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
+ 
+     if (full_reinit || reinit)
+         s->reinit = 1;
 @@ -280,6 +281,21 @@ static int v4l2_stop_encode(V4L2Context *ctx)
      return 0;
  }
@@ -50089,10 +50107,10 @@ index 0000000000..d6332c01c7
 +};
 diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
 new file mode 100644
-index 0000000000..1c675d6dee
+index 0000000000..2e21145328
 --- /dev/null
 +++ b/libavcodec/v4l2_request_hevc.c
-@@ -0,0 +1,652 @@
+@@ -0,0 +1,675 @@
 +/*
 + * This file is part of FFmpeg.
 + *
@@ -50226,6 +50244,24 @@ index 0000000000..1c675d6dee
 +    return 0;
 +}
 +
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx) // returns b advanced by idx counted bytes plus every 0x03 skipped below
++{
++    unsigned int z = 0; // length of the current run of zero bytes
++    while (idx--) { // one counted byte per iteration
++        if (*b++ == 0) {
++            ++z;
++            if (z >= 2 && *b == 3) { // 0x03 after two or more zeros: looks like an emulation prevention byte - TODO confirm
++                ++b; // step over it without counting it against idx
++                z = 0;
++            }
++        }
++        else {
++            z = 0;
++        }
++    }
++    return b; // NOTE(review): no length is passed in, so neither *b++ nor the *b peek is bounds-checked
++}
++
 +static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h,
 +                                                struct v4l2_ctrl_hevc_slice_params *slice_params)
 +{
@@ -50235,8 +50271,8 @@ index 0000000000..1c675d6dee
 +    RefPicList *rpl;
 +
 +    *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
-+        .bit_size = 0,
-+        .data_bit_offset = get_bits_count(&h->HEVClc->gb),
++        .bit_size = 0, // Set later
++        .data_bit_offset = 0, // Set later
 +
 +        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
 +        .slice_segment_addr = sh->slice_segment_addr,
@@ -50564,6 +50600,8 @@ index 0000000000..1c675d6dee
 +    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
 +    V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->ref->frame->data[0];
 +    int ret, slice = FFMIN(controls->num_slices, MAX_SLICES - 1);
++    int bcount = get_bits_count(&h->HEVClc->gb);
++    uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
 +
 +    if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && slice) {
 +        ret = v4l2_request_hevc_queue_decode(avctx, 0);
@@ -50578,16 +50616,19 @@ index 0000000000..1c675d6dee
 +    v4l2_request_hevc_fill_slice_params(h, &controls->slice_params[slice]);
 +
 +    if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
++        // ?? Do we really not need the nal type ??
 +        ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, nalu_slice_start_code, 3);
 +        if (ret)
 +            return ret;
 +    }
++    boff += req->output.used * 8;
 +
 +    ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, buffer, size);
 +    if (ret)
 +        return ret;
 +
 +    controls->slice_params[slice].bit_size = req->output.used * 8; //FIXME
++    controls->slice_params[slice].data_bit_offset = boff; //FIXME
 +    controls->num_slices++;
 +    return 0;
 +}
@@ -54715,10 +54756,10 @@ index 5613813ba8..ab8bcfcf34 100644
 +
 diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
 new file mode 100644
-index 0000000000..1981e7d46f
+index 0000000000..5922d6eaf5
 --- /dev/null
 +++ b/libavutil/aarch64/rpi_sand_neon.S
-@@ -0,0 +1,498 @@
+@@ -0,0 +1,681 @@
 +/*
 +Copyright (c) 2021 Michael Eiler
 +
@@ -55156,7 +55197,7 @@ index 0000000000..1981e7d46f
 +integer_loop_y16:
 +    cmp w12, w10
 +    bge integer_loop_y16_fin
-+    ldr w14, [x2], #4
++    ldr w14, [x13], #4
 +    and w15, w14, #0x3ff
 +    strh w15, [x0], #2
 +    lsr w14, w14, #10
@@ -55171,7 +55212,7 @@ index 0000000000..1981e7d46f
 +
 +final_values_y16:
 +    // remaining point count = w11
-+    ldr w14, [x2], #4
++    ldr w14, [x13], #4
 +    cmp w11, #0
 +    beq final_values_y16_fin
 +    and w15, w14, #0x3ff
@@ -55193,17 +55234,201 @@ index 0000000000..1981e7d46f
 +endfunc
 +
 +//void ff_rpi_sand30_lines_to_planar_c16(
-+//  uint8_t * dst_u,
-+//  unsigned int dst_stride_u,
-+//  uint8_t * dst_v,
-+//  unsigned int dst_stride_v,
-+//  const uint8_t * src,
-+//  unsigned int stride1,
-+//  unsigned int stride2,
-+//  unsigned int _x,
-+//  unsigned int y,
-+//  unsigned int _w,
-+//  unsigned int h);
++//  uint8_t * dst_u,            // [x0]
++//  unsigned int dst_stride_u,  // [w1] == _w*2
++//  uint8_t * dst_v,            // [x2]
++//  unsigned int dst_stride_v,  // [w3] == _w*2
++//  const uint8_t * src,        // [x4]
++//  unsigned int stride1,       // [w5] == 128
++//  unsigned int stride2,       // [w6] 
++//  unsigned int _x,            // [w7] == 0
++//  unsigned int y,             // [sp, #0] == 0
++//  unsigned int _w,            // [sp, #8] -> w3
++//  unsigned int h);            // [sp, #16] -> w7
++
++.macro rpi_sand30_lines_to_planar_c16_block_half
++    // Unpack 16 words from [x13] (three 10-bit samples each) into 48 halfwords at [x15].
++    // In: v16 = 0x03ff in every halfword. Out: x13 += 64, x15 += 96. Clobbers v0-v6.
++    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64
++    xtn v4.4h, v0.4s                // v4 = bits 0-15 of each word (sample 0 plus spare bits)
++    ushr v0.4s, v0.4s, #10
++    xtn v5.4h, v0.4s                // v5 = sample 1 in the low bits
++    ushr v0.4s, v0.4s, #10
++    xtn v6.4h, v0.4s                // v6 = sample 2 in the low bits
++    xtn2 v4.8h, v1.4s
++    ushr v1.4s, v1.4s, #10
++    xtn2 v5.8h, v1.4s
++    ushr v1.4s, v1.4s, #10
++    xtn2 v6.8h, v1.4s
++    and v4.16b, v4.16b, v16.16b     // keep the low 10 bits of every halfword
++    and v5.16b, v5.16b, v16.16b
++    and v6.16b, v6.16b, v16.16b
++    st3 { v4.8h, v5.8h, v6.8h }, [x15], #48   // interleave back into sample order
++
++    xtn v4.4h, v2.4s
++    ushr v2.4s, v2.4s, #10
++    xtn v5.4h, v2.4s
++    ushr v2.4s, v2.4s, #10
++    xtn v6.4h, v2.4s
++    xtn2 v4.8h, v3.4s
++    ushr v3.4s, v3.4s, #10
++    xtn2 v5.8h, v3.4s
++    ushr v3.4s, v3.4s, #10
++    xtn2 v6.8h, v3.4s
++    and v4.16b, v4.16b, v16.16b
++    and v5.16b, v5.16b, v16.16b
++    and v6.16b, v6.16b, v16.16b
++    st3 { v4.8h, v5.8h, v6.8h }, [x15], #48
++.endm
++
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++    // Splits packed 10-bit samples into 16-bit planes: even samples go to dst_u (x0),
++    // odd samples to dst_v (x2). w1, w5 and the stack argument y are never read.
++    ldr w3, [sp, #8]    // w3 = width, replaces dst_stride_v (read before sp moves)
++    ldr w7, [sp, #16]   // w7 = height, replaces _x
++    // reserve 256 bytes: [sp, #0..#191] is scratch, [sp, #216..#255] holds x19-x23
++    sub sp, sp, #256
++    str x19, [sp, #248] // saved inside the reserved area, never below sp
++    str x20, [sp, #240]
++    str x21, [sp, #232]
++    str x22, [sp, #224]
++    str x23, [sp, #216]
++
++    // number of 128byte blocks per row, w8 = width / 48
++    mov w9, #48
++    udiv w8, w3, w9
++
++    // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
++    mul w9, w8, w9
++    sub w9, w3, w9
++
++    // row offset, the beginning of the next row to process
++    eor w10, w10, w10
++
++    // offset to the beginning of the next block, w11 = stride2 * 128 - 128
++    lsl w11, w6, #7
++    sub w11, w11, #128
++
++    // first pass: rows 0..h-2, converting a trailing partial block in full (the excess is overwritten by the next row)
++    sub w7, w7, #1
++    cmp w9, #0
++    cset w19, ne    // w19 == 1 iff remaining pixels != 0
++    add w8, w8, w19
++
++    // bytes we have to move dst back by at the end of every row
++    mov w21, #48
++    mul w21, w21, w19
++    sub w21, w21, w9
++    lsl w21, w21, #1    // w21 = (#48 * w19 - rem_pix) * 2
++
++    mov w20, #0     // w20 = flag, last row processed
++
++    mov x12, #0x03ff03ff03ff03ff
++    dup v16.2d, x12
++
++    // iterate through rows, row counter = w12 = 0
++    eor w12, w12, w12
++row_loop_c16:
++    cmp w12, w7
++    bge row_loop_c16_fin
++
++    // address of row data = src + row_offset
++    mov x13, x4
++    add x13, x13, x10
++
++    eor w14, w14, w14
++block_loop_c16:
++    cmp w14, w8
++    bge block_loop_c16_fin
++
++    mov x15, sp     // first half of the block: 48 halfwords to scratch
++    rpi_sand30_lines_to_planar_c16_block_half
++    mov x15, sp     // read them back, de-interleaving even and odd samples
++    ld2 { v0.8h, v1.8h }, [x15], #32
++    ld2 { v2.8h, v3.8h }, [x15], #32
++    ld2 { v4.8h, v5.8h }, [x15]
++
++    st1 { v0.8h }, [x0], #16
++    st1 { v2.8h }, [x0], #16
++    st1 { v4.8h }, [x0], #16
++    st1 { v1.8h }, [x2], #16
++    st1 { v3.8h }, [x2], #16
++    st1 { v5.8h }, [x2], #16
++
++    mov x15, sp     // second half of the block, same scratch area
++    rpi_sand30_lines_to_planar_c16_block_half
++    mov x15, sp
++    ld2 { v0.8h, v1.8h }, [x15], #32
++    ld2 { v2.8h, v3.8h }, [x15], #32
++    ld2 { v4.8h, v5.8h }, [x15]
++
++    st1 { v0.8h }, [x0], #16
++    st1 { v2.8h }, [x0], #16
++    st1 { v4.8h }, [x0], #16
++    st1 { v1.8h }, [x2], #16
++    st1 { v3.8h }, [x2], #16
++    st1 { v5.8h }, [x2], #16
++
++    add x13, x13, x11 // offset to next block
++    add w14, w14, #1
++    b block_loop_c16
++block_loop_c16_fin:
++
++    add w10, w10, #128
++    add w12, w12, #1
++    sub x0, x0, x21  // move dst pointers back by x21
++    sub x2, x2, x21
++    b row_loop_c16
++row_loop_c16_fin:
++
++    cmp w20, #1
++    beq row_loop_c16_fin2
++    mov w20, #1
++    sub w8, w8, w19 // decrease block count by w19
++    add w7, w7, #1 // increase height
++    b row_loop_c16
++
++row_loop_c16_fin2:
++    add x0, x0, x21 // readd x21 in case of the last row
++    add x2, x2, x21 // so that we can write out the few remaining pixels
++
++    // last incomplete block to be finished
++    // read operations are fine, stride2 is more than large enough even if rem_pix is 0
++    // the macro advances x15, so the two halves land back to back and the
++    // whole block ends up as 96 halfwords at [sp, #0..#191]
++    mov x15, sp
++    rpi_sand30_lines_to_planar_c16_block_half
++    rpi_sand30_lines_to_planar_c16_block_half
++
++    // scratch now holds interleaved 16-bit pairs; copy out rem_pix of them.
++    // halfword stores are used so that the last pair does not write two
++    // bytes past the end of either destination row
++    mov x4, sp
++    eor w20, w20, w20
++rem_pix_c16_loop:
++    cmp w20, w9
++    bge rem_pix_c16_fin
++
++    // w22 = sample for dst_u in bits 0-15, sample for dst_v in bits 16-31
++    ldr w22, [x4], #4
++    strh w22, [x0], #2
++    lsr w22, w22, #16
++    strh w22, [x2], #2
++
++    add w20, w20, #1
++    b rem_pix_c16_loop
++rem_pix_c16_fin:
++
++    ldr x23, [sp, #216] // restore while the frame is still reserved
++    ldr x22, [sp, #224]
++    ldr x21, [sp, #232]
++    ldr x20, [sp, #240]
++    ldr x19, [sp, #248]
++    add sp, sp, #256
++    ret
++endfunc
++
++
 +
 +//void ff_rpi_sand30_lines_to_planar_p010(
 +//  uint8_t * dest,
@@ -55216,13 +55441,12 @@ index 0000000000..1981e7d46f
 +//  unsigned int _w,
 +//  unsigned int h);
 +
-+
 diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
 new file mode 100644
-index 0000000000..d820057624
+index 0000000000..b3aa481ea4
 --- /dev/null
 +++ b/libavutil/aarch64/rpi_sand_neon.h
-@@ -0,0 +1,51 @@
+@@ -0,0 +1,55 @@
 +/*
 +Copyright (c) 2021 Michael Eiler
 +
@@ -55270,6 +55494,10 @@ index 0000000000..d820057624
 +  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
 +  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
 +
++void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u, // implemented in rpi_sand_neon.S; writes 16-bit samples to dst_u and dst_v
++  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
++  unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h); // NOTE(review): the asm never reads dst_stride_u, dst_stride_v, stride1, _x or y
++
 +#ifdef __cplusplus
 +}
 +#endif
@@ -56758,7 +56986,7 @@ index 0000000000..0324f6826d
 +
 diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
 new file mode 100644
-index 0000000000..64c34ced56
+index 0000000000..4256adf9c8
 --- /dev/null
 +++ b/libavutil/rpi_sand_fns.c
 @@ -0,0 +1,357 @@
@@ -56927,7 +57155,7 @@ index 0000000000..64c34ced56
 +    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
 +    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
 +
-+#if HAVE_SAND_ASM
++#if HAVE_SAND_ASM || HAVE_SAND_ASM64
 +    if (_x == 0) {
 +        ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
 +                                       src, stride1, stride2, _x, y, _w, h);