ffmpeg: update upstream patches

Signed-off-by: Stephan Raue <stephan@openelec.tv>
This commit is contained in:
Stephan Raue 2013-08-25 22:59:41 +02:00
parent 97f129b29d
commit 96f69f6abd
3 changed files with 490 additions and 0 deletions

View File

@ -0,0 +1,40 @@
From 90ac611930e202c8255887c6b3e82ff97a1117f0 Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Thu, 22 Aug 2013 14:34:37 +0100
Subject: [PATCH] h264_parser: Initialize the h264dsp context in the parser as
well
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Each AVStream struct for an H.264 elementary stream actually has two
copies of the H264DSPContext struct (and in fact all the other members
of H264Context as well):
((H264Context *) ((AVStream *)st)->codec->priv_data)->h264dsp
((H264Context *) ((AVStream *)st)->parser->priv_data)->h264dsp
but only the first of these was actually being initialised. This
prevented the addition of platform-specific implementations of
parser-related functions.
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/h264_parser.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index c9dea81..98e2844 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -377,6 +377,7 @@ static int init(AVCodecParserContext *s)
H264Context *h = s->priv_data;
h->thread_context[0] = h;
h->s.slice_context_count = 1;
+ ff_h264dsp_init(&h->h264dsp, 8, 1);
return 0;
}
--
1.8.1.6

View File

@ -0,0 +1,129 @@
From c58d69bea87c77c7a1e8221624137beaaa670586 Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Thu, 22 Aug 2013 14:37:18 +0100
Subject: [PATCH] h264dsp: Factorize code into a new function,
h264_find_start_code_candidate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This performs the start code search which was previously part of
h264_find_frame_end() - the most CPU intensive part of the function.
By itself, this results in a performance regression:
Before After
Mean StdDev Mean StdDev Change
Overall time 2925.6 26.2 3068.5 31.7 -4.7%
but this can more than be made up for by platform-optimised
implementations of the function.
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/h264_parser.c | 21 +++------------------
libavcodec/h264dsp.c | 29 +++++++++++++++++++++++++++++
libavcodec/h264dsp.h | 9 +++++++++
3 files changed, 41 insertions(+), 18 deletions(-)
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index 98e2844..41c874d 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -65,24 +65,9 @@ static int ff_h264_find_frame_end(H264Context *h, const uint8_t *buf, int buf_si
}
if(state==7){
-#if HAVE_FAST_UNALIGNED
- /* we check i<buf_size instead of i+3/7 because its simpler
- * and there should be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end
- */
-# if HAVE_FAST_64BIT
- while(i<next_avc && !((~*(const uint64_t*)(buf+i) & (*(const uint64_t*)(buf+i) - 0x0101010101010101ULL)) & 0x8080808080808080ULL))
- i+=8;
-# else
- while(i<next_avc && !((~*(const uint32_t*)(buf+i) & (*(const uint32_t*)(buf+i) - 0x01010101U)) & 0x80808080U))
- i+=4;
-# endif
-#endif
- for(; i<next_avc; i++){
- if(!buf[i]){
- state=2;
- break;
- }
- }
+ i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i);
+ if (i < buf_size)
+ state = 2;
}else if(state<=2){
if(buf[i]==1) state^= 5; //2->7, 1->4, 0->5
else if(buf[i]) state = 7;
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index bd35aa3..0703172 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -41,6 +41,34 @@
#include "h264dsp_template.c"
#undef BIT_DEPTH
+static int h264_find_start_code_candidate_c(const uint8_t *buf, int size)
+{
+ int i = 0;
+#if HAVE_FAST_UNALIGNED
+ /* we check i < size instead of i + 3 / 7 because it is
+ * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE
+ * bytes at the end.
+ */
+#if HAVE_FAST_64BIT
+ while (i < size &&
+ !((~*(const uint64_t *)(buf + i) &
+ (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
+ 0x8080808080808080ULL))
+ i += 8;
+#else
+ while (i < size &&
+ !((~*(const uint32_t *)(buf + i) &
+ (*(const uint32_t *)(buf + i) - 0x01010101U)) &
+ 0x80808080U))
+ i += 4;
+#endif
+#endif
+ for (; i < size; i++)
+ if (!buf[i])
+ break;
+ return i;
+}
+
void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
{
#undef FUNC
@@ -110,6 +138,7 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
H264_DSP(8);
break;
}
+ c->h264_find_start_code_candidate = h264_find_start_code_candidate_c;
if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 490a936..1d172f1 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -74,6 +74,15 @@ typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int h
void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[15*8]);
void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
void (*h264_chroma_dc_dequant_idct)(DCTELEM *block, int qmul);
+
+ /**
+ * Search buf from the start for up to size bytes. Return the index
+ * of a zero byte, or >= size if not found. Ideally, use lookahead
+ * to filter out any zero bytes that are known to not be followed by
+ * one or more further zero bytes and a one byte. Better still, filter
+ * out any bytes that form the trailing_zero_8bits syntax element too.
+ */
+ int (*h264_find_start_code_candidate)(const uint8_t *buf, int size);
}H264DSPContext;
void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc);
--
1.8.1.6

View File

@ -0,0 +1,321 @@
From 4b8f90c0702324a9fcc3909c8bf7d80f090c41ba Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Thu, 22 Aug 2013 14:39:21 +0100
Subject: [PATCH] arm: Add assembly version of h264_find_start_code_candidate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Before After
Mean StdDev Mean StdDev Change
This function 508.8 23.4 185.4 9.0 +174.4%
Overall 3068.5 31.7 2752.1 29.4 +11.5%
In combination with the preceding patch:
Before After
Mean StdDev Mean StdDev Change
Overall 2925.6 26.2 2752.1 29.4 +6.3%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/h264dsp_armv6.S | 253 +++++++++++++++++++++++++++
libavcodec/arm/h264dsp_init_arm.c | 4 +
3 files changed, 258 insertions(+)
create mode 100644 libavcodec/arm/h264dsp_armv6.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 798dfe3..8f2b683 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -4,6 +4,7 @@ OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
+ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
diff --git a/libavcodec/arm/h264dsp_armv6.S b/libavcodec/arm/h264dsp_armv6.S
new file mode 100644
index 0000000..c2cafc2
--- /dev/null
+++ b/libavcodec/arm/h264dsp_armv6.S
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+RESULT .req a1
+BUF .req a1
+SIZE .req a2
+PATTERN .req a3
+PTR .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+TMP0 .req v5
+TMP1 .req v6
+TMP2 .req ip
+TMP3 .req lr
+
+#define PRELOAD_DISTANCE 4
+
+.macro innerloop4
+ ldr DAT0, [PTR], #4
+ subs SIZE, SIZE, #4 @ C flag survives rest of macro
+ sub TMP0, DAT0, PATTERN, lsr #14
+ bic TMP0, TMP0, DAT0
+ ands TMP0, TMP0, PATTERN
+.endm
+
+.macro innerloop16 decrement, do_preload
+ ldmia PTR!, {DAT0,DAT1,DAT2,DAT3}
+ .ifnc "\do_preload",""
+ pld [PTR, #PRELOAD_DISTANCE*32]
+ .endif
+ .ifnc "\decrement",""
+ subs SIZE, SIZE, #\decrement @ C flag survives rest of macro
+ .endif
+ sub TMP0, DAT0, PATTERN, lsr #14
+ sub TMP1, DAT1, PATTERN, lsr #14
+ bic TMP0, TMP0, DAT0
+ bic TMP1, TMP1, DAT1
+ sub TMP2, DAT2, PATTERN, lsr #14
+ sub TMP3, DAT3, PATTERN, lsr #14
+ ands TMP0, TMP0, PATTERN
+ bic TMP2, TMP2, DAT2
+ it eq
+ andseq TMP1, TMP1, PATTERN
+ bic TMP3, TMP3, DAT3
+ itt eq
+ andseq TMP2, TMP2, PATTERN
+ andseq TMP3, TMP3, PATTERN
+.endm
+
+/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */
+function ff_h264_find_start_code_candidate_armv6, export=1
+ push {v1-v6,lr}
+ mov PTR, BUF
+ @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
+ @ before using code that does preloads
+ cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
+ blo 60f
+
+ @ Get to word-alignment, 1 byte at a time
+ tst PTR, #3
+ beq 2f
+1: ldrb DAT0, [PTR], #1
+ sub SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ tst PTR, #3
+ bne 1b
+2: @ Get to 4-word alignment, 1 word at a time
+ ldr PATTERN, =0x80008000
+ setend be
+ tst PTR, #12
+ beq 4f
+3: innerloop4
+ bne 91f
+ tst PTR, #12
+ bne 3b
+4: @ Get to cacheline (8-word) alignment
+ tst PTR, #16
+ beq 5f
+ innerloop16 16
+ bne 93f
+5: @ Check complete cachelines, with preloading
+ @ We need to stop when there are still (PRELOAD_DISTANCE+1)
+ @ complete cachelines to go
+ sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
+6: innerloop16 , do_preload
+ bne 93f
+ innerloop16 32
+ bne 93f
+ bcs 6b
+ @ Preload trailing part-cacheline, if any
+ tst SIZE, #31
+ beq 7f
+ pld [PTR, #(PRELOAD_DISTANCE+1)*32]
+ @ Check remaining data without doing any more preloads. First
+ @ do in chunks of 4 words:
+7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
+ bmi 9f
+8: innerloop16 16
+ bne 93f
+ bcs 8b
+ @ Then in words:
+9: adds SIZE, SIZE, #16 - 4
+ bmi 11f
+10: innerloop4
+ bne 91f
+ bcs 10b
+11: setend le
+ @ Check second byte of final halfword
+ ldrb DAT0, [PTR, #-1]
+ teq DAT0, #0
+ beq 90f
+ @ Check any remaining bytes
+ tst SIZE, #3
+ beq 13f
+12: ldrb DAT0, [PTR], #1
+ sub SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ tst SIZE, #3
+ bne 12b
+ @ No candidate found
+13: sub RESULT, PTR, BUF
+ b 99f
+
+60: @ Small buffer - simply check by looping over bytes
+ subs SIZE, SIZE, #1
+ bcc 99f
+61: ldrb DAT0, [PTR], #1
+ subs SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ bcs 61b
+ @ No candidate found
+ sub RESULT, PTR, BUF
+ b 99f
+
+90: @ Found a candidate at the preceding byte
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #1
+ b 99f
+
+91: @ Found a candidate somewhere in the preceding 4 bytes
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #4
+ sub TMP0, DAT0, #0x20000
+ bics TMP0, TMP0, DAT0
+ itt pl
+ ldrbpl DAT0, [PTR, #-3]
+ addpl RESULT, RESULT, #2
+ bpl 92f
+ teq RESULT, #0
+ beq 98f @ don't look back a byte if found at first byte in buffer
+ ldrb DAT0, [PTR, #-5]
+92: teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+
+93: @ Found a candidate somewhere in the preceding 16 bytes
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #16
+ teq TMP0, #0
+ beq 95f @ not in first 4 bytes
+ sub TMP0, DAT0, #0x20000
+ bics TMP0, TMP0, DAT0
+ itt pl
+ ldrbpl DAT0, [PTR, #-15]
+ addpl RESULT, RESULT, #2
+ bpl 94f
+ teq RESULT, #0
+ beq 98f @ don't look back a byte if found at first byte in buffer
+ ldrb DAT0, [PTR, #-17]
+94: teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+95: add RESULT, RESULT, #4
+ teq TMP1, #0
+ beq 96f @ not in next 4 bytes
+ sub TMP1, DAT1, #0x20000
+ bics TMP1, TMP1, DAT1
+ itee mi
+ ldrbmi DAT0, [PTR, #-13]
+ ldrbpl DAT0, [PTR, #-11]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+96: add RESULT, RESULT, #4
+ teq TMP2, #0
+ beq 97f @ not in next 4 bytes
+ sub TMP2, DAT2, #0x20000
+ bics TMP2, TMP2, DAT2
+ itee mi
+ ldrbmi DAT0, [PTR, #-9]
+ ldrbpl DAT0, [PTR, #-7]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+97: add RESULT, RESULT, #4
+ sub TMP3, DAT3, #0x20000
+ bics TMP3, TMP3, DAT3
+ itee mi
+ ldrbmi DAT0, [PTR, #-5]
+ ldrbpl DAT0, [PTR, #-3]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ @ drop through to 98f
+98: setend le
+99: pop {v1-v6,pc}
+.endfunc
+
+ .unreq RESULT
+ .unreq BUF
+ .unreq SIZE
+ .unreq PATTERN
+ .unreq PTR
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq TMP0
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index cc4c688..c282c21 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -23,6 +23,8 @@
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
+int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
+
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
@@ -99,5 +101,7 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
{
+ if (HAVE_ARMV6)
+ c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6;
if (HAVE_NEON) ff_h264dsp_init_neon(c, bit_depth, chroma_format_idc);
}
--
1.8.1.6