ffmpeg: more patches

This commit is contained in:
Stefan Saraev 2013-12-09 21:08:57 +02:00
parent 10cad8e040
commit 78297b67fd
3 changed files with 503 additions and 0 deletions

View File

@ -0,0 +1,40 @@
From 6f99f0779ea56e4bfe40f7ca56e60b3dfd84eba6 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 5 Aug 2013 13:12:46 +0100
Subject: [PATCH] h264_parser: Initialize the h264dsp context in the parser as
well
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Each AVStream struct for an H.264 elementary stream actually has two
copies of the H264DSPContext struct (and in fact all the other members
of H264Context as well):
((H264Context *) ((AVStream *)st)->codec->priv_data)->h264dsp
((H264Context *) ((AVStream *)st)->parser->priv_data)->h264dsp
but only the first of these was actually being initialised. This
prevented the addition of platform-specific implementations of
parser-related functions.
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/h264_parser.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index aff9ba1..a732f79 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -386,6 +386,7 @@ static int init(AVCodecParserContext *s)
H264Context *h = s->priv_data;
h->thread_context[0] = h;
h->slice_context_count = 1;
+ ff_h264dsp_init(&h->h264dsp, 8, 1);
return 0;
}
--
1.8.5.1

View File

@ -0,0 +1,128 @@
From 971a57f6067c96f8dba087285065618f1ac3ecd5 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 5 Aug 2013 13:12:47 +0100
Subject: [PATCH] h264dsp: Factorize code into a new function,
h264_find_start_code_candidate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This performs the start code search which was previously part of
h264_find_frame_end() - the most CPU intensive part of the function.
By itself, this results in a performance regression:
              Before          After
              Mean    StdDev  Mean    StdDev  Change
Overall time  2925.6  26.2    3068.5  31.7    -4.7%
but this can more than be made up for by platform-optimised
implementations of the function.
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/h264_parser.c | 20 +++-----------------
libavcodec/h264dsp.c | 29 +++++++++++++++++++++++++++++
libavcodec/h264dsp.h | 9 +++++++++
3 files changed, 41 insertions(+), 17 deletions(-)
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index a732f79..972aace 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -62,23 +62,9 @@ static int ff_h264_find_frame_end(H264Context *h, const uint8_t *buf, int buf_si
}
if(state==7){
-#if HAVE_FAST_UNALIGNED
- /* we check i<buf_size instead of i+3/7 because its simpler
- * and there should be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end
- */
-# if HAVE_FAST_64BIT
- while(i<next_avc && !((~*(const uint64_t*)(buf+i) & (*(const uint64_t*)(buf+i) - 0x0101010101010101ULL)) & 0x8080808080808080ULL))
- i+=8;
-# else
- while(i<next_avc && !((~*(const uint32_t*)(buf+i) & (*(const uint32_t*)(buf+i) - 0x01010101U)) & 0x80808080U))
- i+=4;
-# endif
-#endif
- for(; i<next_avc; i++){
- if(!buf[i]){
- state=2;
- break;
- }
+ i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i);
+ if (i < buf_size)
+ state = 2;
}
}else if(state<=2){
if(buf[i]==1) state^= 5; //2->7, 1->4, 0->5
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index da9e417..b7d61cd 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -60,6 +60,34 @@
#include "h264addpx_template.c"
#undef BIT_DEPTH
+static int h264_find_start_code_candidate_c(const uint8_t *buf, int size)
+{
+ int i = 0; /* scan offset into buf; also the return value */
+#if HAVE_FAST_UNALIGNED
+ /* Bound the wide loads with i < size rather than i + 3 < size
+ * (or i + 7 < size for 64-bit loads): it is simpler, and there
+ * must be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end.
+ */
+#if HAVE_FAST_64BIT
+ while (i < size &&
+ !((~*(const uint64_t *)(buf + i) &
+ (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
+ 0x8080808080808080ULL))
+ i += 8; /* no zero byte among these 8 bytes: skip them */
+#else
+ while (i < size &&
+ !((~*(const uint32_t *)(buf + i) &
+ (*(const uint32_t *)(buf + i) - 0x01010101U)) &
+ 0x80808080U))
+ i += 4; /* no zero byte among these 4 bytes: skip them */
+#endif
+#endif
+ for (; i < size; i++) /* byte-wise scan for the exact position */
+ if (!buf[i])
+ break;
+ return i; /* index of the first zero byte, or >= size if none */
+}
+
void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
{
#undef FUNC
@@ -146,6 +174,7 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
H264_DSP(8);
break;
}
+ c->h264_find_start_code_candidate = h264_find_start_code_candidate_c;
if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h
index 98ea15c..1be4804 100644
--- a/libavcodec/h264dsp.h
+++ b/libavcodec/h264dsp.h
@@ -105,6 +105,15 @@ typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src,
/* bypass-transform */
void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride);
void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride);
+
+ /**
+ * Search buf from the start for up to size bytes. Return the index
+ * of a zero byte, or >= size if not found. Ideally, use lookahead
+ * to filter out any zero bytes that are known to not be followed by
+ * one or more further zero bytes and a one byte. Better still, filter
+ * out any bytes that form the trailing_zero_8bits syntax element too.
+ */
+ int (*h264_find_start_code_candidate)(const uint8_t *buf, int size);
} H264DSPContext;
void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
--
1.8.5.1

View File

@ -0,0 +1,335 @@
From fdc814cc6701f3e882a7ea7f29d16500c7340f0d Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 5 Aug 2013 13:12:48 +0100
Subject: [PATCH] arm: Add assembly version of h264_find_start_code_candidate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
               Before          After
               Mean    StdDev  Mean    StdDev  Change
This function   508.8  23.4     185.4   9.0    +174.4%
Overall        3068.5  31.7    2752.1  29.4    +11.5%
In combination with the preceding patch:
               Before          After
               Mean    StdDev  Mean    StdDev  Change
Overall        2925.6  26.2    2752.1  29.4    +6.3%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/h264dsp_armv6.S | 253 +++++++++++++++++++++++++++
libavcodec/arm/h264dsp_init_arm.c | 4 +
libavcodec/h264_parser.c | 1 -
4 files changed, 258 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/arm/h264dsp_armv6.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 7390a8b..480000b 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_init_arm.o \
OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
+ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
arm/flacdsp_arm.o \
diff --git a/libavcodec/arm/h264dsp_armv6.S b/libavcodec/arm/h264dsp_armv6.S
new file mode 100644
index 0000000..c4f12a6
--- /dev/null
+++ b/libavcodec/arm/h264dsp_armv6.S
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+RESULT .req a1
+BUF .req a1
+SIZE .req a2
+PATTERN .req a3
+PTR .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+TMP0 .req v5
+TMP1 .req v6
+TMP2 .req ip
+TMP3 .req lr
+
+#define PRELOAD_DISTANCE 4
+
+.macro innerloop4
+ ldr DAT0, [PTR], #4
+ subs SIZE, SIZE, #4 @ C flag survives rest of macro
+ sub TMP0, DAT0, PATTERN, lsr #14
+ bic TMP0, TMP0, DAT0
+ ands TMP0, TMP0, PATTERN
+.endm
+
+.macro innerloop16 decrement, do_preload
+ ldmia PTR!, {DAT0,DAT1,DAT2,DAT3}
+ .ifnc "\do_preload",""
+ pld [PTR, #PRELOAD_DISTANCE*32]
+ .endif
+ .ifnc "\decrement",""
+ subs SIZE, SIZE, #\decrement @ C flag survives rest of macro
+ .endif
+ sub TMP0, DAT0, PATTERN, lsr #14
+ sub TMP1, DAT1, PATTERN, lsr #14
+ bic TMP0, TMP0, DAT0
+ bic TMP1, TMP1, DAT1
+ sub TMP2, DAT2, PATTERN, lsr #14
+ sub TMP3, DAT3, PATTERN, lsr #14
+ ands TMP0, TMP0, PATTERN
+ bic TMP2, TMP2, DAT2
+ it eq
+ andseq TMP1, TMP1, PATTERN
+ bic TMP3, TMP3, DAT3
+ itt eq
+ andseq TMP2, TMP2, PATTERN
+ andseq TMP3, TMP3, PATTERN
+.endm
+
+/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */
+function ff_h264_find_start_code_candidate_armv6, export=1
+ push {v1-v6,lr}
+ mov PTR, BUF
+ @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
+ @ before using code that does preloads
+ cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
+ blo 60f
+
+ @ Get to word-alignment, 1 byte at a time
+ tst PTR, #3
+ beq 2f
+1: ldrb DAT0, [PTR], #1
+ sub SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ tst PTR, #3
+ bne 1b
+2: @ Get to 4-word alignment, 1 word at a time
+ ldr PATTERN, =0x80008000
+ setend be
+ tst PTR, #12
+ beq 4f
+3: innerloop4
+ bne 91f
+ tst PTR, #12
+ bne 3b
+4: @ Get to cacheline (8-word) alignment
+ tst PTR, #16
+ beq 5f
+ innerloop16 16
+ bne 93f
+5: @ Check complete cachelines, with preloading
+ @ We need to stop when there are still (PRELOAD_DISTANCE+1)
+ @ complete cachelines to go
+ sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
+6: innerloop16 , do_preload
+ bne 93f
+ innerloop16 32
+ bne 93f
+ bcs 6b
+ @ Preload trailing part-cacheline, if any
+ tst SIZE, #31
+ beq 7f
+ pld [PTR, #(PRELOAD_DISTANCE+1)*32]
+ @ Check remaining data without doing any more preloads. First
+ @ do in chunks of 4 words:
+7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
+ bmi 9f
+8: innerloop16 16
+ bne 93f
+ bcs 8b
+ @ Then in words:
+9: adds SIZE, SIZE, #16 - 4
+ bmi 11f
+10: innerloop4
+ bne 91f
+ bcs 10b
+11: setend le
+ @ Check second byte of final halfword
+ ldrb DAT0, [PTR, #-1]
+ teq DAT0, #0
+ beq 90f
+ @ Check any remaining bytes
+ tst SIZE, #3
+ beq 13f
+12: ldrb DAT0, [PTR], #1
+ sub SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ tst SIZE, #3
+ bne 12b
+ @ No candidate found
+13: sub RESULT, PTR, BUF
+ b 99f
+
+60: @ Small buffer - simply check by looping over bytes
+ subs SIZE, SIZE, #1
+ bcc 99f
+61: ldrb DAT0, [PTR], #1
+ subs SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ bcs 61b
+ @ No candidate found
+ sub RESULT, PTR, BUF
+ b 99f
+
+90: @ Found a candidate at the preceding byte
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #1
+ b 99f
+
+91: @ Found a candidate somewhere in the preceding 4 bytes
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #4
+ sub TMP0, DAT0, #0x20000
+ bics TMP0, TMP0, DAT0
+ itt pl
+ ldrbpl DAT0, [PTR, #-3]
+ addpl RESULT, RESULT, #2
+ bpl 92f
+ teq RESULT, #0
+ beq 98f @ don't look back a byte if found at first byte in buffer
+ ldrb DAT0, [PTR, #-5]
+92: teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+
+93: @ Found a candidate somewhere in the preceding 16 bytes
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #16
+ teq TMP0, #0
+ beq 95f @ not in first 4 bytes
+ sub TMP0, DAT0, #0x20000
+ bics TMP0, TMP0, DAT0
+ itt pl
+ ldrbpl DAT0, [PTR, #-15]
+ addpl RESULT, RESULT, #2
+ bpl 94f
+ teq RESULT, #0
+ beq 98f @ don't look back a byte if found at first byte in buffer
+ ldrb DAT0, [PTR, #-17]
+94: teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+95: add RESULT, RESULT, #4
+ teq TMP1, #0
+ beq 96f @ not in next 4 bytes
+ sub TMP1, DAT1, #0x20000
+ bics TMP1, TMP1, DAT1
+ itee mi
+ ldrbmi DAT0, [PTR, #-13]
+ ldrbpl DAT0, [PTR, #-11]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+96: add RESULT, RESULT, #4
+ teq TMP2, #0
+ beq 97f @ not in next 4 bytes
+ sub TMP2, DAT2, #0x20000
+ bics TMP2, TMP2, DAT2
+ itee mi
+ ldrbmi DAT0, [PTR, #-9]
+ ldrbpl DAT0, [PTR, #-7]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+97: add RESULT, RESULT, #4
+ sub TMP3, DAT3, #0x20000
+ bics TMP3, TMP3, DAT3
+ itee mi
+ ldrbmi DAT0, [PTR, #-5]
+ ldrbpl DAT0, [PTR, #-3]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ @ drop through to 98f
+98: setend le
+99: pop {v1-v6,pc}
+.endfunc
+
+ .unreq RESULT
+ .unreq BUF
+ .unreq SIZE
+ .unreq PATTERN
+ .unreq PTR
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq TMP0
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index 785b604..2804e56 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -24,6 +24,8 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264dsp.h"
+int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
+
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
@@ -106,6 +108,8 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
{
int cpu_flags = av_get_cpu_flags();
+ if (have_armv6(cpu_flags))
+ c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6;
if (have_neon(cpu_flags))
ff_h264dsp_init_neon(c, bit_depth, chroma_format_idc);
}
diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
index 972aace..363843c 100644
--- a/libavcodec/h264_parser.c
+++ b/libavcodec/h264_parser.c
@@ -65,7 +65,6 @@ static int ff_h264_find_frame_end(H264Context *h, const uint8_t *buf, int buf_si
i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i);
if (i < buf_size)
state = 2;
- }
}else if(state<=2){
if(buf[i]==1) state^= 5; //2->7, 1->4, 0->5
else if(buf[i]) state = 7;
--
1.8.5.1