From 34d4ce847631f7c1f53657bafc32da1d1c2d495c Mon Sep 17 00:00:00 2001 From: Stephan Raue Date: Sun, 14 Jun 2015 19:50:12 +0200 Subject: [PATCH] kodi: add PR7280 Signed-off-by: Stephan Raue --- .../kodi/patches/kodi-999.22-PR7280.patch | 1364 +++++++++++++++++ .../kodi-001-isengard-rpb-backports.patch | 23 - .../kodi-001-isengard-rpb-backports.patch | 23 - 3 files changed, 1364 insertions(+), 46 deletions(-) create mode 100644 packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch diff --git a/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch b/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch new file mode 100644 index 0000000000..0cfe4bbd31 --- /dev/null +++ b/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch @@ -0,0 +1,1364 @@ +From 224c1919ad3f68e23e817f41036687343f34aaae Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Fri, 12 Jun 2015 17:27:47 +0100 +Subject: [PATCH] [utils] Disable fast_memcpy which is slower than memcpy + +The default glibc memcpy is likely to be better tuned than this code +which hasn't been touched for four years. + +In a test with software video decode on Pi2 the skipped frames went +from 189 to 172 when fast_memcpy was disabled. +--- + Kodi.xcodeproj/project.pbxproj | 6 - + project/VS2010Express/XBMC.vcxproj | 4 - + project/VS2010Express/XBMC.vcxproj.filters | 3 - + xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp | 1 - + xbmc/cores/VideoRenderers/RenderCapture.cpp | 7 +- + xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp | 33 +- + .../Video/libstagefrightICS/StageFrightVideo.cpp | 3 +- + xbmc/utils/Makefile.in | 2 - + xbmc/utils/fastmemcpy-arm.S | 528 --------------------- + xbmc/utils/fastmemcpy.c | 396 ---------------- + xbmc/utils/fastmemcpy.h | 35 -- + xbmc/utils/test/Makefile | 1 - + xbmc/utils/test/Testfastmemcpy.cpp | 39 -- + 13 files changed, 20 insertions(+), 1038 deletions(-) + delete mode 100644 xbmc/utils/fastmemcpy-arm.S + delete mode 100644 xbmc/utils/fastmemcpy.c + delete mode 100644 xbmc/utils/fastmemcpy.h + delete mode 100644 xbmc/utils/test/Testfastmemcpy.cpp + +diff --git a/Kodi.xcodeproj/project.pbxproj b/Kodi.xcodeproj/project.pbxproj +index 395c4ea..ce5a7f7 100644 +--- a/Kodi.xcodeproj/project.pbxproj ++++ b/Kodi.xcodeproj/project.pbxproj +@@ -3192,7 +3192,6 @@ + F5E55B5D10741272006E788A /* DVDPlayerTeletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B5B10741272006E788A /* DVDPlayerTeletext.cpp */; }; + F5E55B66107412DE006E788A /* GUIDialogTeletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B65107412DE006E788A /* GUIDialogTeletext.cpp */; }; + F5E55B7010741340006E788A /* Teletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B6E10741340006E788A /* Teletext.cpp */; }; +- F5E5697310803FC3006E788A /* fastmemcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = F5E5697210803FC3006E788A /* fastmemcpy.c */; }; + F5E56BA61082A675006E788A /* PosixMountProvider.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E56BA51082A675006E788A /* PosixMountProvider.cpp */; }; + F5EA02260F6DA990005C2EC5 /* CocoaPowerSyscall.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5EA02200F6DA85C005C2EC5 /* CocoaPowerSyscall.cpp */; }; + F5EA02270F6DA9A5005C2EC5 /* PowerManager.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5EA021A0F6DA7E8005C2EC5 /* PowerManager.cpp */; }; +@@ -3632,7 +3631,6 @@ + 43348AAB1077486D00F859CF /* PlayerSelectionRule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PlayerSelectionRule.h; path = playercorefactory/PlayerSelectionRule.h; sourceTree = ""; }; + 436721A612D66A09002508E6 /* IAnnouncer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = IAnnouncer.h; sourceTree = ""; }; + 436B38F3106628850049AB3B /* EndianSwap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = EndianSwap.h; sourceTree = ""; }; +- 43BF09DD1080D39300E25290 /* fastmemcpy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fastmemcpy.h; sourceTree = ""; }; + 43FAC87112D6349400F67914 /* IStorageProvider.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = IStorageProvider.h; sourceTree = ""; }; + 551C3A43175A12010051AAAD /* VDA.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = VDA.cpp; sourceTree = ""; }; + 551C3A44175A12010051AAAD /* VDA.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = VDA.h; sourceTree = ""; }; +@@ -5735,7 +5733,6 @@ + F5E55B6D10741340006E788A /* Teletext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Teletext.h; sourceTree = ""; }; + F5E55B6E10741340006E788A /* Teletext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Teletext.cpp; sourceTree = ""; }; + F5E55B6F10741340006E788A /* TeletextDefines.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = TeletextDefines.h; sourceTree = ""; }; +- F5E5697210803FC3006E788A /* fastmemcpy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fastmemcpy.c; sourceTree = ""; }; + F5E56BA41082A675006E788A /* PosixMountProvider.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PosixMountProvider.h; sourceTree = ""; }; + F5E56BA51082A675006E788A /* PosixMountProvider.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PosixMountProvider.cpp; sourceTree = ""; }; + F5EA021A0F6DA7E8005C2EC5 /* PowerManager.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PowerManager.cpp; sourceTree = ""; }; +@@ -9202,8 +9199,6 @@ + DF529BAD1741697B00523FB4 /* Environment.h */, + E36C29E90DA72486001F0C9D /* Fanart.cpp */, + 6E97BDC30DA2B620003A2A89 /* Fanart.h */, +- F5E5697210803FC3006E788A /* fastmemcpy.c */, +- 43BF09DD1080D39300E25290 /* fastmemcpy.h */, + F5F244641110DC6B009126C6 /* FileOperationJob.cpp */, + F5F244631110DC6B009126C6 /* FileOperationJob.h */, + F5F245EC1112C9AB009126C6 /* FileUtils.cpp */, +@@ -10519,7 +10514,6 @@ + 43348AAE1077486D00F859CF /* PlayerCoreFactory.cpp in Sources */, + 43348AAF1077486D00F859CF /* PlayerSelectionRule.cpp in Sources */, + 7CAA20511079C8160096DE39 /* BaseRenderer.cpp in Sources */, +- F5E5697310803FC3006E788A /* fastmemcpy.c in Sources */, + 55D3604E1826CAB900DA66D2 /* OverlayRendererGUI.cpp in Sources */, + F5E56BA61082A675006E788A /* PosixMountProvider.cpp in Sources */, + 7CAA25351085963B0096DE39 /* PasswordManager.cpp in Sources */, +diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj +index 2d37c57..e8e8dce 100644 +--- a/project/VS2010Express/XBMC.vcxproj ++++ b/project/VS2010Express/XBMC.vcxproj +@@ -1439,10 +1439,6 @@ + true + true + +- +- true +- true +- + + + +diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters +index c858f32..cada31e 100644 +--- a/project/VS2010Express/XBMC.vcxproj.filters ++++ b/project/VS2010Express/XBMC.vcxproj.filters +@@ -2371,9 +2371,6 @@ + + utils\test + +- +- utils\test +- + + utils\test + +diff --git a/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp b/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp +index 2b64121..fdad7f0 100644 +--- a/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp ++++ b/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp +@@ -31,7 +31,6 @@ + #include + #include "guilib/MatrixGLES.h" + #include "LinuxRendererGLES.h" +-#include "utils/fastmemcpy.h" + #include "utils/MathUtils.h" + #include "utils/GLUtils.h" + #include "utils/log.h" +diff --git a/xbmc/cores/VideoRenderers/RenderCapture.cpp b/xbmc/cores/VideoRenderers/RenderCapture.cpp +index 603b68d..0456a27 100644 +--- a/xbmc/cores/VideoRenderers/RenderCapture.cpp ++++ b/xbmc/cores/VideoRenderers/RenderCapture.cpp +@@ -21,7 +21,6 @@ + #include "RenderCapture.h" + #include "utils/log.h" + #include "windowing/WindowingFactory.h" +-#include "utils/fastmemcpy.h" + #include "settings/AdvancedSettings.h" + + CRenderCaptureBase::CRenderCaptureBase() +@@ -297,7 +296,7 @@ void CRenderCaptureGL::PboToBuffer() + + if (pboPtr) + { +- fast_memcpy(m_pixels, pboPtr, m_bufferSize); ++ memcpy(m_pixels, pboPtr, m_bufferSize); + SetState(CAPTURESTATE_DONE); + } + else +@@ -491,12 +490,12 @@ void CRenderCaptureDX::SurfaceToBuffer() + //if pitch is same, do a direct copy, otherwise copy one line at a time + if (lockedRect.Pitch == m_width * 4) + { +- fast_memcpy(m_pixels, lockedRect.pBits, m_width * m_height * 4); ++ memcpy(m_pixels, lockedRect.pBits, m_width * m_height * 4); + } + else + { + for (unsigned int y = 0; y < m_height; y++) +- fast_memcpy(m_pixels + y * m_width * 4, (uint8_t*)lockedRect.pBits + y * lockedRect.Pitch, m_width * 4); ++ memcpy(m_pixels + y * m_width * 4, (uint8_t*)lockedRect.pBits + y * lockedRect.Pitch, m_width * 4); + } + m_copySurface->UnlockRect(); + SetState(CAPTURESTATE_DONE); +diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp +index 56e68713..5f0e486 100644 +--- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp ++++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp +@@ -22,7 +22,6 @@ + #include "DVDClock.h" + #include "cores/VideoRenderers/RenderManager.h" + #include "utils/log.h" +-#include "utils/fastmemcpy.h" + #include "cores/FFmpeg.h" + #include "Util.h" + #ifdef HAS_DX +@@ -95,7 +94,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc) + + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w); ++ memcpy(d, s, w); + s += pSrc->iLineSize[0]; + d += pDst->iLineSize[0]; + } +@@ -107,7 +106,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc) + d = pDst->data[1]; + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w); ++ memcpy(d, s, w); + s += pSrc->iLineSize[1]; + d += pDst->iLineSize[1]; + } +@@ -116,7 +115,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc) + d = pDst->data[2]; + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w); ++ memcpy(d, s, w); + s += pSrc->iLineSize[2]; + d += pDst->iLineSize[2]; + } +@@ -131,13 +130,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc) + int h = pImage->height; + if ((w == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0])) + { +- fast_memcpy(d, s, w*h); ++ memcpy(d, s, w*h); + } + else + { + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w); ++ memcpy(d, s, w); + s += pSrc->iLineSize[0]; + d += pImage->stride[0]; + } +@@ -148,13 +147,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc) + h =(pImage->height >> pImage->cshift_y); + if ((w==pSrc->iLineSize[1]) && ((unsigned int) pSrc->iLineSize[1]==pImage->stride[1])) + { +- fast_memcpy(d, s, w*h); ++ memcpy(d, s, w*h); + } + else + { + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w); ++ memcpy(d, s, w); + s += pSrc->iLineSize[1]; + d += pImage->stride[1]; + } +@@ -163,13 +162,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc) + d = pImage->plane[2]; + if ((w==pSrc->iLineSize[2]) && ((unsigned int) pSrc->iLineSize[2]==pImage->stride[2])) + { +- fast_memcpy(d, s, w*h); ++ memcpy(d, s, w*h); + } + else + { + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w); ++ memcpy(d, s, w); + s += pSrc->iLineSize[2]; + d += pImage->stride[2]; + } +@@ -207,7 +206,7 @@ DVDVideoPicture* CDVDCodecUtils::ConvertToNV12Picture(DVDVideoPicture *pSrc) + uint8_t *d = pPicture->data[0]; + for (int y = 0; y < (int)pSrc->iHeight; y++) + { +- fast_memcpy(d, s, pSrc->iWidth); ++ memcpy(d, s, pSrc->iWidth); + s += pSrc->iLineSize[0]; + d += pPicture->iLineSize[0]; + } +@@ -298,13 +297,13 @@ bool CDVDCodecUtils::CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc) + // Copy Y + if ((w == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0])) + { +- fast_memcpy(d, s, w*h); ++ memcpy(d, s, w*h); + } + else + { + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w); ++ memcpy(d, s, w); + s += pSrc->iLineSize[0]; + d += pImage->stride[0]; + } +@@ -317,13 +316,13 @@ bool CDVDCodecUtils::CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc) + // Copy packed UV (width is same as for Y as it's both U and V components) + if ((w==pSrc->iLineSize[1]) && ((unsigned int) pSrc->iLineSize[1]==pImage->stride[1])) + { +- fast_memcpy(d, s, w*h); ++ memcpy(d, s, w*h); + } + else + { + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w); ++ memcpy(d, s, w); + s += pSrc->iLineSize[1]; + d += pImage->stride[1]; + } +@@ -342,13 +341,13 @@ bool CDVDCodecUtils::CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture + // Copy YUYV + if ((w * 2 == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0])) + { +- fast_memcpy(d, s, w*h*2); ++ memcpy(d, s, w*h*2); + } + else + { + for (int y = 0; y < h; y++) + { +- fast_memcpy(d, s, w*2); ++ memcpy(d, s, w*2); + s += pSrc->iLineSize[0]; + d += pImage->stride[0]; + } +diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp +index 019bc7a..d5ca74f 100644 +--- a/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp ++++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp +@@ -30,7 +30,6 @@ + #include "guilib/GraphicContext.h" + #include "DVDClock.h" + #include "utils/log.h" +-#include "utils/fastmemcpy.h" + #include "threads/Thread.h" + #include "threads/Event.h" + #include "Application.h" +@@ -620,7 +619,7 @@ int CStageFrightVideo::Decode(uint8_t *pData, int iSize, double dts, double pts + return VC_ERROR; + } + +- fast_memcpy(frame->medbuf->data(), demuxer_content, demuxer_bytes); ++ memcpy(frame->medbuf->data(), demuxer_content, demuxer_bytes); + frame->medbuf->set_range(0, demuxer_bytes); + frame->medbuf->meta_data()->clear(); + frame->medbuf->meta_data()->setInt64(kKeyTime, frame->pts); +diff --git a/xbmc/utils/Makefile.in b/xbmc/utils/Makefile.in +index 438f025..dbd3db9 100644 +--- a/xbmc/utils/Makefile.in ++++ b/xbmc/utils/Makefile.in +@@ -17,8 +17,6 @@ SRCS += DatabaseUtils.cpp + SRCS += EndianSwap.cpp + SRCS += Environment.cpp + SRCS += Fanart.cpp +-SRCS += fastmemcpy.c +-SRCS += fastmemcpy-arm.S + SRCS += FileOperationJob.cpp + SRCS += FileUtils.cpp + SRCS += fstrcmp.c +diff --git a/xbmc/utils/fastmemcpy-arm.S b/xbmc/utils/fastmemcpy-arm.S +deleted file mode 100644 +index 6cb8b0c..0000000 +--- a/xbmc/utils/fastmemcpy-arm.S ++++ /dev/null +@@ -1,528 +0,0 @@ +-/* +- * Copyright (C) 2008 The Android Open Source Project +- * All rights reserved. +- * +- * Copyright (C) 2011-2013 Team XBMC +- * http://xbmc.org +- * +- * This Program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License as published by +- * the Free Software Foundation; either version 2, or (at your option) +- * any later version. +- * +- * This Program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with XBMC; see the file COPYING. If not, see +- * . +- * +- */ +-#if defined(__arm__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) +-#if defined(__ARM_NEON__) +- +- .text +-#ifndef __APPLE__ +- .fpu neon +- .global fast_memcpy +- .type fast_memcpy, %function +-#else +- .globl _fast_memcpy +-#endif +- .align 4 +- +-/* a prefetch distance of 4 cache-lines works best experimentally */ +-#define CACHE_LINE_SIZE 64 +-#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4) +- +-#ifndef __APPLE__ +- .fnstart +- .save {r0, lr} +-fast_memcpy: +-#else +-_fast_memcpy: +-#endif +- stmfd sp!, {r0, lr} +- +- /* start preloading as early as possible */ +- pld [r1, #(CACHE_LINE_SIZE*0)] +- pld [r1, #(CACHE_LINE_SIZE*1)] +- +- /* do we have at least 16-bytes to copy (needed for alignment below) */ +- cmp r2, #16 +- blo 5f +- +- /* align destination to half cache-line for the write-buffer */ +- rsb r3, r0, #0 +- ands r3, r3, #0xF +- beq 0f +- +- /* copy up to 15-bytes (count in r3) */ +- sub r2, r2, r3 +- movs ip, r3, lsl #31 +- ldrmib lr, [r1], #1 +- strmib lr, [r0], #1 +- ldrcsb ip, [r1], #1 +- ldrcsb lr, [r1], #1 +- strcsb ip, [r0], #1 +- strcsb lr, [r0], #1 +- movs ip, r3, lsl #29 +- bge 1f +- // copies 4 bytes, destination 32-bits aligned +- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! +- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! +-1: bcc 2f +- // copies 8 bytes, destination 64-bits aligned +- vld1.8 {d0}, [r1]! +- vst1.8 {d0}, [r0, :64]! +-2: +- +-0: /* preload immediately the next cache line, which we may need */ +- pld [r1, #(CACHE_LINE_SIZE*0)] +- pld [r1, #(CACHE_LINE_SIZE*1)] +- +- /* make sure we have at least 64 bytes to copy */ +- subs r2, r2, #64 +- blo 2f +- +- /* preload all the cache lines we need. +- * NOTE: the number of pld below depends on PREFETCH_DISTANCE, +- * ideally would would increase the distance in the main loop to +- * avoid the goofy code below. In practice this doesn't seem to make +- * a big difference. +- */ +- pld [r1, #(CACHE_LINE_SIZE*2)] +- pld [r1, #(CACHE_LINE_SIZE*3)] +- pld [r1, #(PREFETCH_DISTANCE)] +- +-1: /* The main loop copies 64 bytes at a time */ +- vld1.8 {d0 - d3}, [r1]! +- vld1.8 {d4 - d7}, [r1]! +- pld [r1, #(PREFETCH_DISTANCE)] +- subs r2, r2, #64 +- vst1.8 {d0 - d3}, [r0, :128]! +- vst1.8 {d4 - d7}, [r0, :128]! +- bhs 1b +- +-2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ +- add r2, r2, #64 +- subs r2, r2, #32 +- blo 4f +- +-3: /* 32 bytes at a time. These cache lines were already preloaded */ +- vld1.8 {d0 - d3}, [r1]! +- subs r2, r2, #32 +- vst1.8 {d0 - d3}, [r0, :128]! +- bhs 3b +- +-4: /* less than 32 left */ +- add r2, r2, #32 +- tst r2, #0x10 +- beq 5f +- // copies 16 bytes, 128-bits aligned +- vld1.8 {d0, d1}, [r1]! +- vst1.8 {d0, d1}, [r0, :128]! +- +-5: /* copy up to 15-bytes (count in r2) */ +- movs ip, r2, lsl #29 +- bcc 1f +- vld1.8 {d0}, [r1]! +- vst1.8 {d0}, [r0]! +-1: bge 2f +- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! +- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! +-2: movs ip, r2, lsl #31 +- ldrmib r3, [r1], #1 +- ldrcsb ip, [r1], #1 +- ldrcsb lr, [r1], #1 +- strmib r3, [r0], #1 +- strcsb ip, [r0], #1 +- strcsb lr, [r0], #1 +- +- ldmfd sp!, {r0, lr} +- bx lr +-#ifndef __APPLE__ +- .fnend +-#endif +- +-#else /* __ARM_ARCH__ < 7 */ +- +- +- .text +- +-#ifndef __APPLE__ +- .global fast_memcpy +- .type fast_memcpy, %function +-#else +- .globl _fast_memcpy +-#endif +- .align 4 +- +- /* +- * Optimized memcpy() for ARM. +- * +- * note that memcpy() always returns the destination pointer, +- * so we have to preserve R0. +- */ +- +-#ifndef __APPLE__ +-fast_memcpy: +-#else +-_fast_memcpy: +-#endif +- /* The stack must always be 64-bits aligned to be compliant with the +- * ARM ABI. Since we have to save R0, we might as well save R4 +- * which we can use for better pipelining of the reads below +- */ +-#ifndef __APPLE__ +- .fnstart +- .save {r0, r4, lr} +-#endif +- stmfd sp!, {r0, r4, lr} +- /* Making room for r5-r11 which will be spilled later */ +- .pad #28 +- sub sp, sp, #28 +- +- // preload the destination because we'll align it to a cache line +- // with small writes. Also start the source "pump". +- //PLD (r0, #0) +- //PLD (r1, #0) +- //PLD (r1, #32) +- +- /* it simplifies things to take care of len<4 early */ +- cmp r2, #4 +- blo copy_last_3_and_return +- +- /* compute the offset to align the source +- * offset = (4-(src&3))&3 = -src & 3 +- */ +- rsb r3, r1, #0 +- ands r3, r3, #3 +- beq src_aligned +- +- /* align source to 32 bits. We need to insert 2 instructions between +- * a ldr[b|h] and str[b|h] because byte and half-word instructions +- * stall 2 cycles. +- */ +- movs r12, r3, lsl #31 +- sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ +- ldrmib r3, [r1], #1 +- ldrcsb r4, [r1], #1 +- ldrcsb r12,[r1], #1 +- strmib r3, [r0], #1 +- strcsb r4, [r0], #1 +- strcsb r12,[r0], #1 +- +-src_aligned: +- +- /* see if src and dst are aligned together (congruent) */ +- eor r12, r0, r1 +- tst r12, #3 +- bne non_congruent +- +- /* Use post-incriment mode for stm to spill r5-r11 to reserved stack +- * frame. Don't update sp. +- */ +- stmea sp, {r5-r11} +- +- /* align the destination to a cache-line */ +- rsb r3, r0, #0 +- ands r3, r3, #0x1C +- beq congruent_aligned32 +- cmp r3, r2 +- andhi r3, r2, #0x1C +- +- /* conditionnaly copies 0 to 7 words (length in r3) */ +- movs r12, r3, lsl #28 +- ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */ +- ldmmiia r1!, {r8, r9} /* 8 bytes */ +- stmcsia r0!, {r4, r5, r6, r7} +- stmmiia r0!, {r8, r9} +- tst r3, #0x4 +- ldrne r10,[r1], #4 /* 4 bytes */ +- strne r10,[r0], #4 +- sub r2, r2, r3 +- +-congruent_aligned32: +- /* +- * here source is aligned to 32 bytes. +- */ +- +-cached_aligned32: +- subs r2, r2, #32 +- blo less_than_32_left +- +- /* +- * We preload a cache-line up to 64 bytes ahead. On the 926, this will +- * stall only until the requested world is fetched, but the linefill +- * continues in the the background. +- * While the linefill is going, we write our previous cache-line +- * into the write-buffer (which should have some free space). +- * When the linefill is done, the writebuffer will +- * start dumping its content into memory +- * +- * While all this is going, we then load a full cache line into +- * 8 registers, this cache line should be in the cache by now +- * (or partly in the cache). +- * +- * This code should work well regardless of the source/dest alignment. +- * +- */ +- +- // Align the preload register to a cache-line because the cpu does +- // "critical word first" (the first word requested is loaded first). +- bic r12, r1, #0x1F +- add r12, r12, #64 +- +-1: ldmia r1!, { r4-r11 } +- //PLD (r12, #64) +- subs r2, r2, #32 +- +- // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi +- // for ARM9 preload will not be safely guarded by the preceding subs. +- // When it is safely guarded the only possibility to have SIGSEGV here +- // is because the caller overstates the length. +- ldrhi r3, [r12], #32 /* cheap ARM9 preload */ +- stmia r0!, { r4-r11 } +- bhs 1b +- +- add r2, r2, #32 +- +- +- +- +-less_than_32_left: +- /* +- * less than 32 bytes left at this point (length in r2) +- */ +- +- /* skip all this if there is nothing to do, which should +- * be a common case (if not executed the code below takes +- * about 16 cycles) +- */ +- tst r2, #0x1F +- beq 1f +- +- /* conditionnaly copies 0 to 31 bytes */ +- movs r12, r2, lsl #28 +- ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */ +- ldmmiia r1!, {r8, r9} /* 8 bytes */ +- stmcsia r0!, {r4, r5, r6, r7} +- stmmiia r0!, {r8, r9} +- movs r12, r2, lsl #30 +- ldrcs r3, [r1], #4 /* 4 bytes */ +- ldrmih r4, [r1], #2 /* 2 bytes */ +- strcs r3, [r0], #4 +- strmih r4, [r0], #2 +- tst r2, #0x1 +- ldrneb r3, [r1] /* last byte */ +- strneb r3, [r0] +- +- /* we're done! restore everything and return */ +-1: ldmfd sp!, {r5-r11} +- ldmfd sp!, {r0, r4, lr} +- bx lr +- +- /********************************************************************/ +- +-non_congruent: +- /* +- * here source is aligned to 4 bytes +- * but destination is not. +- * +- * in the code below r2 is the number of bytes read +- * (the number of bytes written is always smaller, because we have +- * partial words in the shift queue) +- */ +- cmp r2, #4 +- blo copy_last_3_and_return +- +- /* Use post-incriment mode for stm to spill r5-r11 to reserved stack +- * frame. Don't update sp. +- */ +- stmea sp, {r5-r11} +- +- /* compute shifts needed to align src to dest */ +- rsb r5, r0, #0 +- and r5, r5, #3 /* r5 = # bytes in partial words */ +- mov r12, r5, lsl #3 /* r12 = right */ +- rsb lr, r12, #32 /* lr = left */ +- +- /* read the first word */ +- ldr r3, [r1], #4 +- sub r2, r2, #4 +- +- /* write a partial word (0 to 3 bytes), such that destination +- * becomes aligned to 32 bits (r5 = nb of words to copy for alignment) +- */ +- movs r5, r5, lsl #31 +- strmib r3, [r0], #1 +- movmi r3, r3, lsr #8 +- strcsb r3, [r0], #1 +- movcs r3, r3, lsr #8 +- strcsb r3, [r0], #1 +- movcs r3, r3, lsr #8 +- +- cmp r2, #4 +- blo partial_word_tail +- +- /* Align destination to 32 bytes (cache line boundary) */ +-1: tst r0, #0x1c +- beq 2f +- ldr r5, [r1], #4 +- sub r2, r2, #4 +- orr r4, r3, r5, lsl lr +- mov r3, r5, lsr r12 +- str r4, [r0], #4 +- cmp r2, #4 +- bhs 1b +- blo partial_word_tail +- +- /* copy 32 bytes at a time */ +-2: subs r2, r2, #32 +- blo less_than_thirtytwo +- +- /* Use immediate mode for the shifts, because there is an extra cycle +- * for register shifts, which could account for up to 50% of +- * performance hit. +- */ +- +- cmp r12, #24 +- beq loop24 +- cmp r12, #8 +- beq loop8 +- +-loop16: +- ldr r12, [r1], #4 +-1: mov r4, r12 +- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} +- //PLD (r1, #64) +- subs r2, r2, #32 +- ldrhs r12, [r1], #4 +- orr r3, r3, r4, lsl #16 +- mov r4, r4, lsr #16 +- orr r4, r4, r5, lsl #16 +- mov r5, r5, lsr #16 +- orr r5, r5, r6, lsl #16 +- mov r6, r6, lsr #16 +- orr r6, r6, r7, lsl #16 +- mov r7, r7, lsr #16 +- orr r7, r7, r8, lsl #16 +- mov r8, r8, lsr #16 +- orr r8, r8, r9, lsl #16 +- mov r9, r9, lsr #16 +- orr r9, r9, r10, lsl #16 +- mov r10, r10, lsr #16 +- orr r10, r10, r11, lsl #16 +- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} +- mov r3, r11, lsr #16 +- bhs 1b +- b less_than_thirtytwo +- +-loop8: +- ldr r12, [r1], #4 +-1: mov r4, r12 +- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} +- //PLD (r1, #64) +- subs r2, r2, #32 +- ldrhs r12, [r1], #4 +- orr r3, r3, r4, lsl #24 +- mov r4, r4, lsr #8 +- orr r4, r4, r5, lsl #24 +- mov r5, r5, lsr #8 +- orr r5, r5, r6, lsl #24 +- mov r6, r6, lsr #8 +- orr r6, r6, r7, lsl #24 +- mov r7, r7, lsr #8 +- orr r7, r7, r8, lsl #24 +- mov r8, r8, lsr #8 +- orr r8, r8, r9, lsl #24 +- mov r9, r9, lsr #8 +- orr r9, r9, r10, lsl #24 +- mov r10, r10, lsr #8 +- orr r10, r10, r11, lsl #24 +- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} +- mov r3, r11, lsr #8 +- bhs 1b +- b less_than_thirtytwo +- +-loop24: +- ldr r12, [r1], #4 +-1: mov r4, r12 +- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} +- //PLD (r1, #64) +- subs r2, r2, #32 +- ldrhs r12, [r1], #4 +- orr r3, r3, r4, lsl #8 +- mov r4, r4, lsr #24 +- orr r4, r4, r5, lsl #8 +- mov r5, r5, lsr #24 +- orr r5, r5, r6, lsl #8 +- mov r6, r6, lsr #24 +- orr r6, r6, r7, lsl #8 +- mov r7, r7, lsr #24 +- orr r7, r7, r8, lsl #8 +- mov r8, r8, lsr #24 +- orr r8, r8, r9, lsl #8 +- mov r9, r9, lsr #24 +- orr r9, r9, r10, lsl #8 +- mov r10, r10, lsr #24 +- orr r10, r10, r11, lsl #8 +- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} +- mov r3, r11, lsr #24 +- bhs 1b +- +- +-less_than_thirtytwo: +- /* copy the last 0 to 31 bytes of the source */ +- rsb r12, lr, #32 /* we corrupted r12, recompute it */ +- add r2, r2, #32 +- cmp r2, #4 +- blo partial_word_tail +- +-1: ldr r5, [r1], #4 +- sub r2, r2, #4 +- orr r4, r3, r5, lsl lr +- mov r3, r5, lsr r12 +- str r4, [r0], #4 +- cmp r2, #4 +- bhs 1b +- +-partial_word_tail: +- /* we have a partial word in the input buffer */ +- movs r5, lr, lsl #(31-3) +- strmib r3, [r0], #1 +- movmi r3, r3, lsr #8 +- strcsb r3, [r0], #1 +- movcs r3, r3, lsr #8 +- strcsb r3, [r0], #1 +- +- /* Refill spilled registers from the stack. Don't update sp. */ +- ldmfd sp, {r5-r11} +- +-copy_last_3_and_return: +- movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */ +- ldrmib r2, [r1], #1 +- ldrcsb r3, [r1], #1 +- ldrcsb r12,[r1] +- strmib r2, [r0], #1 +- strcsb r3, [r0], #1 +- strcsb r12,[r0] +- +- /* we're done! restore sp and spilled registers and return */ +- add sp, sp, #28 +- ldmfd sp!, {r0, r4, lr} +- bx lr +-#ifndef __APPLE__ +- .fnend +-#endif +- +-#endif /* __ARM_ARCH__ < 7 */ +-#endif +- +-#if defined(__linux__) && defined(__ELF__) +-/* we don't need an executable stack */ +-.section .note.GNU-stack,"",%progbits +-#endif +diff --git a/xbmc/utils/fastmemcpy.c b/xbmc/utils/fastmemcpy.c +deleted file mode 100644 +index ec9019a..0000000 +--- a/xbmc/utils/fastmemcpy.c ++++ /dev/null +@@ -1,396 +0,0 @@ +-/* +- * fastmemcpy.h : fast memcpy routines +- ***************************************************************************** +- * $Id: fastmemcpy.h 13905 2006-01-12 23:10:04Z dionoea $ +- * +- * Authors: various Linux kernel hackers +- * various MPlayer hackers +- * Nick Kurshev +- * +- * Copyright (C) 2011-2013 Team XBMC +- * http://xbmc.org +- * +- * This Program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License as published by +- * the Free Software Foundation; either version 2, or (at your option) +- * any later version. +- * +- * This Program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with XBMC; see the file COPYING. If not, see +- * . +- * +- */ +-#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__mips__) +-#define HAVE_MMX2 +-#define HAVE_SSE +- +-/* +- aclib - advanced C library ;) +- This file contains functions which improve and expand standard C-library +-*/ +-#include +- +-#define BLOCK_SIZE 4096 +-#define CONFUSION_FACTOR 0 +-/*Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)*/ +- +-/*#define STATISTICS*/ +- +-#ifndef HAVE_SSE2 +-/* +- P3 processor has only one SSE decoder so can execute only 1 sse insn per +- cpu clock, but it has 3 mmx decoders (include load/store unit) +- and executes 3 mmx insns per cpu clock. +- P4 processor has some chances, but after reading: +- http://www.emulators.com/pentium4.htm +- I have doubts. Anyway SSE2 version of this code can be written better. +-*/ +-#undef HAVE_SSE +-#endif +- +- +-/* +- This part of code was taken by me from Linux-2.4.3 and slightly modified +-for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned +-blocks but mplayer uses weakly ordered data and original sources can not +-speedup them. Only using PREFETCHNTA and MOVNTQ together have effect! +- +->From IA-32 Intel Architecture Software Developer's Manual Volume 1, +- +-Order Number 245470: +-"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions" +- +-Data referenced by a program can be temporal (data will be used again) or +-non-temporal (data will be referenced once and not reused in the immediate +-future). To make efficient use of the processor's caches, it is generally +-desirable to cache temporal data and not cache non-temporal data. Overloading +-the processor's caches with non-temporal data is sometimes referred to as +-"polluting the caches". +-The non-temporal data is written to memory with Write-Combining semantics. +- +-The PREFETCHh instructions permits a program to load data into the processor +-at a suggested cache level, so that it is closer to the processors load and +-store unit when it is needed. If the data is already present in a level of +-the cache hierarchy that is closer to the processor, the PREFETCHh instruction +-will not result in any data movement. +-But we should you PREFETCHNTA: Non-temporal data fetch data into location +-close to the processor, minimizing cache pollution. +- +-The MOVNTQ (store quadword using non-temporal hint) instruction stores +-packed integer data from an MMX register to memory, using a non-temporal hint. +-The MOVNTPS (store packed single-precision floating-point values using +-non-temporal hint) instruction stores packed floating-point data from an +-XMM register to memory, using a non-temporal hint. +- +-The SFENCE (Store Fence) instruction controls write ordering by creating a +-fence for memory store operations. This instruction guarantees that the results +-of every store instruction that precedes the store fence in program order is +-globally visible before any store instruction that follows the fence. The +-SFENCE instruction provides an efficient way of ensuring ordering between +-procedures that produce weakly-ordered data and procedures that consume that +-data. +- +-If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru. +-*/ +- +-/* 3dnow memcpy support from kernel 2.4.2 */ +-/* by Pontscho/fresh!mindworkz */ +- +-#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) +- +-#undef HAVE_MMX1 +-#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE) +-/* means: mmx v.1. Note: Since we added alignment of destinition it speedups +- of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus +- standard (non MMX-optimized) version. +- Note: on K6-2+ it speedups memory copying upto 25% and +- on K7 and P3 about 500% (5 times). */ +-#define HAVE_MMX1 +-#endif +- +- +-#undef HAVE_K6_2PLUS +-#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW) +-#define HAVE_K6_2PLUS +-#endif +- +-/* for small memory blocks (<256 bytes) this version is faster */ +-#define small_memcpy(to,from,n)\ +-{\ +-register unsigned long int dummy;\ +-__asm__ __volatile__(\ +- "rep; movsb"\ +- :"=&D"(to), "=&S"(from), "=&c"(dummy)\ +-/* It's most portable way to notify compiler */\ +-/* that edi, esi and ecx are clobbered in asm block. */\ +-/* Thanks to A'rpi for hint!!! */\ +- :"0" (to), "1" (from),"2" (n)\ +- : "memory");\ +-} +- +-#ifdef HAVE_SSE +-#define MMREG_SIZE 16 +-#else +-#define MMREG_SIZE 64 /*8*/ +-#endif +- +-/* Small defines (for readability only) ;) */ +-#ifdef HAVE_K6_2PLUS +-#define PREFETCH "prefetch" +-/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */ +-#define EMMS "femms" +-#else +-#define PREFETCH "prefetchnta" +-#define EMMS "emms" +-#endif +- +-#ifdef HAVE_MMX2 +-#define MOVNTQ "movntq" +-#else +-#define MOVNTQ "movq" +-#endif +- +-#ifdef HAVE_MMX1 +-#define MIN_LEN 0x800 /* 2K blocks */ +-#else +-#define MIN_LEN 0x40 /* 64-byte blocks */ +-#endif +- +-void * fast_memcpy(void * to, const void * from, size_t len) +-{ +- void *retval; +- size_t i; +- retval = to; +-#ifdef STATISTICS +- { +- static int freq[33]; +- static int t=0; +- int i; +- for(i=0; len>(1<= MIN_LEN) +- { +- register unsigned long int delta; +- /* Align destinition to MMREG_SIZE -boundary */ +- delta = ((unsigned long int)to)&(MMREG_SIZE-1); +- if(delta) +- { +- delta=MMREG_SIZE-delta; +- len -= delta; +- small_memcpy(to, from, delta); +- } +- i = len >> 6; /* len/64 */ +- len&=63; +- /* +- This algorithm is top effective when the code consequently +- reads and writes blocks which have size of cache line. +- Size of cache line is processor-dependent. +- It will, however, be a minimum of 32 bytes on any processors. +- It would be better to have a number of instructions which +- perform reading and writing to be multiple to a number of +- processor's decoders, but it's not always possible. +- */ +-#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */ +- if(((unsigned long)from) & 15) +- /* if SRC is misaligned */ +- for(; i>0; i--) +- { +- __asm__ __volatile__ ( +- PREFETCH" 320(%0)\n" +- "movups (%0), %%xmm0\n" +- "movups 16(%0), %%xmm1\n" +- "movups 32(%0), %%xmm2\n" +- "movups 48(%0), %%xmm3\n" +- "movntps %%xmm0, (%1)\n" +- "movntps %%xmm1, 16(%1)\n" +- "movntps %%xmm2, 32(%1)\n" +- "movntps %%xmm3, 48(%1)\n" +- :: "r" (from), "r" (to) : "memory"); +- ((const unsigned char *)from)+=64; +- ((unsigned char *)to)+=64; +- } +- else +- /* +- Only if SRC is aligned on 16-byte boundary. +- It allows to use movaps instead of movups, which required data +- to be aligned or a general-protection exception (#GP) is generated. +- */ +- for(; i>0; i--) +- { +- __asm__ __volatile__ ( +- PREFETCH" 320(%0)\n" +- "movaps (%0), %%xmm0\n" +- "movaps 16(%0), %%xmm1\n" +- "movaps 32(%0), %%xmm2\n" +- "movaps 48(%0), %%xmm3\n" +- "movntps %%xmm0, (%1)\n" +- "movntps %%xmm1, 16(%1)\n" +- "movntps %%xmm2, 32(%1)\n" +- "movntps %%xmm3, 48(%1)\n" +- :: "r" (from), "r" (to) : "memory"); +- ((const unsigned char *)from)+=64; +- ((unsigned char *)to)+=64; +- } +-#else +- /* Align destination at BLOCK_SIZE boundary */ +- for(; ((ptrdiff_t)to & (BLOCK_SIZE-1)) && i>0; i--) +- { +- __asm__ __volatile__ ( +-#ifndef HAVE_MMX1 +- PREFETCH" 320(%0)\n" +-#endif +- "movq (%0), %%mm0\n" +- "movq 8(%0), %%mm1\n" +- "movq 16(%0), %%mm2\n" +- "movq 24(%0), %%mm3\n" +- "movq 32(%0), %%mm4\n" +- "movq 40(%0), %%mm5\n" +- "movq 48(%0), %%mm6\n" +- "movq 56(%0), %%mm7\n" +- MOVNTQ" %%mm0, (%1)\n" +- MOVNTQ" %%mm1, 8(%1)\n" +- MOVNTQ" %%mm2, 16(%1)\n" +- MOVNTQ" %%mm3, 24(%1)\n" +- MOVNTQ" %%mm4, 32(%1)\n" +- MOVNTQ" %%mm5, 40(%1)\n" +- MOVNTQ" %%mm6, 48(%1)\n" +- MOVNTQ" %%mm7, 56(%1)\n" +- :: "r" (from), "r" (to) : "memory"); +- from = (const void *) (((const unsigned char *)from)+64); +- to = (void *) (((unsigned char *)to)+64); +- } +- +-/* printf(" %p %p\n", (ptrdiff_t)from&1023, (ptrdiff_t)to&1023); */ +- /* Pure Assembly cuz gcc is a bit unpredictable ;) */ +-# if 0 +- if(i>=BLOCK_SIZE/64) +- asm volatile( +- "xorl %%eax, %%eax \n\t" +- ".balign 16 \n\t" +- "1: \n\t" +- "movl (%0, %%eax), %%ebx \n\t" +- "movl 32(%0, %%eax), %%ebx \n\t" +- "movl 64(%0, %%eax), %%ebx \n\t" +- "movl 96(%0, %%eax), %%ebx \n\t" +- "addl $128, %%eax \n\t" +- "cmpl %3, %%eax \n\t" +- " jb 1b \n\t" +- +- "xorl %%eax, %%eax \n\t" +- +- ".balign 16 \n\t" +- "2: \n\t" +- "movq (%0, %%eax), %%mm0\n" +- "movq 8(%0, %%eax), %%mm1\n" +- "movq 16(%0, %%eax), %%mm2\n" +- "movq 24(%0, %%eax), %%mm3\n" +- "movq 32(%0, %%eax), %%mm4\n" +- "movq 40(%0, %%eax), %%mm5\n" +- "movq 48(%0, %%eax), %%mm6\n" +- "movq 56(%0, %%eax), %%mm7\n" +- MOVNTQ" %%mm0, (%1, %%eax)\n" +- MOVNTQ" %%mm1, 8(%1, %%eax)\n" +- MOVNTQ" %%mm2, 16(%1, %%eax)\n" +- MOVNTQ" %%mm3, 24(%1, %%eax)\n" +- MOVNTQ" %%mm4, 32(%1, %%eax)\n" +- MOVNTQ" %%mm5, 40(%1, %%eax)\n" +- MOVNTQ" %%mm6, 48(%1, %%eax)\n" +- MOVNTQ" %%mm7, 56(%1, %%eax)\n" +- "addl $64, %%eax \n\t" +- "cmpl %3, %%eax \n\t" +- "jb 2b \n\t" +- +-#if CONFUSION_FACTOR > 0 +- /* a few percent speedup on out of order executing CPUs */ +- "movl %5, %%eax \n\t" +- "2: \n\t" +- "movl (%0), %%ebx \n\t" +- "movl (%0), %%ebx \n\t" +- "movl (%0), %%ebx \n\t" +- "movl (%0), %%ebx \n\t" +- "decl %%eax \n\t" +- " jnz 2b \n\t" +-#endif +- +- "xorl %%eax, %%eax \n\t" +- "addl %3, %0 \n\t" +- "addl %3, %1 \n\t" +- "subl %4, %2 \n\t" +- "cmpl %4, %2 \n\t" +- " jae 1b \n\t" +- : "+r" (from), "+r" (to), "+r" (i) +- : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR) +- : "%eax", "%ebx" +- ); +-#endif +- +- for(; i>0; i--) +- { +- __asm__ __volatile__ ( +-#ifndef HAVE_MMX1 +- PREFETCH" 320(%0)\n" +-#endif +- "movq (%0), %%mm0\n" +- "movq 8(%0), %%mm1\n" +- "movq 16(%0), %%mm2\n" +- "movq 24(%0), %%mm3\n" +- "movq 32(%0), %%mm4\n" +- "movq 40(%0), %%mm5\n" +- "movq 48(%0), %%mm6\n" +- "movq 56(%0), %%mm7\n" +- MOVNTQ" %%mm0, (%1)\n" +- MOVNTQ" %%mm1, 8(%1)\n" +- MOVNTQ" %%mm2, 16(%1)\n" +- MOVNTQ" %%mm3, 24(%1)\n" +- MOVNTQ" %%mm4, 32(%1)\n" +- MOVNTQ" %%mm5, 40(%1)\n" +- MOVNTQ" %%mm6, 48(%1)\n" +- MOVNTQ" %%mm7, 56(%1)\n" +- :: "r" (from), "r" (to) : "memory"); +- from = (const void *) (((const unsigned char *)from)+64); +- to = (void *) (((unsigned char *)to)+64); +- } +- +-#endif /* Have SSE */ +-#ifdef HAVE_MMX2 +- /* since movntq is weakly-ordered, a "sfence" +- * is needed to become ordered again. */ +- __asm__ __volatile__ ("sfence":::"memory"); +-#endif +-#ifndef HAVE_SSE +- /* enables to use FPU */ +- __asm__ __volatile__ (EMMS:::"memory"); +-#endif +- } +- /* +- * Now do the tail of the block +- */ +- if(len) small_memcpy(to, from, len); +- return retval; +-} +- +- +-#endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */ +- +-#endif +diff --git a/xbmc/utils/fastmemcpy.h b/xbmc/utils/fastmemcpy.h +deleted file mode 100644 +index 43f5904..0000000 +--- a/xbmc/utils/fastmemcpy.h ++++ /dev/null +@@ -1,35 +0,0 @@ +-/* +- * Copyright (C) 2005-2013 Team XBMC +- * http://xbmc.org +- * +- * This Program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License as published by +- * the Free Software Foundation; either version 2, or (at your option) +- * any later version. +- * +- * This Program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with XBMC; see the file COPYING. If not, see +- * . +- * +- */ +-#pragma once +- +-#ifdef __cplusplus +-extern "C" { +-#endif +- +-#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) +-void * fast_memcpy(void * to, const void * from, size_t len); +-//#define fast_memcpy memcpy +-#else +-#define fast_memcpy memcpy +-#endif +- +-#ifdef __cplusplus +-} +-#endif +diff --git a/xbmc/utils/test/Makefile b/xbmc/utils/test/Makefile +index 8fa0526..3a467ad 100644 +--- a/xbmc/utils/test/Makefile ++++ b/xbmc/utils/test/Makefile +@@ -11,7 +11,6 @@ SRCS= \ + TestCryptThreading.cpp \ + TestDatabaseUtils.cpp \ + TestEndianSwap.cpp \ +- Testfastmemcpy.cpp \ + TestFileOperationJob.cpp \ + TestFileUtils.cpp \ + Testfstrcmp.cpp \ +diff --git a/xbmc/utils/test/Testfastmemcpy.cpp b/xbmc/utils/test/Testfastmemcpy.cpp +deleted file mode 100644 +index 93a9bb0..0000000 +--- a/xbmc/utils/test/Testfastmemcpy.cpp ++++ /dev/null +@@ -1,39 +0,0 @@ +-/* +- * Copyright (C) 2005-2013 Team XBMC +- * http://xbmc.org +- * +- * This Program is free software; you can redistribute it and/or modify +- * it under the terms of the GNU General Public License as published by +- * the Free Software Foundation; either version 2, or (at your option) +- * any later version. +- * +- * This Program is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +- * GNU General Public License for more details. +- * +- * You should have received a copy of the GNU General Public License +- * along with XBMC; see the file COPYING. If not, see +- * . +- * +- */ +- +-#include // TODO: This should go in fastmemcpy.h instead. +-#include "utils/fastmemcpy.h" +- +-#include "gtest/gtest.h" +- +-static const char refdata[] = "\x01\x02\x03\x04\x05\x06\x07\x08" +- "\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10" +- "\x11\x12\x13\x14\x15\x16\x17\x18" +- "\x19\x1a\x1b\x1c\x1d\x1e\x1f\x20" +- "\x21\x22\x23\x24\x25\x26\x27\x28" +- "\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30"; +- +-TEST(Testfastmemcpy, General) +-{ +- char vardata[sizeof(refdata)]; +- memset(vardata, 0, sizeof(vardata)); +- EXPECT_NE(nullptr, fast_memcpy(vardata, refdata, sizeof(refdata))); +- EXPECT_EQ(0, memcmp(refdata, vardata, sizeof(refdata))); +-} diff --git a/projects/RPi/patches/kodi/kodi-001-isengard-rpb-backports.patch b/projects/RPi/patches/kodi/kodi-001-isengard-rpb-backports.patch index 961fcff721..c60b9d9c25 100644 --- a/projects/RPi/patches/kodi/kodi-001-isengard-rpb-backports.patch +++ b/projects/RPi/patches/kodi/kodi-001-isengard-rpb-backports.patch @@ -3110,29 +3110,6 @@ index f9b9232..33aa88c 100644 if (pts == DVD_NOPTS_VALUE) pts = dts; -From bd332e5190d098ab8d22309eec31c0a3a8a5dfa9 Mon Sep 17 00:00:00 2001 -From: popcornmix -Date: Fri, 12 Jun 2015 17:27:47 +0100 -Subject: [PATCH 48/51] [rbp] Disable fast_memcpy which is slower than memcpy - ---- - xbmc/utils/fastmemcpy.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xbmc/utils/fastmemcpy.h b/xbmc/utils/fastmemcpy.h -index 43f5904..6d872b1 100644 ---- a/xbmc/utils/fastmemcpy.h -+++ b/xbmc/utils/fastmemcpy.h -@@ -23,7 +23,7 @@ - extern "C" { - #endif - --#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) -+#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) && !defined(TARGET_RASPBERRY_PI) - void * fast_memcpy(void * to, const void * from, size_t len); - //#define fast_memcpy memcpy - #else - From 493d0d8dfac375bedb0e80c08213bb45a714a4bb Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 10 Jun 2015 20:42:03 +0100 diff --git a/projects/RPi2/patches/kodi/kodi-001-isengard-rpb-backports.patch b/projects/RPi2/patches/kodi/kodi-001-isengard-rpb-backports.patch index 961fcff721..c60b9d9c25 100644 --- a/projects/RPi2/patches/kodi/kodi-001-isengard-rpb-backports.patch +++ b/projects/RPi2/patches/kodi/kodi-001-isengard-rpb-backports.patch @@ -3110,29 +3110,6 @@ index f9b9232..33aa88c 100644 if (pts == DVD_NOPTS_VALUE) pts = dts; -From bd332e5190d098ab8d22309eec31c0a3a8a5dfa9 Mon Sep 17 00:00:00 2001 -From: popcornmix -Date: Fri, 12 Jun 2015 17:27:47 +0100 -Subject: [PATCH 48/51] [rbp] Disable fast_memcpy which is slower than memcpy - ---- - xbmc/utils/fastmemcpy.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/xbmc/utils/fastmemcpy.h b/xbmc/utils/fastmemcpy.h -index 43f5904..6d872b1 100644 ---- a/xbmc/utils/fastmemcpy.h -+++ b/xbmc/utils/fastmemcpy.h -@@ -23,7 +23,7 @@ - extern "C" { - #endif - --#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) -+#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) && !defined(TARGET_RASPBERRY_PI) - void * fast_memcpy(void * to, const void * from, size_t len); - //#define fast_memcpy memcpy - #else - From 493d0d8dfac375bedb0e80c08213bb45a714a4bb Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 10 Jun 2015 20:42:03 +0100