From 34d4ce847631f7c1f53657bafc32da1d1c2d495c Mon Sep 17 00:00:00 2001
From: Stephan Raue <stephan@openelec.tv>
Date: Sun, 14 Jun 2015 19:50:12 +0200
Subject: [PATCH] kodi: add PR7280

Signed-off-by: Stephan Raue <stephan@openelec.tv>
---
 .../kodi/patches/kodi-999.22-PR7280.patch     | 1364 +++++++++++++++++
 .../kodi-001-isengard-rpb-backports.patch     |   23 -
 .../kodi-001-isengard-rpb-backports.patch     |   23 -
 3 files changed, 1364 insertions(+), 46 deletions(-)
 create mode 100644 packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch
diff --git a/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch b/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch
new file mode 100644
index 0000000000..0cfe4bbd31
--- /dev/null
+++ b/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch
@@ -0,0 +1,1364 @@
+From 224c1919ad3f68e23e817f41036687343f34aaae Mon Sep 17 00:00:00 2001
+From: popcornmix <popcornmix@gmail.com>
+Date: Fri, 12 Jun 2015 17:27:47 +0100
+Subject: [PATCH] [utils] Disable fast_memcpy which is slower than memcpy
+
+The default glibc memcpy is likely to be better tuned than this code
+which hasn't been touched for four years.
+
+In a test with software video decode on Pi2 the skipped frames went
+from 189 to 172 when fast_memcpy was disabled.
+---
+ Kodi.xcodeproj/project.pbxproj                     |   6 -
+ project/VS2010Express/XBMC.vcxproj                 |   4 -
+ project/VS2010Express/XBMC.vcxproj.filters         |   3 -
+ xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp    |   1 -
+ xbmc/cores/VideoRenderers/RenderCapture.cpp        |   7 +-
+ xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp   |  33 +-
+ .../Video/libstagefrightICS/StageFrightVideo.cpp   |   3 +-
+ xbmc/utils/Makefile.in                             |   2 -
+ xbmc/utils/fastmemcpy-arm.S                        | 528 ---------------------
+ xbmc/utils/fastmemcpy.c                            | 396 ----------------
+ xbmc/utils/fastmemcpy.h                            |  35 --
+ xbmc/utils/test/Makefile                           |   1 -
+ xbmc/utils/test/Testfastmemcpy.cpp                 |  39 --
+ 13 files changed, 20 insertions(+), 1038 deletions(-)
+ delete mode 100644 xbmc/utils/fastmemcpy-arm.S
+ delete mode 100644 xbmc/utils/fastmemcpy.c
+ delete mode 100644 xbmc/utils/fastmemcpy.h
+ delete mode 100644 xbmc/utils/test/Testfastmemcpy.cpp
+
+diff --git a/Kodi.xcodeproj/project.pbxproj b/Kodi.xcodeproj/project.pbxproj
+index 395c4ea..ce5a7f7 100644
+--- a/Kodi.xcodeproj/project.pbxproj
++++ b/Kodi.xcodeproj/project.pbxproj
+@@ -3192,7 +3192,6 @@
+ 		F5E55B5D10741272006E788A /* DVDPlayerTeletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B5B10741272006E788A /* DVDPlayerTeletext.cpp */; };
+ 		F5E55B66107412DE006E788A /* GUIDialogTeletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B65107412DE006E788A /* GUIDialogTeletext.cpp */; };
+ 		F5E55B7010741340006E788A /* Teletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B6E10741340006E788A /* Teletext.cpp */; };
+-		F5E5697310803FC3006E788A /* fastmemcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = F5E5697210803FC3006E788A /* fastmemcpy.c */; };
+ 		F5E56BA61082A675006E788A /* PosixMountProvider.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E56BA51082A675006E788A /* PosixMountProvider.cpp */; };
+ 		F5EA02260F6DA990005C2EC5 /* CocoaPowerSyscall.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5EA02200F6DA85C005C2EC5 /* CocoaPowerSyscall.cpp */; };
+ 		F5EA02270F6DA9A5005C2EC5 /* PowerManager.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5EA021A0F6DA7E8005C2EC5 /* PowerManager.cpp */; };
+@@ -3632,7 +3631,6 @@
+ 		43348AAB1077486D00F859CF /* PlayerSelectionRule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PlayerSelectionRule.h; path = playercorefactory/PlayerSelectionRule.h; sourceTree = "<group>"; };
+ 		436721A612D66A09002508E6 /* IAnnouncer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = IAnnouncer.h; sourceTree = "<group>"; };
+ 		436B38F3106628850049AB3B /* EndianSwap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = EndianSwap.h; sourceTree = "<group>"; };
+-		43BF09DD1080D39300E25290 /* fastmemcpy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fastmemcpy.h; sourceTree = "<group>"; };
+ 		43FAC87112D6349400F67914 /* IStorageProvider.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = IStorageProvider.h; sourceTree = "<group>"; };
+ 		551C3A43175A12010051AAAD /* VDA.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = VDA.cpp; sourceTree = "<group>"; };
+ 		551C3A44175A12010051AAAD /* VDA.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = VDA.h; sourceTree = "<group>"; };
+@@ -5735,7 +5733,6 @@
+ 		F5E55B6D10741340006E788A /* Teletext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Teletext.h; sourceTree = "<group>"; };
+ 		F5E55B6E10741340006E788A /* Teletext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Teletext.cpp; sourceTree = "<group>"; };
+ 		F5E55B6F10741340006E788A /* TeletextDefines.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = TeletextDefines.h; sourceTree = "<group>"; };
+-		F5E5697210803FC3006E788A /* fastmemcpy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fastmemcpy.c; sourceTree = "<group>"; };
+ 		F5E56BA41082A675006E788A /* PosixMountProvider.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PosixMountProvider.h; sourceTree = "<group>"; };
+ 		F5E56BA51082A675006E788A /* PosixMountProvider.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PosixMountProvider.cpp; sourceTree = "<group>"; };
+ 		F5EA021A0F6DA7E8005C2EC5 /* PowerManager.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PowerManager.cpp; sourceTree = "<group>"; };
+@@ -9202,8 +9199,6 @@
+ 				DF529BAD1741697B00523FB4 /* Environment.h */,
+ 				E36C29E90DA72486001F0C9D /* Fanart.cpp */,
+ 				6E97BDC30DA2B620003A2A89 /* Fanart.h */,
+-				F5E5697210803FC3006E788A /* fastmemcpy.c */,
+-				43BF09DD1080D39300E25290 /* fastmemcpy.h */,
+ 				F5F244641110DC6B009126C6 /* FileOperationJob.cpp */,
+ 				F5F244631110DC6B009126C6 /* FileOperationJob.h */,
+ 				F5F245EC1112C9AB009126C6 /* FileUtils.cpp */,
+@@ -10519,7 +10514,6 @@
+ 				43348AAE1077486D00F859CF /* PlayerCoreFactory.cpp in Sources */,
+ 				43348AAF1077486D00F859CF /* PlayerSelectionRule.cpp in Sources */,
+ 				7CAA20511079C8160096DE39 /* BaseRenderer.cpp in Sources */,
+-				F5E5697310803FC3006E788A /* fastmemcpy.c in Sources */,
+ 				55D3604E1826CAB900DA66D2 /* OverlayRendererGUI.cpp in Sources */,
+ 				F5E56BA61082A675006E788A /* PosixMountProvider.cpp in Sources */,
+ 				7CAA25351085963B0096DE39 /* PasswordManager.cpp in Sources */,
+diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj
+index 2d37c57..e8e8dce 100644
+--- a/project/VS2010Express/XBMC.vcxproj
++++ b/project/VS2010Express/XBMC.vcxproj
+@@ -1439,10 +1439,6 @@
+       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+     </ClCompile>
+-    <ClCompile Include="..\..\xbmc\utils\test\Testfastmemcpy.cpp">
+-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
+-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
+-    </ClCompile>
+     <ClCompile Include="..\..\xbmc\utils\TimeSmoother.cpp" />
+     <ClCompile Include="..\..\xbmc\utils\TimeUtils.cpp" />
+     <ClCompile Include="..\..\xbmc\utils\URIUtils.cpp" />
+diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters
+index c858f32..cada31e 100644
+--- a/project/VS2010Express/XBMC.vcxproj.filters
++++ b/project/VS2010Express/XBMC.vcxproj.filters
+@@ -2371,9 +2371,6 @@
+     <ClCompile Include="..\..\xbmc\utils\test\TestEndianSwap.cpp">
+       <Filter>utils\test</Filter>
+     </ClCompile>
+-    <ClCompile Include="..\..\xbmc\utils\test\Testfastmemcpy.cpp">
+-      <Filter>utils\test</Filter>
+-    </ClCompile>
+     <ClCompile Include="..\..\xbmc\utils\test\TestFileOperationJob.cpp">
+       <Filter>utils\test</Filter>
+     </ClCompile>
+diff --git a/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp b/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp
+index 2b64121..fdad7f0 100644
+--- a/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp
++++ b/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp
+@@ -31,7 +31,6 @@
+ #include <locale.h>
+ #include "guilib/MatrixGLES.h"
+ #include "LinuxRendererGLES.h"
+-#include "utils/fastmemcpy.h"
+ #include "utils/MathUtils.h"
+ #include "utils/GLUtils.h"
+ #include "utils/log.h"
+diff --git a/xbmc/cores/VideoRenderers/RenderCapture.cpp b/xbmc/cores/VideoRenderers/RenderCapture.cpp
+index 603b68d..0456a27 100644
+--- a/xbmc/cores/VideoRenderers/RenderCapture.cpp
++++ b/xbmc/cores/VideoRenderers/RenderCapture.cpp
+@@ -21,7 +21,6 @@
+ #include "RenderCapture.h"
+ #include "utils/log.h"
+ #include "windowing/WindowingFactory.h"
+-#include "utils/fastmemcpy.h"
+ #include "settings/AdvancedSettings.h"
+ 
+ CRenderCaptureBase::CRenderCaptureBase()
+@@ -297,7 +296,7 @@ void CRenderCaptureGL::PboToBuffer()
+ 
+   if (pboPtr)
+   {
+-    fast_memcpy(m_pixels, pboPtr, m_bufferSize);
++    memcpy(m_pixels, pboPtr, m_bufferSize);
+     SetState(CAPTURESTATE_DONE);
+   }
+   else
+@@ -491,12 +490,12 @@ void CRenderCaptureDX::SurfaceToBuffer()
+     //if pitch is same, do a direct copy, otherwise copy one line at a time
+     if (lockedRect.Pitch == m_width * 4)
+     {
+-      fast_memcpy(m_pixels, lockedRect.pBits, m_width * m_height * 4);
++      memcpy(m_pixels, lockedRect.pBits, m_width * m_height * 4);
+     }
+     else
+     {
+       for (unsigned int y = 0; y < m_height; y++)
+-        fast_memcpy(m_pixels + y * m_width * 4, (uint8_t*)lockedRect.pBits + y * lockedRect.Pitch, m_width * 4);
++        memcpy(m_pixels + y * m_width * 4, (uint8_t*)lockedRect.pBits + y * lockedRect.Pitch, m_width * 4);
+     }
+     m_copySurface->UnlockRect();
+     SetState(CAPTURESTATE_DONE);
+diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
+index 56e68713..5f0e486 100644
+--- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
++++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp
+@@ -22,7 +22,6 @@
+ #include "DVDClock.h"
+ #include "cores/VideoRenderers/RenderManager.h"
+ #include "utils/log.h"
+-#include "utils/fastmemcpy.h"
+ #include "cores/FFmpeg.h"
+ #include "Util.h"
+ #ifdef HAS_DX
+@@ -95,7 +94,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc)
+ 
+   for (int y = 0; y < h; y++)
+   {
+-    fast_memcpy(d, s, w);
++    memcpy(d, s, w);
+     s += pSrc->iLineSize[0];
+     d += pDst->iLineSize[0];
+   }
+@@ -107,7 +106,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc)
+   d = pDst->data[1];
+   for (int y = 0; y < h; y++)
+   {
+-    fast_memcpy(d, s, w);
++    memcpy(d, s, w);
+     s += pSrc->iLineSize[1];
+     d += pDst->iLineSize[1];
+   }
+@@ -116,7 +115,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc)
+   d = pDst->data[2];
+   for (int y = 0; y < h; y++)
+   {
+-    fast_memcpy(d, s, w);
++    memcpy(d, s, w);
+     s += pSrc->iLineSize[2];
+     d += pDst->iLineSize[2];
+   }
+@@ -131,13 +130,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc)
+   int h = pImage->height;
+   if ((w == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0]))
+   {
+-    fast_memcpy(d, s, w*h);
++    memcpy(d, s, w*h);
+   }
+   else
+   {
+     for (int y = 0; y < h; y++)
+     {
+-      fast_memcpy(d, s, w);
++      memcpy(d, s, w);
+       s += pSrc->iLineSize[0];
+       d += pImage->stride[0];
+     }
+@@ -148,13 +147,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc)
+   h =(pImage->height >> pImage->cshift_y);
+   if ((w==pSrc->iLineSize[1]) && ((unsigned int) pSrc->iLineSize[1]==pImage->stride[1]))
+   {
+-    fast_memcpy(d, s, w*h);
++    memcpy(d, s, w*h);
+   }
+   else
+   {
+     for (int y = 0; y < h; y++)
+     {
+-      fast_memcpy(d, s, w);
++      memcpy(d, s, w);
+       s += pSrc->iLineSize[1];
+       d += pImage->stride[1];
+     }
+@@ -163,13 +162,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc)
+   d = pImage->plane[2];
+   if ((w==pSrc->iLineSize[2]) && ((unsigned int) pSrc->iLineSize[2]==pImage->stride[2]))
+   {
+-    fast_memcpy(d, s, w*h);
++    memcpy(d, s, w*h);
+   }
+   else
+   {
+     for (int y = 0; y < h; y++)
+     {
+-      fast_memcpy(d, s, w);
++      memcpy(d, s, w);
+       s += pSrc->iLineSize[2];
+       d += pImage->stride[2];
+     }
+@@ -207,7 +206,7 @@ DVDVideoPicture* CDVDCodecUtils::ConvertToNV12Picture(DVDVideoPicture *pSrc)
+       uint8_t *d = pPicture->data[0];
+       for (int y = 0; y < (int)pSrc->iHeight; y++)
+       {
+-        fast_memcpy(d, s, pSrc->iWidth);
++        memcpy(d, s, pSrc->iWidth);
+         s += pSrc->iLineSize[0];
+         d += pPicture->iLineSize[0];
+       }
+@@ -298,13 +297,13 @@ bool CDVDCodecUtils::CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc)
+   // Copy Y
+   if ((w == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0]))
+   {
+-    fast_memcpy(d, s, w*h);
++    memcpy(d, s, w*h);
+   }
+   else
+   {
+     for (int y = 0; y < h; y++)
+     {
+-      fast_memcpy(d, s, w);
++      memcpy(d, s, w);
+       s += pSrc->iLineSize[0];
+       d += pImage->stride[0];
+     }
+@@ -317,13 +316,13 @@ bool CDVDCodecUtils::CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc)
+   // Copy packed UV (width is same as for Y as it's both U and V components)
+   if ((w==pSrc->iLineSize[1]) && ((unsigned int) pSrc->iLineSize[1]==pImage->stride[1]))
+   {
+-    fast_memcpy(d, s, w*h);
++    memcpy(d, s, w*h);
+   }
+   else
+   {
+     for (int y = 0; y < h; y++)
+     {
+-      fast_memcpy(d, s, w);
++      memcpy(d, s, w);
+       s += pSrc->iLineSize[1];
+       d += pImage->stride[1];
+     }
+@@ -342,13 +341,13 @@ bool CDVDCodecUtils::CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture
+   // Copy YUYV
+   if ((w * 2 == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0]))
+   {
+-    fast_memcpy(d, s, w*h*2);
++    memcpy(d, s, w*h*2);
+   }
+   else
+   {
+     for (int y = 0; y < h; y++)
+     {
+-      fast_memcpy(d, s, w*2);
++      memcpy(d, s, w*2);
+       s += pSrc->iLineSize[0];
+       d += pImage->stride[0];
+     }
+diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp
+index 019bc7a..d5ca74f 100644
+--- a/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp
++++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp
+@@ -30,7 +30,6 @@
+ #include "guilib/GraphicContext.h"
+ #include "DVDClock.h"
+ #include "utils/log.h"
+-#include "utils/fastmemcpy.h"
+ #include "threads/Thread.h"
+ #include "threads/Event.h"
+ #include "Application.h"
+@@ -620,7 +619,7 @@ int  CStageFrightVideo::Decode(uint8_t *pData, int iSize, double dts, double pts
+       return VC_ERROR;
+     }
+ 
+-    fast_memcpy(frame->medbuf->data(), demuxer_content, demuxer_bytes);
++    memcpy(frame->medbuf->data(), demuxer_content, demuxer_bytes);
+     frame->medbuf->set_range(0, demuxer_bytes);
+     frame->medbuf->meta_data()->clear();
+     frame->medbuf->meta_data()->setInt64(kKeyTime, frame->pts);
+diff --git a/xbmc/utils/Makefile.in b/xbmc/utils/Makefile.in
+index 438f025..dbd3db9 100644
+--- a/xbmc/utils/Makefile.in
++++ b/xbmc/utils/Makefile.in
+@@ -17,8 +17,6 @@ SRCS += DatabaseUtils.cpp
+ SRCS += EndianSwap.cpp
+ SRCS += Environment.cpp
+ SRCS += Fanart.cpp
+-SRCS += fastmemcpy.c
+-SRCS += fastmemcpy-arm.S
+ SRCS += FileOperationJob.cpp
+ SRCS += FileUtils.cpp
+ SRCS += fstrcmp.c
+diff --git a/xbmc/utils/fastmemcpy-arm.S b/xbmc/utils/fastmemcpy-arm.S
+deleted file mode 100644
+index 6cb8b0c..0000000
+--- a/xbmc/utils/fastmemcpy-arm.S
++++ /dev/null
+@@ -1,528 +0,0 @@
+-/*
+- *      Copyright (C) 2008 The Android Open Source Project
+- *      All rights reserved.
+- *
+- *      Copyright (C) 2011-2013 Team XBMC
+- *      http://xbmc.org
+- *
+- *  This Program is free software; you can redistribute it and/or modify
+- *  it under the terms of the GNU General Public License as published by
+- *  the Free Software Foundation; either version 2, or (at your option)
+- *  any later version.
+- *
+- *  This Program is distributed in the hope that it will be useful,
+- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+- *  GNU General Public License for more details.
+- *
+- *  You should have received a copy of the GNU General Public License
+- *  along with XBMC; see the file COPYING.  If not, see
+- *  <http://www.gnu.org/licenses/>.
+- *
+- */
+-#if defined(__arm__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS)
+-#if defined(__ARM_NEON__)
+-
+-        .text
+-#ifndef __APPLE__
+-        .fpu    neon
+-        .global fast_memcpy
+-        .type fast_memcpy, %function
+-#else
+-        .globl _fast_memcpy
+-#endif
+-        .align 4
+-
+-/* a prefetch distance of 4 cache-lines works best experimentally */
+-#define CACHE_LINE_SIZE     64
+-#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)
+-
+-#ifndef __APPLE__
+-        .fnstart
+-        .save       {r0, lr}
+-fast_memcpy:
+-#else
+-_fast_memcpy:
+-#endif
+-        stmfd       sp!, {r0, lr}
+-
+-        /* start preloading as early as possible */
+-        pld         [r1, #(CACHE_LINE_SIZE*0)]
+-        pld         [r1, #(CACHE_LINE_SIZE*1)]
+-
+-        /* do we have at least 16-bytes to copy (needed for alignment below) */
+-        cmp         r2, #16
+-        blo         5f
+-
+-        /* align destination to half cache-line for the write-buffer */
+-        rsb         r3, r0, #0
+-        ands        r3, r3, #0xF
+-        beq         0f
+-
+-        /* copy up to 15-bytes (count in r3) */
+-        sub         r2, r2, r3
+-        movs        ip, r3, lsl #31
+-        ldrmib      lr, [r1], #1
+-        strmib      lr, [r0], #1
+-        ldrcsb      ip, [r1], #1
+-        ldrcsb      lr, [r1], #1
+-        strcsb      ip, [r0], #1
+-        strcsb      lr, [r0], #1
+-        movs        ip, r3, lsl #29
+-        bge         1f
+-        // copies 4 bytes, destination 32-bits aligned
+-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
+-1:      bcc         2f
+-        // copies 8 bytes, destination 64-bits aligned
+-        vld1.8      {d0}, [r1]!
+-        vst1.8      {d0}, [r0, :64]!
+-2:
+-
+-0:      /* preload immediately the next cache line, which we may need */
+-        pld         [r1, #(CACHE_LINE_SIZE*0)]
+-        pld         [r1, #(CACHE_LINE_SIZE*1)]
+-
+-        /* make sure we have at least 64 bytes to copy */
+-        subs        r2, r2, #64
+-        blo         2f
+-
+-        /* preload all the cache lines we need.
+-         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
+-         * ideally would would increase the distance in the main loop to
+-         * avoid the goofy code below. In practice this doesn't seem to make
+-         * a big difference.
+-         */
+-        pld         [r1, #(CACHE_LINE_SIZE*2)]
+-        pld         [r1, #(CACHE_LINE_SIZE*3)]
+-        pld         [r1, #(PREFETCH_DISTANCE)]
+-
+-1:      /* The main loop copies 64 bytes at a time */
+-        vld1.8      {d0  - d3},   [r1]!
+-        vld1.8      {d4  - d7},   [r1]!
+-        pld         [r1, #(PREFETCH_DISTANCE)]
+-        subs        r2, r2, #64
+-        vst1.8      {d0  - d3},   [r0, :128]!
+-        vst1.8      {d4  - d7},   [r0, :128]!
+-        bhs         1b
+-
+-2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
+-        add         r2, r2, #64
+-        subs        r2, r2, #32
+-        blo         4f
+-
+-3:      /* 32 bytes at a time. These cache lines were already preloaded */
+-        vld1.8      {d0 - d3},  [r1]!
+-        subs        r2, r2, #32
+-        vst1.8      {d0 - d3},  [r0, :128]!
+-        bhs         3b
+-
+-4:      /* less than 32 left */
+-        add         r2, r2, #32
+-        tst         r2, #0x10
+-        beq         5f
+-        // copies 16 bytes, 128-bits aligned
+-        vld1.8      {d0, d1}, [r1]!
+-        vst1.8      {d0, d1}, [r0, :128]!
+-
+-5:      /* copy up to 15-bytes (count in r2) */
+-        movs        ip, r2, lsl #29
+-        bcc         1f
+-        vld1.8      {d0}, [r1]!
+-        vst1.8      {d0}, [r0]!
+-1:      bge         2f
+-        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+-        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+-2:      movs        ip, r2, lsl #31
+-        ldrmib      r3, [r1], #1
+-        ldrcsb      ip, [r1], #1
+-        ldrcsb      lr, [r1], #1
+-        strmib      r3, [r0], #1
+-        strcsb      ip, [r0], #1
+-        strcsb      lr, [r0], #1
+-
+-        ldmfd       sp!, {r0, lr}
+-        bx          lr
+-#ifndef __APPLE__
+-        .fnend
+-#endif
+-
+-#else   /* __ARM_ARCH__ < 7 */
+-
+-
+-	.text
+-
+-#ifndef __APPLE__
+-    .global fast_memcpy
+-    .type fast_memcpy, %function
+-#else
+-    .globl _fast_memcpy
+-#endif
+-    .align 4
+-
+-		/*
+-		 * Optimized memcpy() for ARM.
+-         *
+-		 * note that memcpy() always returns the destination pointer,
+-		 * so we have to preserve R0.
+-		 */
+-
+-#ifndef __APPLE__
+-fast_memcpy:
+-#else
+-_fast_memcpy:
+-#endif
+-		/* The stack must always be 64-bits aligned to be compliant with the
+-		 * ARM ABI. Since we have to save R0, we might as well save R4
+-		 * which we can use for better pipelining of the reads below
+-		 */
+-#ifndef __APPLE__
+-        .fnstart
+-        .save       {r0, r4, lr}
+-#endif
+-        stmfd       sp!, {r0, r4, lr}
+-        /* Making room for r5-r11 which will be spilled later */
+-        .pad        #28
+-        sub         sp, sp, #28
+-
+-        // preload the destination because we'll align it to a cache line
+-        // with small writes. Also start the source "pump".
+-        //PLD         (r0, #0)
+-        //PLD         (r1, #0)
+-        //PLD         (r1, #32)
+-
+-		/* it simplifies things to take care of len<4 early */
+-		cmp			r2, #4
+-		blo			copy_last_3_and_return
+-
+-		/* compute the offset to align the source
+-		 * offset = (4-(src&3))&3 = -src & 3
+-		 */
+-		rsb			r3, r1, #0
+-		ands		r3, r3, #3
+-		beq			src_aligned
+-
+-		/* align source to 32 bits. We need to insert 2 instructions between
+-		 * a ldr[b|h] and str[b|h] because byte and half-word instructions
+-		 * stall 2 cycles.
+-		 */
+-		movs		r12, r3, lsl #31
+-		sub			r2, r2, r3		/* we know that r3 <= r2 because r2 >= 4 */
+-		ldrmib		r3, [r1], #1
+-		ldrcsb		r4, [r1], #1
+-		ldrcsb		r12,[r1], #1
+-        strmib		r3, [r0], #1
+-		strcsb		r4, [r0], #1
+-		strcsb		r12,[r0], #1
+-
+-src_aligned:
+-
+-		/* see if src and dst are aligned together (congruent) */
+-		eor			r12, r0, r1
+-		tst			r12, #3
+-		bne			non_congruent
+-
+-        /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
+-         * frame. Don't update sp.
+-         */
+-        stmea		sp, {r5-r11}
+-
+-		/* align the destination to a cache-line */
+-		rsb         r3, r0, #0
+-		ands		r3, r3, #0x1C
+-		beq         congruent_aligned32
+-		cmp         r3, r2
+-		andhi		r3, r2, #0x1C
+-
+-		/* conditionnaly copies 0 to 7 words (length in r3) */
+-		movs		r12, r3, lsl #28
+-		ldmcsia		r1!, {r4, r5, r6, r7}	/* 16 bytes */
+-		ldmmiia		r1!, {r8, r9}			/*  8 bytes */
+-		stmcsia		r0!, {r4, r5, r6, r7}
+-		stmmiia		r0!, {r8, r9}
+-		tst         r3, #0x4
+-		ldrne		r10,[r1], #4			/*  4 bytes */
+-		strne		r10,[r0], #4
+-		sub         r2, r2, r3
+-
+-congruent_aligned32:
+-		/*
+-		 * here source is aligned to 32 bytes.
+-		 */
+-
+-cached_aligned32:
+-        subs        r2, r2, #32
+-        blo         less_than_32_left
+-
+-        /*
+-         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
+-         * stall only until the requested world is fetched, but the linefill
+-         * continues in the the background.
+-         * While the linefill is going, we write our previous cache-line
+-         * into the write-buffer (which should have some free space).
+-         * When the linefill is done, the writebuffer will
+-         * start dumping its content into memory
+-         *
+-         * While all this is going, we then load a full cache line into
+-         * 8 registers, this cache line should be in the cache by now
+-         * (or partly in the cache).
+-         *
+-         * This code should work well regardless of the source/dest alignment.
+-         *
+-         */
+-
+-        // Align the preload register to a cache-line because the cpu does
+-        // "critical word first" (the first word requested is loaded first).
+-        bic         r12, r1, #0x1F
+-        add         r12, r12, #64
+-
+-1:      ldmia       r1!, { r4-r11 }
+-        //PLD         (r12, #64)
+-        subs        r2, r2, #32
+-
+-        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
+-        // for ARM9 preload will not be safely guarded by the preceding subs.
+-        // When it is safely guarded the only possibility to have SIGSEGV here
+-        // is because the caller overstates the length.
+-        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
+-        stmia       r0!, { r4-r11 }
+-		bhs         1b
+-
+-        add         r2, r2, #32
+-
+-
+-
+-
+-less_than_32_left:
+-		/*
+-		 * less than 32 bytes left at this point (length in r2)
+-		 */
+-
+-		/* skip all this if there is nothing to do, which should
+-		 * be a common case (if not executed the code below takes
+-		 * about 16 cycles)
+-		 */
+-		tst			r2, #0x1F
+-		beq			1f
+-
+-		/* conditionnaly copies 0 to 31 bytes */
+-		movs		r12, r2, lsl #28
+-		ldmcsia		r1!, {r4, r5, r6, r7}	/* 16 bytes */
+-		ldmmiia		r1!, {r8, r9}			/*  8 bytes */
+-		stmcsia		r0!, {r4, r5, r6, r7}
+-		stmmiia		r0!, {r8, r9}
+-		movs		r12, r2, lsl #30
+-		ldrcs		r3, [r1], #4			/*  4 bytes */
+-		ldrmih		r4, [r1], #2			/*  2 bytes */
+-		strcs		r3, [r0], #4
+-		strmih		r4, [r0], #2
+-		tst         r2, #0x1
+-		ldrneb		r3, [r1]				/*  last byte  */
+-		strneb		r3, [r0]
+-
+-		/* we're done! restore everything and return */
+-1:		ldmfd		sp!, {r5-r11}
+-		ldmfd		sp!, {r0, r4, lr}
+-		bx			lr
+-
+-		/********************************************************************/
+-
+-non_congruent:
+-		/*
+-		 * here source is aligned to 4 bytes
+-		 * but destination is not.
+-		 *
+-		 * in the code below r2 is the number of bytes read
+-		 * (the number of bytes written is always smaller, because we have
+-		 * partial words in the shift queue)
+-		 */
+-		cmp			r2, #4
+-		blo			copy_last_3_and_return
+-
+-        /* Use post-incriment mode for stm to spill r5-r11 to reserved stack
+-         * frame. Don't update sp.
+-         */
+-        stmea		sp, {r5-r11}
+-
+-		/* compute shifts needed to align src to dest */
+-		rsb			r5, r0, #0
+-		and			r5, r5, #3			/* r5 = # bytes in partial words */
+-		mov			r12, r5, lsl #3		/* r12 = right */
+-		rsb			lr, r12, #32		/* lr = left  */
+-
+-		/* read the first word */
+-		ldr			r3, [r1], #4
+-		sub			r2, r2, #4
+-
+-		/* write a partial word (0 to 3 bytes), such that destination
+-		 * becomes aligned to 32 bits (r5 = nb of words to copy for alignment)
+-		 */
+-		movs		r5, r5, lsl #31
+-		strmib		r3, [r0], #1
+-		movmi		r3, r3, lsr #8
+-		strcsb		r3, [r0], #1
+-		movcs		r3, r3, lsr #8
+-		strcsb		r3, [r0], #1
+-		movcs		r3, r3, lsr #8
+-
+-		cmp			r2, #4
+-		blo			partial_word_tail
+-
+-		/* Align destination to 32 bytes (cache line boundary) */
+-1:		tst			r0, #0x1c
+-		beq			2f
+-		ldr			r5, [r1], #4
+-		sub         r2, r2, #4
+-		orr			r4, r3, r5,		lsl lr
+-		mov			r3, r5,			lsr r12
+-		str			r4, [r0], #4
+-        cmp         r2, #4
+-		bhs			1b
+-		blo			partial_word_tail
+-
+-		/* copy 32 bytes at a time */
+-2:		subs		r2, r2, #32
+-		blo			less_than_thirtytwo
+-
+-		/* Use immediate mode for the shifts, because there is an extra cycle
+-		 * for register shifts, which could account for up to 50% of
+-		 * performance hit.
+-		 */
+-
+-        cmp			r12, #24
+-		beq			loop24
+-		cmp			r12, #8
+-		beq			loop8
+-
+-loop16:
+-        ldr         r12, [r1], #4
+-1:      mov         r4, r12
+-		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
+-        //PLD         (r1, #64)
+-        subs        r2, r2, #32
+-        ldrhs       r12, [r1], #4
+-		orr			r3, r3, r4,		lsl #16
+-		mov			r4, r4,			lsr #16
+-		orr			r4, r4, r5,		lsl #16
+-		mov			r5, r5,			lsr #16
+-		orr			r5, r5, r6,		lsl #16
+-		mov			r6, r6,			lsr #16
+-		orr			r6, r6, r7,		lsl #16
+-		mov			r7, r7,			lsr #16
+-		orr			r7, r7, r8,		lsl #16
+-		mov			r8, r8,			lsr #16
+-		orr			r8, r8, r9,		lsl #16
+-		mov			r9, r9,			lsr #16
+-		orr			r9, r9, r10,	lsl #16
+-		mov			r10, r10,		lsr #16
+-		orr			r10, r10, r11,	lsl #16
+-		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+-		mov			r3, r11,		lsr #16
+-		bhs			1b
+-		b			less_than_thirtytwo
+-
+-loop8:
+-        ldr         r12, [r1], #4
+-1:      mov         r4, r12
+-		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
+-        //PLD         (r1, #64)
+-		subs		r2, r2, #32
+-        ldrhs       r12, [r1], #4
+-		orr			r3, r3, r4,		lsl #24
+-		mov			r4, r4,			lsr #8
+-		orr			r4, r4, r5,		lsl #24
+-		mov			r5, r5,			lsr #8
+-		orr			r5, r5, r6,		lsl #24
+-		mov			r6, r6,			lsr #8
+-		orr			r6, r6, r7,		lsl #24
+-		mov			r7, r7,			lsr #8
+-		orr			r7, r7, r8,		lsl #24
+-		mov			r8, r8,			lsr #8
+-		orr			r8, r8, r9,		lsl #24
+-		mov			r9, r9,			lsr #8
+-		orr			r9, r9, r10,	lsl #24
+-		mov			r10, r10,		lsr #8
+-		orr			r10, r10, r11,	lsl #24
+-		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+-		mov			r3, r11,		lsr #8
+-		bhs			1b
+-		b			less_than_thirtytwo
+-
+-loop24:
+-        ldr         r12, [r1], #4
+-1:      mov         r4, r12
+-		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
+-        //PLD         (r1, #64)
+-		subs		r2, r2, #32
+-        ldrhs       r12, [r1], #4
+-		orr			r3, r3, r4,		lsl #8
+-		mov			r4, r4,			lsr #24
+-		orr			r4, r4, r5,		lsl #8
+-		mov			r5, r5,			lsr #24
+-		orr			r5, r5, r6,		lsl #8
+-		mov			r6, r6,			lsr #24
+-		orr			r6, r6, r7,		lsl #8
+-		mov			r7, r7,			lsr #24
+-		orr			r7, r7, r8,		lsl #8
+-		mov			r8, r8,			lsr #24
+-		orr			r8, r8, r9,		lsl #8
+-		mov			r9, r9,			lsr #24
+-		orr			r9, r9, r10,	lsl #8
+-		mov			r10, r10,		lsr #24
+-		orr			r10, r10, r11,	lsl #8
+-		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
+-		mov			r3, r11,		lsr #24
+-		bhs			1b
+-
+-
+-less_than_thirtytwo:
+-		/* copy the last 0 to 31 bytes of the source */
+-		rsb			r12, lr, #32		/* we corrupted r12, recompute it  */
+-		add			r2, r2, #32
+-		cmp			r2, #4
+-		blo			partial_word_tail
+-
+-1:		ldr			r5, [r1], #4
+-		sub         r2, r2, #4
+-		orr			r4, r3, r5,		lsl lr
+-		mov			r3,	r5,			lsr r12
+-		str			r4, [r0], #4
+-        cmp         r2, #4
+-		bhs			1b
+-
+-partial_word_tail:
+-		/* we have a partial word in the input buffer */
+-		movs		r5, lr, lsl #(31-3)
+-		strmib		r3, [r0], #1
+-		movmi		r3, r3, lsr #8
+-		strcsb		r3, [r0], #1
+-		movcs		r3, r3, lsr #8
+-		strcsb		r3, [r0], #1
+-
+-		/* Refill spilled registers from the stack. Don't update sp. */
+-		ldmfd		sp, {r5-r11}
+-
+-copy_last_3_and_return:
+-		movs		r2, r2, lsl #31	/* copy remaining 0, 1, 2 or 3 bytes */
+-		ldrmib		r2, [r1], #1
+-		ldrcsb		r3, [r1], #1
+-		ldrcsb		r12,[r1]
+-		strmib		r2, [r0], #1
+-		strcsb		r3, [r0], #1
+-		strcsb		r12,[r0]
+-
+-        /* we're done! restore sp and spilled registers and return */
+-        add         sp,  sp, #28
+-		ldmfd		sp!, {r0, r4, lr}
+-		bx			lr
+-#ifndef __APPLE__
+-        .fnend
+-#endif
+-
+-#endif    /* __ARM_ARCH__ < 7 */
+-#endif
+-
+-#if defined(__linux__) && defined(__ELF__)
+-/* we don't need an executable stack */
+-.section .note.GNU-stack,"",%progbits
+-#endif
+diff --git a/xbmc/utils/fastmemcpy.c b/xbmc/utils/fastmemcpy.c
+deleted file mode 100644
+index ec9019a..0000000
+--- a/xbmc/utils/fastmemcpy.c
++++ /dev/null
+@@ -1,396 +0,0 @@
+-/*
+- * fastmemcpy.h : fast memcpy routines
+- *****************************************************************************
+- *      $Id: fastmemcpy.h 13905 2006-01-12 23:10:04Z dionoea $
+- *
+- *      Authors: various Linux kernel hackers
+- *               various MPlayer hackers
+- *               Nick Kurshev <nickols_k@mail.ru>
+- *
+- *      Copyright (C) 2011-2013 Team XBMC
+- *      http://xbmc.org
+- *
+- *  This Program is free software; you can redistribute it and/or modify
+- *  it under the terms of the GNU General Public License as published by
+- *  the Free Software Foundation; either version 2, or (at your option)
+- *  any later version.
+- *
+- *  This Program is distributed in the hope that it will be useful,
+- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+- *  GNU General Public License for more details.
+- *
+- *  You should have received a copy of the GNU General Public License
+- *  along with XBMC; see the file COPYING.  If not, see
+- *  <http://www.gnu.org/licenses/>.
+- *
+- */
+-#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__mips__)
+-#define HAVE_MMX2
+-#define HAVE_SSE
+-
+-/*
+-  aclib - advanced C library ;)
+-  This file contains functions which improve and expand standard C-library
+-*/
+-#include <stddef.h>
+-
+-#define BLOCK_SIZE 4096
+-#define CONFUSION_FACTOR 0
+-/*Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)*/
+-
+-/*#define STATISTICS*/
+-
+-#ifndef HAVE_SSE2
+-/*
+-   P3 processor has only one SSE decoder so can execute only 1 sse insn per
+-   cpu clock, but it has 3 mmx decoders (include load/store unit)
+-   and executes 3 mmx insns per cpu clock.
+-   P4 processor has some chances, but after reading:
+-   http://www.emulators.com/pentium4.htm
+-   I have doubts. Anyway SSE2 version of this code can be written better.
+-*/
+-#undef HAVE_SSE
+-#endif
+-
+-
+-/*
+- This part of code was taken by me from Linux-2.4.3 and slightly modified
+-for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
+-blocks but mplayer uses weakly ordered data and original sources can not
+-speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
+-
+->From IA-32 Intel Architecture Software Developer's Manual Volume 1,
+-
+-Order Number 245470:
+-"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
+-
+-Data referenced by a program can be temporal (data will be used again) or
+-non-temporal (data will be referenced once and not reused in the immediate
+-future). To make efficient use of the processor's caches, it is generally
+-desirable to cache temporal data and not cache non-temporal data. Overloading
+-the processor's caches with non-temporal data is sometimes referred to as
+-"polluting the caches".
+-The non-temporal data is written to memory with Write-Combining semantics.
+-
+-The PREFETCHh instructions permits a program to load data into the processor
+-at a suggested cache level, so that it is closer to the processors load and
+-store unit when it is needed. If the data is already present in a level of
+-the cache hierarchy that is closer to the processor, the PREFETCHh instruction
+-will not result in any data movement.
+-But we should you PREFETCHNTA: Non-temporal data fetch data into location
+-close to the processor, minimizing cache pollution.
+-
+-The MOVNTQ (store quadword using non-temporal hint) instruction stores
+-packed integer data from an MMX register to memory, using a non-temporal hint.
+-The MOVNTPS (store packed single-precision floating-point values using
+-non-temporal hint) instruction stores packed floating-point data from an
+-XMM register to memory, using a non-temporal hint.
+-
+-The SFENCE (Store Fence) instruction controls write ordering by creating a
+-fence for memory store operations. This instruction guarantees that the results
+-of every store instruction that precedes the store fence in program order is
+-globally visible before any store instruction that follows the fence. The
+-SFENCE instruction provides an efficient way of ensuring ordering between
+-procedures that produce weakly-ordered data and procedures that consume that
+-data.
+-
+-If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru.
+-*/
+-
+-/* 3dnow memcpy support from kernel 2.4.2 */
+-/*  by Pontscho/fresh!mindworkz           */
+-
+-#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX )
+-
+-#undef HAVE_MMX1
+-#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
+-/*  means: mmx v.1. Note: Since we added alignment of destinition it speedups
+-    of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus
+-    standard (non MMX-optimized) version.
+-    Note: on K6-2+ it speedups memory copying upto 25% and
+-          on K7 and P3 about 500% (5 times). */
+-#define HAVE_MMX1
+-#endif
+-
+-
+-#undef HAVE_K6_2PLUS
+-#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
+-#define HAVE_K6_2PLUS
+-#endif
+-
+-/* for small memory blocks (<256 bytes) this version is faster */
+-#define small_memcpy(to,from,n)\
+-{\
+-register unsigned long int dummy;\
+-__asm__ __volatile__(\
+-	"rep; movsb"\
+-	:"=&D"(to), "=&S"(from), "=&c"(dummy)\
+-/* It's most portable way to notify compiler */\
+-/* that edi, esi and ecx are clobbered in asm block. */\
+-/* Thanks to A'rpi for hint!!! */\
+-        :"0" (to), "1" (from),"2" (n)\
+-	: "memory");\
+-}
+-
+-#ifdef HAVE_SSE
+-#define MMREG_SIZE 16
+-#else
+-#define MMREG_SIZE 64 /*8*/
+-#endif
+-
+-/* Small defines (for readability only) ;) */
+-#ifdef HAVE_K6_2PLUS
+-#define PREFETCH "prefetch"
+-/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
+-#define EMMS     "femms"
+-#else
+-#define PREFETCH "prefetchnta"
+-#define EMMS     "emms"
+-#endif
+-
+-#ifdef HAVE_MMX2
+-#define MOVNTQ "movntq"
+-#else
+-#define MOVNTQ "movq"
+-#endif
+-
+-#ifdef HAVE_MMX1
+-#define MIN_LEN 0x800  /* 2K blocks */
+-#else
+-#define MIN_LEN 0x40  /* 64-byte blocks */
+-#endif
+-
+-void * fast_memcpy(void * to, const void * from, size_t len)
+-{
+-	void *retval;
+-	size_t i;
+-	retval = to;
+-#ifdef STATISTICS
+-	{
+-		static int freq[33];
+-		static int t=0;
+-		int i;
+-		for(i=0; len>(1<<i); i++);
+-		freq[i]++;
+-		t++;
+-		if(1024*1024*1024 % t == 0)
+-			for(i=0; i<32; i++)
+-				printf("freq < %8d %4d\n", 1<<i, freq[i]);
+-	}
+-#endif
+-#ifndef HAVE_MMX1
+-        /* PREFETCH has effect even for MOVSB instruction ;) */
+-	__asm__ __volatile__ (
+-	        PREFETCH" (%0)\n"
+-	        PREFETCH" 64(%0)\n"
+-	        PREFETCH" 128(%0)\n"
+-        	PREFETCH" 192(%0)\n"
+-        	PREFETCH" 256(%0)\n"
+-		: : "r" (from) );
+-#endif
+-        if(len >= MIN_LEN)
+-	{
+-	  register unsigned long int delta;
+-          /* Align destinition to MMREG_SIZE -boundary */
+-          delta = ((unsigned long int)to)&(MMREG_SIZE-1);
+-          if(delta)
+-	  {
+-	    delta=MMREG_SIZE-delta;
+-	    len -= delta;
+-	    small_memcpy(to, from, delta);
+-	  }
+-	  i = len >> 6; /* len/64 */
+-	  len&=63;
+-        /*
+-           This algorithm is top effective when the code consequently
+-           reads and writes blocks which have size of cache line.
+-           Size of cache line is processor-dependent.
+-           It will, however, be a minimum of 32 bytes on any processors.
+-           It would be better to have a number of instructions which
+-           perform reading and writing to be multiple to a number of
+-           processor's decoders, but it's not always possible.
+-        */
+-#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
+-	if(((unsigned long)from) & 15)
+-	/* if SRC is misaligned */
+-	for(; i>0; i--)
+-	{
+-		__asm__ __volatile__ (
+-		PREFETCH" 320(%0)\n"
+-		"movups (%0), %%xmm0\n"
+-		"movups 16(%0), %%xmm1\n"
+-		"movups 32(%0), %%xmm2\n"
+-		"movups 48(%0), %%xmm3\n"
+-		"movntps %%xmm0, (%1)\n"
+-		"movntps %%xmm1, 16(%1)\n"
+-		"movntps %%xmm2, 32(%1)\n"
+-		"movntps %%xmm3, 48(%1)\n"
+-		:: "r" (from), "r" (to) : "memory");
+-		((const unsigned char *)from)+=64;
+-		((unsigned char *)to)+=64;
+-	}
+-	else
+-	/*
+-	   Only if SRC is aligned on 16-byte boundary.
+-	   It allows to use movaps instead of movups, which required data
+-	   to be aligned or a general-protection exception (#GP) is generated.
+-	*/
+-	for(; i>0; i--)
+-	{
+-		__asm__ __volatile__ (
+-		PREFETCH" 320(%0)\n"
+-		"movaps (%0), %%xmm0\n"
+-		"movaps 16(%0), %%xmm1\n"
+-		"movaps 32(%0), %%xmm2\n"
+-		"movaps 48(%0), %%xmm3\n"
+-		"movntps %%xmm0, (%1)\n"
+-		"movntps %%xmm1, 16(%1)\n"
+-		"movntps %%xmm2, 32(%1)\n"
+-		"movntps %%xmm3, 48(%1)\n"
+-		:: "r" (from), "r" (to) : "memory");
+-		((const unsigned char *)from)+=64;
+-		((unsigned char *)to)+=64;
+-	}
+-#else
+-	/* Align destination at BLOCK_SIZE boundary */
+-	for(; ((ptrdiff_t)to & (BLOCK_SIZE-1)) && i>0; i--)
+-	{
+-		__asm__ __volatile__ (
+-#ifndef HAVE_MMX1
+-        	PREFETCH" 320(%0)\n"
+-#endif
+-		"movq (%0), %%mm0\n"
+-		"movq 8(%0), %%mm1\n"
+-		"movq 16(%0), %%mm2\n"
+-		"movq 24(%0), %%mm3\n"
+-		"movq 32(%0), %%mm4\n"
+-		"movq 40(%0), %%mm5\n"
+-		"movq 48(%0), %%mm6\n"
+-		"movq 56(%0), %%mm7\n"
+-		MOVNTQ" %%mm0, (%1)\n"
+-		MOVNTQ" %%mm1, 8(%1)\n"
+-		MOVNTQ" %%mm2, 16(%1)\n"
+-		MOVNTQ" %%mm3, 24(%1)\n"
+-		MOVNTQ" %%mm4, 32(%1)\n"
+-		MOVNTQ" %%mm5, 40(%1)\n"
+-		MOVNTQ" %%mm6, 48(%1)\n"
+-		MOVNTQ" %%mm7, 56(%1)\n"
+-		:: "r" (from), "r" (to) : "memory");
+-                from = (const void *) (((const unsigned char *)from)+64);
+-		to = (void *) (((unsigned char *)to)+64);
+-	}
+-
+-/*	printf(" %p %p\n", (ptrdiff_t)from&1023, (ptrdiff_t)to&1023); */
+-	/* Pure Assembly cuz gcc is a bit unpredictable ;) */
+-# if 0
+-	if(i>=BLOCK_SIZE/64)
+-		asm volatile(
+-			"xorl %%eax, %%eax	\n\t"
+-			".balign 16		\n\t"
+-			"1:			\n\t"
+-				"movl (%0, %%eax), %%ebx 	\n\t"
+-				"movl 32(%0, %%eax), %%ebx 	\n\t"
+-				"movl 64(%0, %%eax), %%ebx 	\n\t"
+-				"movl 96(%0, %%eax), %%ebx 	\n\t"
+-				"addl $128, %%eax		\n\t"
+-				"cmpl %3, %%eax			\n\t"
+-				" jb 1b				\n\t"
+-
+-			"xorl %%eax, %%eax	\n\t"
+-
+-				".balign 16		\n\t"
+-				"2:			\n\t"
+-				"movq (%0, %%eax), %%mm0\n"
+-				"movq 8(%0, %%eax), %%mm1\n"
+-				"movq 16(%0, %%eax), %%mm2\n"
+-				"movq 24(%0, %%eax), %%mm3\n"
+-				"movq 32(%0, %%eax), %%mm4\n"
+-				"movq 40(%0, %%eax), %%mm5\n"
+-				"movq 48(%0, %%eax), %%mm6\n"
+-				"movq 56(%0, %%eax), %%mm7\n"
+-				MOVNTQ" %%mm0, (%1, %%eax)\n"
+-				MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+-				MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+-				MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+-				MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+-				MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+-				MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+-				MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+-				"addl $64, %%eax		\n\t"
+-				"cmpl %3, %%eax		\n\t"
+-				"jb 2b				\n\t"
+-
+-#if CONFUSION_FACTOR > 0
+-	/* a few percent speedup on out of order executing CPUs */
+-			"movl %5, %%eax		\n\t"
+-				"2:			\n\t"
+-				"movl (%0), %%ebx	\n\t"
+-				"movl (%0), %%ebx	\n\t"
+-				"movl (%0), %%ebx	\n\t"
+-				"movl (%0), %%ebx	\n\t"
+-				"decl %%eax		\n\t"
+-				" jnz 2b		\n\t"
+-#endif
+-
+-			"xorl %%eax, %%eax	\n\t"
+-			"addl %3, %0		\n\t"
+-			"addl %3, %1		\n\t"
+-			"subl %4, %2		\n\t"
+-			"cmpl %4, %2		\n\t"
+-			" jae 1b		\n\t"
+-				: "+r" (from), "+r" (to), "+r" (i)
+-				: "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+-				: "%eax", "%ebx"
+-		);
+-#endif
+-
+-	for(; i>0; i--)
+-	{
+-		__asm__ __volatile__ (
+-#ifndef HAVE_MMX1
+-        	PREFETCH" 320(%0)\n"
+-#endif
+-		"movq (%0), %%mm0\n"
+-		"movq 8(%0), %%mm1\n"
+-		"movq 16(%0), %%mm2\n"
+-		"movq 24(%0), %%mm3\n"
+-		"movq 32(%0), %%mm4\n"
+-		"movq 40(%0), %%mm5\n"
+-		"movq 48(%0), %%mm6\n"
+-		"movq 56(%0), %%mm7\n"
+-		MOVNTQ" %%mm0, (%1)\n"
+-		MOVNTQ" %%mm1, 8(%1)\n"
+-		MOVNTQ" %%mm2, 16(%1)\n"
+-		MOVNTQ" %%mm3, 24(%1)\n"
+-		MOVNTQ" %%mm4, 32(%1)\n"
+-		MOVNTQ" %%mm5, 40(%1)\n"
+-		MOVNTQ" %%mm6, 48(%1)\n"
+-		MOVNTQ" %%mm7, 56(%1)\n"
+-		:: "r" (from), "r" (to) : "memory");
+-		from = (const void *) (((const unsigned char *)from)+64);
+-		to = (void *) (((unsigned char *)to)+64);
+-	}
+-
+-#endif /* Have SSE */
+-#ifdef HAVE_MMX2
+-                /* since movntq is weakly-ordered, a "sfence"
+-		 * is needed to become ordered again. */
+-		__asm__ __volatile__ ("sfence":::"memory");
+-#endif
+-#ifndef HAVE_SSE
+-		/* enables to use FPU */
+-		__asm__ __volatile__ (EMMS:::"memory");
+-#endif
+-	}
+-	/*
+-	 *	Now do the tail of the block
+-	 */
+-	if(len) small_memcpy(to, from, len);
+-	return retval;
+-}
+-
+-
+-#endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */
+-
+-#endif
+diff --git a/xbmc/utils/fastmemcpy.h b/xbmc/utils/fastmemcpy.h
+deleted file mode 100644
+index 43f5904..0000000
+--- a/xbmc/utils/fastmemcpy.h
++++ /dev/null
+@@ -1,35 +0,0 @@
+-/*
+- *      Copyright (C) 2005-2013 Team XBMC
+- *      http://xbmc.org
+- *
+- *  This Program is free software; you can redistribute it and/or modify
+- *  it under the terms of the GNU General Public License as published by
+- *  the Free Software Foundation; either version 2, or (at your option)
+- *  any later version.
+- *
+- *  This Program is distributed in the hope that it will be useful,
+- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+- *  GNU General Public License for more details.
+- *
+- *  You should have received a copy of the GNU General Public License
+- *  along with XBMC; see the file COPYING.  If not, see
+- *  <http://www.gnu.org/licenses/>.
+- *
+- */
+-#pragma once
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-
+-#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS)
+-void * fast_memcpy(void * to, const void * from, size_t len);
+-//#define fast_memcpy memcpy
+-#else
+-#define fast_memcpy memcpy
+-#endif
+-
+-#ifdef __cplusplus
+-}
+-#endif
+diff --git a/xbmc/utils/test/Makefile b/xbmc/utils/test/Makefile
+index 8fa0526..3a467ad 100644
+--- a/xbmc/utils/test/Makefile
++++ b/xbmc/utils/test/Makefile
+@@ -11,7 +11,6 @@ SRCS=	\
+ 	TestCryptThreading.cpp \
+ 	TestDatabaseUtils.cpp \
+ 	TestEndianSwap.cpp \
+-	Testfastmemcpy.cpp \
+ 	TestFileOperationJob.cpp \
+ 	TestFileUtils.cpp \
+ 	Testfstrcmp.cpp \
+diff --git a/xbmc/utils/test/Testfastmemcpy.cpp b/xbmc/utils/test/Testfastmemcpy.cpp
+deleted file mode 100644
+index 93a9bb0..0000000
+--- a/xbmc/utils/test/Testfastmemcpy.cpp
++++ /dev/null
+@@ -1,39 +0,0 @@
+-/*
+- *      Copyright (C) 2005-2013 Team XBMC
+- *      http://xbmc.org
+- *
+- *  This Program is free software; you can redistribute it and/or modify
+- *  it under the terms of the GNU General Public License as published by
+- *  the Free Software Foundation; either version 2, or (at your option)
+- *  any later version.
+- *
+- *  This Program is distributed in the hope that it will be useful,
+- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+- *  GNU General Public License for more details.
+- *
+- *  You should have received a copy of the GNU General Public License
+- *  along with XBMC; see the file COPYING.  If not, see
+- *  <http://www.gnu.org/licenses/>.
+- *
+- */
+-
+-#include <stddef.h> // TODO: This should go in fastmemcpy.h instead.
+-#include "utils/fastmemcpy.h"
+-
+-#include "gtest/gtest.h"
+-
+-static const char refdata[] = "\x01\x02\x03\x04\x05\x06\x07\x08"
+-                              "\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10"
+-                              "\x11\x12\x13\x14\x15\x16\x17\x18"
+-                              "\x19\x1a\x1b\x1c\x1d\x1e\x1f\x20"
+-                              "\x21\x22\x23\x24\x25\x26\x27\x28"
+-                              "\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30";
+-
+-TEST(Testfastmemcpy, General)
+-{
+-  char vardata[sizeof(refdata)];
+-  memset(vardata, 0, sizeof(vardata));
+-  EXPECT_NE(nullptr, fast_memcpy(vardata, refdata, sizeof(refdata)));
+-  EXPECT_EQ(0, memcmp(refdata, vardata, sizeof(refdata)));
+-}
diff --git a/projects/RPi/patches/kodi/kodi-001-isengard-rpb-backports.patch b/projects/RPi/patches/kodi/kodi-001-isengard-rpb-backports.patch
index 961fcff721..c60b9d9c25 100644
--- a/projects/RPi/patches/kodi/kodi-001-isengard-rpb-backports.patch
+++ b/projects/RPi/patches/kodi/kodi-001-isengard-rpb-backports.patch
@@ -3110,29 +3110,6 @@ index f9b9232..33aa88c 100644
          if (pts == DVD_NOPTS_VALUE)
            pts = dts;
 
-From bd332e5190d098ab8d22309eec31c0a3a8a5dfa9 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 12 Jun 2015 17:27:47 +0100
-Subject: [PATCH 48/51] [rbp] Disable fast_memcpy which is slower than memcpy
-
----
- xbmc/utils/fastmemcpy.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/utils/fastmemcpy.h b/xbmc/utils/fastmemcpy.h
-index 43f5904..6d872b1 100644
---- a/xbmc/utils/fastmemcpy.h
-+++ b/xbmc/utils/fastmemcpy.h
-@@ -23,7 +23,7 @@
- extern "C" {
- #endif
- 
--#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS)
-+#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) && !defined(TARGET_RASPBERRY_PI)
- void * fast_memcpy(void * to, const void * from, size_t len);
- //#define fast_memcpy memcpy
- #else
-
 From 493d0d8dfac375bedb0e80c08213bb45a714a4bb Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 10 Jun 2015 20:42:03 +0100
diff --git a/projects/RPi2/patches/kodi/kodi-001-isengard-rpb-backports.patch b/projects/RPi2/patches/kodi/kodi-001-isengard-rpb-backports.patch
index 961fcff721..c60b9d9c25 100644
--- a/projects/RPi2/patches/kodi/kodi-001-isengard-rpb-backports.patch
+++ b/projects/RPi2/patches/kodi/kodi-001-isengard-rpb-backports.patch
@@ -3110,29 +3110,6 @@ index f9b9232..33aa88c 100644
          if (pts == DVD_NOPTS_VALUE)
            pts = dts;
 
-From bd332e5190d098ab8d22309eec31c0a3a8a5dfa9 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 12 Jun 2015 17:27:47 +0100
-Subject: [PATCH 48/51] [rbp] Disable fast_memcpy which is slower than memcpy
-
----
- xbmc/utils/fastmemcpy.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/xbmc/utils/fastmemcpy.h b/xbmc/utils/fastmemcpy.h
-index 43f5904..6d872b1 100644
---- a/xbmc/utils/fastmemcpy.h
-+++ b/xbmc/utils/fastmemcpy.h
-@@ -23,7 +23,7 @@
- extern "C" {
- #endif
- 
--#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS)
-+#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) && !defined(TARGET_RASPBERRY_PI)
- void * fast_memcpy(void * to, const void * from, size_t len);
- //#define fast_memcpy memcpy
- #else
-
 From 493d0d8dfac375bedb0e80c08213bb45a714a4bb Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Wed, 10 Jun 2015 20:42:03 +0100