diff --git a/packages/mediacenter/kodi-theme-Confluence/package.mk b/packages/mediacenter/kodi-theme-Confluence/package.mk index dc13f3adda..86f90fdc6e 100644 --- a/packages/mediacenter/kodi-theme-Confluence/package.mk +++ b/packages/mediacenter/kodi-theme-Confluence/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="kodi-theme-Confluence" -PKG_VERSION="15.0-rc1-a248db2" +PKG_VERSION="15.0-rc1-8f081c2" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index 890874a28a..0d72df1834 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="kodi" -PKG_VERSION="15.0-rc1-a248db2" +PKG_VERSION="15.0-rc1-8f081c2" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch b/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch deleted file mode 100644 index 0cfe4bbd31..0000000000 --- a/packages/mediacenter/kodi/patches/kodi-999.22-PR7280.patch +++ /dev/null @@ -1,1364 +0,0 @@ -From 224c1919ad3f68e23e817f41036687343f34aaae Mon Sep 17 00:00:00 2001 -From: popcornmix -Date: Fri, 12 Jun 2015 17:27:47 +0100 -Subject: [PATCH] [utils] Disable fast_memcpy which is slower than memcpy - -The default glibc memcpy is likely to be better tuned than this code -which hasn't been touched for four years. - -In a test with software video decode on Pi2 the skipped frames went -from 189 to 172 when fast_memcpy was disabled. ---- - Kodi.xcodeproj/project.pbxproj | 6 - - project/VS2010Express/XBMC.vcxproj | 4 - - project/VS2010Express/XBMC.vcxproj.filters | 3 - - xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp | 1 - - xbmc/cores/VideoRenderers/RenderCapture.cpp | 7 +- - xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp | 33 +- - .../Video/libstagefrightICS/StageFrightVideo.cpp | 3 +- - xbmc/utils/Makefile.in | 2 - - xbmc/utils/fastmemcpy-arm.S | 528 --------------------- - xbmc/utils/fastmemcpy.c | 396 ---------------- - xbmc/utils/fastmemcpy.h | 35 -- - xbmc/utils/test/Makefile | 1 - - xbmc/utils/test/Testfastmemcpy.cpp | 39 -- - 13 files changed, 20 insertions(+), 1038 deletions(-) - delete mode 100644 xbmc/utils/fastmemcpy-arm.S - delete mode 100644 xbmc/utils/fastmemcpy.c - delete mode 100644 xbmc/utils/fastmemcpy.h - delete mode 100644 xbmc/utils/test/Testfastmemcpy.cpp - -diff --git a/Kodi.xcodeproj/project.pbxproj b/Kodi.xcodeproj/project.pbxproj -index 395c4ea..ce5a7f7 100644 ---- a/Kodi.xcodeproj/project.pbxproj -+++ b/Kodi.xcodeproj/project.pbxproj -@@ -3192,7 +3192,6 @@ - F5E55B5D10741272006E788A /* DVDPlayerTeletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B5B10741272006E788A /* DVDPlayerTeletext.cpp */; }; - F5E55B66107412DE006E788A /* GUIDialogTeletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B65107412DE006E788A /* GUIDialogTeletext.cpp */; }; - F5E55B7010741340006E788A /* Teletext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E55B6E10741340006E788A /* Teletext.cpp */; }; -- F5E5697310803FC3006E788A /* fastmemcpy.c in Sources */ = {isa = PBXBuildFile; fileRef = F5E5697210803FC3006E788A /* fastmemcpy.c */; }; - F5E56BA61082A675006E788A /* PosixMountProvider.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5E56BA51082A675006E788A /* PosixMountProvider.cpp */; }; - F5EA02260F6DA990005C2EC5 /* CocoaPowerSyscall.cpp in 
Sources */ = {isa = PBXBuildFile; fileRef = F5EA02200F6DA85C005C2EC5 /* CocoaPowerSyscall.cpp */; };
- F5EA02270F6DA9A5005C2EC5 /* PowerManager.cpp in Sources */ = {isa = PBXBuildFile; fileRef = F5EA021A0F6DA7E8005C2EC5 /* PowerManager.cpp */; };
-@@ -3632,7 +3631,6 @@
- 43348AAB1077486D00F859CF /* PlayerSelectionRule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = PlayerSelectionRule.h; path = playercorefactory/PlayerSelectionRule.h; sourceTree = "<group>"; };
- 436721A612D66A09002508E6 /* IAnnouncer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = IAnnouncer.h; sourceTree = "<group>"; };
- 436B38F3106628850049AB3B /* EndianSwap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = EndianSwap.h; sourceTree = "<group>"; };
-- 43BF09DD1080D39300E25290 /* fastmemcpy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fastmemcpy.h; sourceTree = "<group>"; };
- 43FAC87112D6349400F67914 /* IStorageProvider.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = IStorageProvider.h; sourceTree = "<group>"; };
- 551C3A43175A12010051AAAD /* VDA.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = VDA.cpp; sourceTree = "<group>"; };
- 551C3A44175A12010051AAAD /* VDA.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = VDA.h; sourceTree = "<group>"; };
-@@ -5735,7 +5733,6 @@
- F5E55B6D10741340006E788A /* Teletext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Teletext.h; sourceTree = "<group>"; };
- F5E55B6E10741340006E788A /* Teletext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Teletext.cpp; sourceTree = "<group>"; };
- F5E55B6F10741340006E788A /* TeletextDefines.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = TeletextDefines.h; sourceTree = "<group>"; };
-- F5E5697210803FC3006E788A /* fastmemcpy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = fastmemcpy.c; sourceTree = "<group>"; };
- F5E56BA41082A675006E788A /* PosixMountProvider.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PosixMountProvider.h; sourceTree = "<group>"; };
- F5E56BA51082A675006E788A /* PosixMountProvider.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PosixMountProvider.cpp; sourceTree = "<group>"; };
- F5EA021A0F6DA7E8005C2EC5 /* PowerManager.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PowerManager.cpp; sourceTree = "<group>"; };
-@@ -9202,8 +9199,6 @@
- DF529BAD1741697B00523FB4 /* Environment.h */,
- E36C29E90DA72486001F0C9D /* Fanart.cpp */,
- 6E97BDC30DA2B620003A2A89 /* Fanart.h */,
-- F5E5697210803FC3006E788A /* fastmemcpy.c */,
-- 43BF09DD1080D39300E25290 /* fastmemcpy.h */,
- F5F244641110DC6B009126C6 /* FileOperationJob.cpp */,
- F5F244631110DC6B009126C6 /* FileOperationJob.h */,
- F5F245EC1112C9AB009126C6 /* FileUtils.cpp */,
-@@ -10519,7 +10514,6 @@
- 43348AAE1077486D00F859CF /* PlayerCoreFactory.cpp in Sources */,
- 43348AAF1077486D00F859CF /* PlayerSelectionRule.cpp in Sources */,
- 7CAA20511079C8160096DE39 /* BaseRenderer.cpp in Sources */,
-- F5E5697310803FC3006E788A /* fastmemcpy.c in Sources */,
- 55D3604E1826CAB900DA66D2 /* OverlayRendererGUI.cpp in Sources */,
- F5E56BA61082A675006E788A
/* PosixMountProvider.cpp in Sources */, - 7CAA25351085963B0096DE39 /* PasswordManager.cpp in Sources */, -diff --git a/project/VS2010Express/XBMC.vcxproj b/project/VS2010Express/XBMC.vcxproj -index 2d37c57..e8e8dce 100644 ---- a/project/VS2010Express/XBMC.vcxproj -+++ b/project/VS2010Express/XBMC.vcxproj -@@ -1439,10 +1439,6 @@ - true - true - -- -- true -- true -- - - - -diff --git a/project/VS2010Express/XBMC.vcxproj.filters b/project/VS2010Express/XBMC.vcxproj.filters -index c858f32..cada31e 100644 ---- a/project/VS2010Express/XBMC.vcxproj.filters -+++ b/project/VS2010Express/XBMC.vcxproj.filters -@@ -2371,9 +2371,6 @@ - - utils\test - -- -- utils\test -- - - utils\test - -diff --git a/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp b/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp -index 2b64121..fdad7f0 100644 ---- a/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp -+++ b/xbmc/cores/VideoRenderers/LinuxRendererGLES.cpp -@@ -31,7 +31,6 @@ - #include - #include "guilib/MatrixGLES.h" - #include "LinuxRendererGLES.h" --#include "utils/fastmemcpy.h" - #include "utils/MathUtils.h" - #include "utils/GLUtils.h" - #include "utils/log.h" -diff --git a/xbmc/cores/VideoRenderers/RenderCapture.cpp b/xbmc/cores/VideoRenderers/RenderCapture.cpp -index 603b68d..0456a27 100644 ---- a/xbmc/cores/VideoRenderers/RenderCapture.cpp -+++ b/xbmc/cores/VideoRenderers/RenderCapture.cpp -@@ -21,7 +21,6 @@ - #include "RenderCapture.h" - #include "utils/log.h" - #include "windowing/WindowingFactory.h" --#include "utils/fastmemcpy.h" - #include "settings/AdvancedSettings.h" - - CRenderCaptureBase::CRenderCaptureBase() -@@ -297,7 +296,7 @@ void CRenderCaptureGL::PboToBuffer() - - if (pboPtr) - { -- fast_memcpy(m_pixels, pboPtr, m_bufferSize); -+ memcpy(m_pixels, pboPtr, m_bufferSize); - SetState(CAPTURESTATE_DONE); - } - else -@@ -491,12 +490,12 @@ void CRenderCaptureDX::SurfaceToBuffer() - //if pitch is same, do a direct copy, otherwise copy one line at a time - if (lockedRect.Pitch == m_width * 4) - { -- fast_memcpy(m_pixels, lockedRect.pBits, m_width * m_height * 4); -+ memcpy(m_pixels, lockedRect.pBits, m_width * m_height * 4); - } - else - { - for (unsigned int y = 0; y < m_height; y++) -- fast_memcpy(m_pixels + y * m_width * 4, (uint8_t*)lockedRect.pBits + y * lockedRect.Pitch, m_width * 4); -+ memcpy(m_pixels + y * m_width * 4, (uint8_t*)lockedRect.pBits + y * lockedRect.Pitch, m_width * 4); - } - m_copySurface->UnlockRect(); - SetState(CAPTURESTATE_DONE); -diff --git a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp -index 56e68713..5f0e486 100644 ---- a/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp -+++ b/xbmc/cores/dvdplayer/DVDCodecs/DVDCodecUtils.cpp -@@ -22,7 +22,6 @@ - #include "DVDClock.h" - #include "cores/VideoRenderers/RenderManager.h" - #include "utils/log.h" --#include "utils/fastmemcpy.h" - #include "cores/FFmpeg.h" - #include "Util.h" - #ifdef HAS_DX -@@ -95,7 +94,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc) - - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w); -+ memcpy(d, s, w); - s += pSrc->iLineSize[0]; - d += pDst->iLineSize[0]; - } -@@ -107,7 +106,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc) - d = pDst->data[1]; - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w); -+ memcpy(d, s, w); - s += pSrc->iLineSize[1]; - d += pDst->iLineSize[1]; - } -@@ -116,7 +115,7 @@ bool CDVDCodecUtils::CopyPicture(DVDVideoPicture* pDst, DVDVideoPicture* pSrc) - d = 
pDst->data[2]; - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w); -+ memcpy(d, s, w); - s += pSrc->iLineSize[2]; - d += pDst->iLineSize[2]; - } -@@ -131,13 +130,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc) - int h = pImage->height; - if ((w == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0])) - { -- fast_memcpy(d, s, w*h); -+ memcpy(d, s, w*h); - } - else - { - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w); -+ memcpy(d, s, w); - s += pSrc->iLineSize[0]; - d += pImage->stride[0]; - } -@@ -148,13 +147,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc) - h =(pImage->height >> pImage->cshift_y); - if ((w==pSrc->iLineSize[1]) && ((unsigned int) pSrc->iLineSize[1]==pImage->stride[1])) - { -- fast_memcpy(d, s, w*h); -+ memcpy(d, s, w*h); - } - else - { - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w); -+ memcpy(d, s, w); - s += pSrc->iLineSize[1]; - d += pImage->stride[1]; - } -@@ -163,13 +162,13 @@ bool CDVDCodecUtils::CopyPicture(YV12Image* pImage, DVDVideoPicture *pSrc) - d = pImage->plane[2]; - if ((w==pSrc->iLineSize[2]) && ((unsigned int) pSrc->iLineSize[2]==pImage->stride[2])) - { -- fast_memcpy(d, s, w*h); -+ memcpy(d, s, w*h); - } - else - { - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w); -+ memcpy(d, s, w); - s += pSrc->iLineSize[2]; - d += pImage->stride[2]; - } -@@ -207,7 +206,7 @@ DVDVideoPicture* CDVDCodecUtils::ConvertToNV12Picture(DVDVideoPicture *pSrc) - uint8_t *d = pPicture->data[0]; - for (int y = 0; y < (int)pSrc->iHeight; y++) - { -- fast_memcpy(d, s, pSrc->iWidth); -+ memcpy(d, s, pSrc->iWidth); - s += pSrc->iLineSize[0]; - d += pPicture->iLineSize[0]; - } -@@ -298,13 +297,13 @@ bool CDVDCodecUtils::CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc) - // Copy Y - if ((w == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0])) - { -- fast_memcpy(d, s, w*h); -+ memcpy(d, s, w*h); - } - else - { - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w); -+ memcpy(d, s, w); - s += pSrc->iLineSize[0]; - d += pImage->stride[0]; - } -@@ -317,13 +316,13 @@ bool CDVDCodecUtils::CopyNV12Picture(YV12Image* pImage, DVDVideoPicture *pSrc) - // Copy packed UV (width is same as for Y as it's both U and V components) - if ((w==pSrc->iLineSize[1]) && ((unsigned int) pSrc->iLineSize[1]==pImage->stride[1])) - { -- fast_memcpy(d, s, w*h); -+ memcpy(d, s, w*h); - } - else - { - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w); -+ memcpy(d, s, w); - s += pSrc->iLineSize[1]; - d += pImage->stride[1]; - } -@@ -342,13 +341,13 @@ bool CDVDCodecUtils::CopyYUV422PackedPicture(YV12Image* pImage, DVDVideoPicture - // Copy YUYV - if ((w * 2 == pSrc->iLineSize[0]) && ((unsigned int) pSrc->iLineSize[0] == pImage->stride[0])) - { -- fast_memcpy(d, s, w*h*2); -+ memcpy(d, s, w*h*2); - } - else - { - for (int y = 0; y < h; y++) - { -- fast_memcpy(d, s, w*2); -+ memcpy(d, s, w*2); - s += pSrc->iLineSize[0]; - d += pImage->stride[0]; - } -diff --git a/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp b/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp -index 019bc7a..d5ca74f 100644 ---- a/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp -+++ b/xbmc/cores/dvdplayer/DVDCodecs/Video/libstagefrightICS/StageFrightVideo.cpp -@@ -30,7 +30,6 @@ - #include "guilib/GraphicContext.h" - #include "DVDClock.h" - #include "utils/log.h" --#include "utils/fastmemcpy.h" - 
#include "threads/Thread.h" - #include "threads/Event.h" - #include "Application.h" -@@ -620,7 +619,7 @@ int CStageFrightVideo::Decode(uint8_t *pData, int iSize, double dts, double pts - return VC_ERROR; - } - -- fast_memcpy(frame->medbuf->data(), demuxer_content, demuxer_bytes); -+ memcpy(frame->medbuf->data(), demuxer_content, demuxer_bytes); - frame->medbuf->set_range(0, demuxer_bytes); - frame->medbuf->meta_data()->clear(); - frame->medbuf->meta_data()->setInt64(kKeyTime, frame->pts); -diff --git a/xbmc/utils/Makefile.in b/xbmc/utils/Makefile.in -index 438f025..dbd3db9 100644 ---- a/xbmc/utils/Makefile.in -+++ b/xbmc/utils/Makefile.in -@@ -17,8 +17,6 @@ SRCS += DatabaseUtils.cpp - SRCS += EndianSwap.cpp - SRCS += Environment.cpp - SRCS += Fanart.cpp --SRCS += fastmemcpy.c --SRCS += fastmemcpy-arm.S - SRCS += FileOperationJob.cpp - SRCS += FileUtils.cpp - SRCS += fstrcmp.c -diff --git a/xbmc/utils/fastmemcpy-arm.S b/xbmc/utils/fastmemcpy-arm.S -deleted file mode 100644 -index 6cb8b0c..0000000 ---- a/xbmc/utils/fastmemcpy-arm.S -+++ /dev/null -@@ -1,528 +0,0 @@ --/* -- * Copyright (C) 2008 The Android Open Source Project -- * All rights reserved. -- * -- * Copyright (C) 2011-2013 Team XBMC -- * http://xbmc.org -- * -- * This Program is free software; you can redistribute it and/or modify -- * it under the terms of the GNU General Public License as published by -- * the Free Software Foundation; either version 2, or (at your option) -- * any later version. -- * -- * This Program is distributed in the hope that it will be useful, -- * but WITHOUT ANY WARRANTY; without even the implied warranty of -- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- * GNU General Public License for more details. -- * -- * You should have received a copy of the GNU General Public License -- * along with XBMC; see the file COPYING. If not, see -- * . -- * -- */ --#if defined(__arm__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS) --#if defined(__ARM_NEON__) -- -- .text --#ifndef __APPLE__ -- .fpu neon -- .global fast_memcpy -- .type fast_memcpy, %function --#else -- .globl _fast_memcpy --#endif -- .align 4 -- --/* a prefetch distance of 4 cache-lines works best experimentally */ --#define CACHE_LINE_SIZE 64 --#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4) -- --#ifndef __APPLE__ -- .fnstart -- .save {r0, lr} --fast_memcpy: --#else --_fast_memcpy: --#endif -- stmfd sp!, {r0, lr} -- -- /* start preloading as early as possible */ -- pld [r1, #(CACHE_LINE_SIZE*0)] -- pld [r1, #(CACHE_LINE_SIZE*1)] -- -- /* do we have at least 16-bytes to copy (needed for alignment below) */ -- cmp r2, #16 -- blo 5f -- -- /* align destination to half cache-line for the write-buffer */ -- rsb r3, r0, #0 -- ands r3, r3, #0xF -- beq 0f -- -- /* copy up to 15-bytes (count in r3) */ -- sub r2, r2, r3 -- movs ip, r3, lsl #31 -- ldrmib lr, [r1], #1 -- strmib lr, [r0], #1 -- ldrcsb ip, [r1], #1 -- ldrcsb lr, [r1], #1 -- strcsb ip, [r0], #1 -- strcsb lr, [r0], #1 -- movs ip, r3, lsl #29 -- bge 1f -- // copies 4 bytes, destination 32-bits aligned -- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! -- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]! --1: bcc 2f -- // copies 8 bytes, destination 64-bits aligned -- vld1.8 {d0}, [r1]! -- vst1.8 {d0}, [r0, :64]! 
--2: -- --0: /* preload immediately the next cache line, which we may need */ -- pld [r1, #(CACHE_LINE_SIZE*0)] -- pld [r1, #(CACHE_LINE_SIZE*1)] -- -- /* make sure we have at least 64 bytes to copy */ -- subs r2, r2, #64 -- blo 2f -- -- /* preload all the cache lines we need. -- * NOTE: the number of pld below depends on PREFETCH_DISTANCE, -- * ideally would would increase the distance in the main loop to -- * avoid the goofy code below. In practice this doesn't seem to make -- * a big difference. -- */ -- pld [r1, #(CACHE_LINE_SIZE*2)] -- pld [r1, #(CACHE_LINE_SIZE*3)] -- pld [r1, #(PREFETCH_DISTANCE)] -- --1: /* The main loop copies 64 bytes at a time */ -- vld1.8 {d0 - d3}, [r1]! -- vld1.8 {d4 - d7}, [r1]! -- pld [r1, #(PREFETCH_DISTANCE)] -- subs r2, r2, #64 -- vst1.8 {d0 - d3}, [r0, :128]! -- vst1.8 {d4 - d7}, [r0, :128]! -- bhs 1b -- --2: /* fix-up the remaining count and make sure we have >= 32 bytes left */ -- add r2, r2, #64 -- subs r2, r2, #32 -- blo 4f -- --3: /* 32 bytes at a time. These cache lines were already preloaded */ -- vld1.8 {d0 - d3}, [r1]! -- subs r2, r2, #32 -- vst1.8 {d0 - d3}, [r0, :128]! -- bhs 3b -- --4: /* less than 32 left */ -- add r2, r2, #32 -- tst r2, #0x10 -- beq 5f -- // copies 16 bytes, 128-bits aligned -- vld1.8 {d0, d1}, [r1]! -- vst1.8 {d0, d1}, [r0, :128]! -- --5: /* copy up to 15-bytes (count in r2) */ -- movs ip, r2, lsl #29 -- bcc 1f -- vld1.8 {d0}, [r1]! -- vst1.8 {d0}, [r0]! --1: bge 2f -- vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! -- vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! --2: movs ip, r2, lsl #31 -- ldrmib r3, [r1], #1 -- ldrcsb ip, [r1], #1 -- ldrcsb lr, [r1], #1 -- strmib r3, [r0], #1 -- strcsb ip, [r0], #1 -- strcsb lr, [r0], #1 -- -- ldmfd sp!, {r0, lr} -- bx lr --#ifndef __APPLE__ -- .fnend --#endif -- --#else /* __ARM_ARCH__ < 7 */ -- -- -- .text -- --#ifndef __APPLE__ -- .global fast_memcpy -- .type fast_memcpy, %function --#else -- .globl _fast_memcpy --#endif -- .align 4 -- -- /* -- * Optimized memcpy() for ARM. -- * -- * note that memcpy() always returns the destination pointer, -- * so we have to preserve R0. -- */ -- --#ifndef __APPLE__ --fast_memcpy: --#else --_fast_memcpy: --#endif -- /* The stack must always be 64-bits aligned to be compliant with the -- * ARM ABI. Since we have to save R0, we might as well save R4 -- * which we can use for better pipelining of the reads below -- */ --#ifndef __APPLE__ -- .fnstart -- .save {r0, r4, lr} --#endif -- stmfd sp!, {r0, r4, lr} -- /* Making room for r5-r11 which will be spilled later */ -- .pad #28 -- sub sp, sp, #28 -- -- // preload the destination because we'll align it to a cache line -- // with small writes. Also start the source "pump". -- //PLD (r0, #0) -- //PLD (r1, #0) -- //PLD (r1, #32) -- -- /* it simplifies things to take care of len<4 early */ -- cmp r2, #4 -- blo copy_last_3_and_return -- -- /* compute the offset to align the source -- * offset = (4-(src&3))&3 = -src & 3 -- */ -- rsb r3, r1, #0 -- ands r3, r3, #3 -- beq src_aligned -- -- /* align source to 32 bits. We need to insert 2 instructions between -- * a ldr[b|h] and str[b|h] because byte and half-word instructions -- * stall 2 cycles. 
-- */ -- movs r12, r3, lsl #31 -- sub r2, r2, r3 /* we know that r3 <= r2 because r2 >= 4 */ -- ldrmib r3, [r1], #1 -- ldrcsb r4, [r1], #1 -- ldrcsb r12,[r1], #1 -- strmib r3, [r0], #1 -- strcsb r4, [r0], #1 -- strcsb r12,[r0], #1 -- --src_aligned: -- -- /* see if src and dst are aligned together (congruent) */ -- eor r12, r0, r1 -- tst r12, #3 -- bne non_congruent -- -- /* Use post-incriment mode for stm to spill r5-r11 to reserved stack -- * frame. Don't update sp. -- */ -- stmea sp, {r5-r11} -- -- /* align the destination to a cache-line */ -- rsb r3, r0, #0 -- ands r3, r3, #0x1C -- beq congruent_aligned32 -- cmp r3, r2 -- andhi r3, r2, #0x1C -- -- /* conditionnaly copies 0 to 7 words (length in r3) */ -- movs r12, r3, lsl #28 -- ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */ -- ldmmiia r1!, {r8, r9} /* 8 bytes */ -- stmcsia r0!, {r4, r5, r6, r7} -- stmmiia r0!, {r8, r9} -- tst r3, #0x4 -- ldrne r10,[r1], #4 /* 4 bytes */ -- strne r10,[r0], #4 -- sub r2, r2, r3 -- --congruent_aligned32: -- /* -- * here source is aligned to 32 bytes. -- */ -- --cached_aligned32: -- subs r2, r2, #32 -- blo less_than_32_left -- -- /* -- * We preload a cache-line up to 64 bytes ahead. On the 926, this will -- * stall only until the requested world is fetched, but the linefill -- * continues in the the background. -- * While the linefill is going, we write our previous cache-line -- * into the write-buffer (which should have some free space). -- * When the linefill is done, the writebuffer will -- * start dumping its content into memory -- * -- * While all this is going, we then load a full cache line into -- * 8 registers, this cache line should be in the cache by now -- * (or partly in the cache). -- * -- * This code should work well regardless of the source/dest alignment. -- * -- */ -- -- // Align the preload register to a cache-line because the cpu does -- // "critical word first" (the first word requested is loaded first). -- bic r12, r1, #0x1F -- add r12, r12, #64 -- --1: ldmia r1!, { r4-r11 } -- //PLD (r12, #64) -- subs r2, r2, #32 -- -- // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi -- // for ARM9 preload will not be safely guarded by the preceding subs. -- // When it is safely guarded the only possibility to have SIGSEGV here -- // is because the caller overstates the length. -- ldrhi r3, [r12], #32 /* cheap ARM9 preload */ -- stmia r0!, { r4-r11 } -- bhs 1b -- -- add r2, r2, #32 -- -- -- -- --less_than_32_left: -- /* -- * less than 32 bytes left at this point (length in r2) -- */ -- -- /* skip all this if there is nothing to do, which should -- * be a common case (if not executed the code below takes -- * about 16 cycles) -- */ -- tst r2, #0x1F -- beq 1f -- -- /* conditionnaly copies 0 to 31 bytes */ -- movs r12, r2, lsl #28 -- ldmcsia r1!, {r4, r5, r6, r7} /* 16 bytes */ -- ldmmiia r1!, {r8, r9} /* 8 bytes */ -- stmcsia r0!, {r4, r5, r6, r7} -- stmmiia r0!, {r8, r9} -- movs r12, r2, lsl #30 -- ldrcs r3, [r1], #4 /* 4 bytes */ -- ldrmih r4, [r1], #2 /* 2 bytes */ -- strcs r3, [r0], #4 -- strmih r4, [r0], #2 -- tst r2, #0x1 -- ldrneb r3, [r1] /* last byte */ -- strneb r3, [r0] -- -- /* we're done! restore everything and return */ --1: ldmfd sp!, {r5-r11} -- ldmfd sp!, {r0, r4, lr} -- bx lr -- -- /********************************************************************/ -- --non_congruent: -- /* -- * here source is aligned to 4 bytes -- * but destination is not. 
-- * -- * in the code below r2 is the number of bytes read -- * (the number of bytes written is always smaller, because we have -- * partial words in the shift queue) -- */ -- cmp r2, #4 -- blo copy_last_3_and_return -- -- /* Use post-incriment mode for stm to spill r5-r11 to reserved stack -- * frame. Don't update sp. -- */ -- stmea sp, {r5-r11} -- -- /* compute shifts needed to align src to dest */ -- rsb r5, r0, #0 -- and r5, r5, #3 /* r5 = # bytes in partial words */ -- mov r12, r5, lsl #3 /* r12 = right */ -- rsb lr, r12, #32 /* lr = left */ -- -- /* read the first word */ -- ldr r3, [r1], #4 -- sub r2, r2, #4 -- -- /* write a partial word (0 to 3 bytes), such that destination -- * becomes aligned to 32 bits (r5 = nb of words to copy for alignment) -- */ -- movs r5, r5, lsl #31 -- strmib r3, [r0], #1 -- movmi r3, r3, lsr #8 -- strcsb r3, [r0], #1 -- movcs r3, r3, lsr #8 -- strcsb r3, [r0], #1 -- movcs r3, r3, lsr #8 -- -- cmp r2, #4 -- blo partial_word_tail -- -- /* Align destination to 32 bytes (cache line boundary) */ --1: tst r0, #0x1c -- beq 2f -- ldr r5, [r1], #4 -- sub r2, r2, #4 -- orr r4, r3, r5, lsl lr -- mov r3, r5, lsr r12 -- str r4, [r0], #4 -- cmp r2, #4 -- bhs 1b -- blo partial_word_tail -- -- /* copy 32 bytes at a time */ --2: subs r2, r2, #32 -- blo less_than_thirtytwo -- -- /* Use immediate mode for the shifts, because there is an extra cycle -- * for register shifts, which could account for up to 50% of -- * performance hit. -- */ -- -- cmp r12, #24 -- beq loop24 -- cmp r12, #8 -- beq loop8 -- --loop16: -- ldr r12, [r1], #4 --1: mov r4, r12 -- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} -- //PLD (r1, #64) -- subs r2, r2, #32 -- ldrhs r12, [r1], #4 -- orr r3, r3, r4, lsl #16 -- mov r4, r4, lsr #16 -- orr r4, r4, r5, lsl #16 -- mov r5, r5, lsr #16 -- orr r5, r5, r6, lsl #16 -- mov r6, r6, lsr #16 -- orr r6, r6, r7, lsl #16 -- mov r7, r7, lsr #16 -- orr r7, r7, r8, lsl #16 -- mov r8, r8, lsr #16 -- orr r8, r8, r9, lsl #16 -- mov r9, r9, lsr #16 -- orr r9, r9, r10, lsl #16 -- mov r10, r10, lsr #16 -- orr r10, r10, r11, lsl #16 -- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} -- mov r3, r11, lsr #16 -- bhs 1b -- b less_than_thirtytwo -- --loop8: -- ldr r12, [r1], #4 --1: mov r4, r12 -- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} -- //PLD (r1, #64) -- subs r2, r2, #32 -- ldrhs r12, [r1], #4 -- orr r3, r3, r4, lsl #24 -- mov r4, r4, lsr #8 -- orr r4, r4, r5, lsl #24 -- mov r5, r5, lsr #8 -- orr r5, r5, r6, lsl #24 -- mov r6, r6, lsr #8 -- orr r6, r6, r7, lsl #24 -- mov r7, r7, lsr #8 -- orr r7, r7, r8, lsl #24 -- mov r8, r8, lsr #8 -- orr r8, r8, r9, lsl #24 -- mov r9, r9, lsr #8 -- orr r9, r9, r10, lsl #24 -- mov r10, r10, lsr #8 -- orr r10, r10, r11, lsl #24 -- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} -- mov r3, r11, lsr #8 -- bhs 1b -- b less_than_thirtytwo -- --loop24: -- ldr r12, [r1], #4 --1: mov r4, r12 -- ldmia r1!, { r5,r6,r7, r8,r9,r10,r11} -- //PLD (r1, #64) -- subs r2, r2, #32 -- ldrhs r12, [r1], #4 -- orr r3, r3, r4, lsl #8 -- mov r4, r4, lsr #24 -- orr r4, r4, r5, lsl #8 -- mov r5, r5, lsr #24 -- orr r5, r5, r6, lsl #8 -- mov r6, r6, lsr #24 -- orr r6, r6, r7, lsl #8 -- mov r7, r7, lsr #24 -- orr r7, r7, r8, lsl #8 -- mov r8, r8, lsr #24 -- orr r8, r8, r9, lsl #8 -- mov r9, r9, lsr #24 -- orr r9, r9, r10, lsl #8 -- mov r10, r10, lsr #24 -- orr r10, r10, r11, lsl #8 -- stmia r0!, {r3,r4,r5,r6, r7,r8,r9,r10} -- mov r3, r11, lsr #24 -- bhs 1b -- -- --less_than_thirtytwo: -- /* copy the last 0 to 31 bytes of the source */ -- rsb r12, lr, #32 /* we corrupted r12, recompute it */ -- add 
r2, r2, #32
-- cmp r2, #4
-- blo partial_word_tail
--
--1: ldr r5, [r1], #4
-- sub r2, r2, #4
-- orr r4, r3, r5, lsl lr
-- mov r3, r5, lsr r12
-- str r4, [r0], #4
-- cmp r2, #4
-- bhs 1b
--
--partial_word_tail:
-- /* we have a partial word in the input buffer */
-- movs r5, lr, lsl #(31-3)
-- strmib r3, [r0], #1
-- movmi r3, r3, lsr #8
-- strcsb r3, [r0], #1
-- movcs r3, r3, lsr #8
-- strcsb r3, [r0], #1
--
-- /* Refill spilled registers from the stack. Don't update sp. */
-- ldmfd sp, {r5-r11}
--
--copy_last_3_and_return:
-- movs r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
-- ldrmib r2, [r1], #1
-- ldrcsb r3, [r1], #1
-- ldrcsb r12,[r1]
-- strmib r2, [r0], #1
-- strcsb r3, [r0], #1
-- strcsb r12,[r0]
--
-- /* we're done! restore sp and spilled registers and return */
-- add sp, sp, #28
-- ldmfd sp!, {r0, r4, lr}
-- bx lr
--#ifndef __APPLE__
-- .fnend
--#endif
--
--#endif /* __ARM_ARCH__ < 7 */
--#endif
--
--#if defined(__linux__) && defined(__ELF__)
--/* we don't need an executable stack */
--.section .note.GNU-stack,"",%progbits
--#endif
-diff --git a/xbmc/utils/fastmemcpy.c b/xbmc/utils/fastmemcpy.c
-deleted file mode 100644
-index ec9019a..0000000
---- a/xbmc/utils/fastmemcpy.c
-+++ /dev/null
-@@ -1,396 +0,0 @@
--/*
-- * fastmemcpy.h : fast memcpy routines
-- *****************************************************************************
-- * $Id: fastmemcpy.h 13905 2006-01-12 23:10:04Z dionoea $
-- *
-- * Authors: various Linux kernel hackers
-- * various MPlayer hackers
-- * Nick Kurshev <nickols_k@mail.ru>
-- *
-- * Copyright (C) 2011-2013 Team XBMC
-- * http://xbmc.org
-- *
-- * This Program is free software; you can redistribute it and/or modify
-- * it under the terms of the GNU General Public License as published by
-- * the Free Software Foundation; either version 2, or (at your option)
-- * any later version.
-- *
-- * This Program is distributed in the hope that it will be useful,
-- * but WITHOUT ANY WARRANTY; without even the implied warranty of
-- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- * GNU General Public License for more details.
-- *
-- * You should have received a copy of the GNU General Public License
-- * along with XBMC; see the file COPYING. If not, see
-- * <http://www.gnu.org/licenses/>.
-- *
-- */
--#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__arm__) && !defined(__mips__)
--#define HAVE_MMX2
--#define HAVE_SSE
--
--/*
-- aclib - advanced C library ;)
-- This file contains functions which improve and expand standard C-library
--*/
--#include <stddef.h>
--
--#define BLOCK_SIZE 4096
--#define CONFUSION_FACTOR 0
--/*Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)*/
--
--/*#define STATISTICS*/
--
--#ifndef HAVE_SSE2
--/*
-- P3 processor has only one SSE decoder so can execute only 1 sse insn per
-- cpu clock, but it has 3 mmx decoders (include load/store unit)
-- and executes 3 mmx insns per cpu clock.
-- P4 processor has some chances, but after reading:
-- http://www.emulators.com/pentium4.htm
-- I have doubts. Anyway SSE2 version of this code can be written better.
--*/
--#undef HAVE_SSE
--#endif
--
--
--/*
-- This part of code was taken by me from Linux-2.4.3 and slightly modified
--for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
--blocks but mplayer uses weakly ordered data and original sources can not
--speedup them. Only using PREFETCHNTA and MOVNTQ together have effect!
-- -->From IA-32 Intel Architecture Software Developer's Manual Volume 1, -- --Order Number 245470: --"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions" -- --Data referenced by a program can be temporal (data will be used again) or --non-temporal (data will be referenced once and not reused in the immediate --future). To make efficient use of the processor's caches, it is generally --desirable to cache temporal data and not cache non-temporal data. Overloading --the processor's caches with non-temporal data is sometimes referred to as --"polluting the caches". --The non-temporal data is written to memory with Write-Combining semantics. -- --The PREFETCHh instructions permits a program to load data into the processor --at a suggested cache level, so that it is closer to the processors load and --store unit when it is needed. If the data is already present in a level of --the cache hierarchy that is closer to the processor, the PREFETCHh instruction --will not result in any data movement. --But we should you PREFETCHNTA: Non-temporal data fetch data into location --close to the processor, minimizing cache pollution. -- --The MOVNTQ (store quadword using non-temporal hint) instruction stores --packed integer data from an MMX register to memory, using a non-temporal hint. --The MOVNTPS (store packed single-precision floating-point values using --non-temporal hint) instruction stores packed floating-point data from an --XMM register to memory, using a non-temporal hint. -- --The SFENCE (Store Fence) instruction controls write ordering by creating a --fence for memory store operations. This instruction guarantees that the results --of every store instruction that precedes the store fence in program order is --globally visible before any store instruction that follows the fence. The --SFENCE instruction provides an efficient way of ensuring ordering between --procedures that produce weakly-ordered data and procedures that consume that --data. -- --If you have questions please contact with me: Nick Kurshev: nickols_k@mail.ru. --*/ -- --/* 3dnow memcpy support from kernel 2.4.2 */ --/* by Pontscho/fresh!mindworkz */ -- --#if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) -- --#undef HAVE_MMX1 --#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE) --/* means: mmx v.1. Note: Since we added alignment of destinition it speedups -- of memory copying on PentMMX, Celeron-1 and P2 upto 12% versus -- standard (non MMX-optimized) version. -- Note: on K6-2+ it speedups memory copying upto 25% and -- on K7 and P3 about 500% (5 times). */ --#define HAVE_MMX1 --#endif -- -- --#undef HAVE_K6_2PLUS --#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW) --#define HAVE_K6_2PLUS --#endif -- --/* for small memory blocks (<256 bytes) this version is faster */ --#define small_memcpy(to,from,n)\ --{\ --register unsigned long int dummy;\ --__asm__ __volatile__(\ -- "rep; movsb"\ -- :"=&D"(to), "=&S"(from), "=&c"(dummy)\ --/* It's most portable way to notify compiler */\ --/* that edi, esi and ecx are clobbered in asm block. */\ --/* Thanks to A'rpi for hint!!! */\ -- :"0" (to), "1" (from),"2" (n)\ -- : "memory");\ --} -- --#ifdef HAVE_SSE --#define MMREG_SIZE 16 --#else --#define MMREG_SIZE 64 /*8*/ --#endif -- --/* Small defines (for readability only) ;) */ --#ifdef HAVE_K6_2PLUS --#define PREFETCH "prefetch" --/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. 
*/
--#define EMMS "femms"
--#else
--#define PREFETCH "prefetchnta"
--#define EMMS "emms"
--#endif
--
--#ifdef HAVE_MMX2
--#define MOVNTQ "movntq"
--#else
--#define MOVNTQ "movq"
--#endif
--
--#ifdef HAVE_MMX1
--#define MIN_LEN 0x800 /* 2K blocks */
--#else
--#define MIN_LEN 0x40 /* 64-byte blocks */
--#endif
--
--void * fast_memcpy(void * to, const void * from, size_t len)
--{
-- void *retval;
-- size_t i;
-- retval = to;
--#ifdef STATISTICS
-- {
-- static int freq[33];
-- static int t=0;
-- int i;
-- for(i=0; len>(1<<i); i++);
-- freq[i]++;
-- t++;
-- if(1024*1024*1024 % t == 0)
-- for(i=0; i<32; i++)
-- printf("freq < %8d %4d\n", 1<<i, freq[i]);
-- }
--#endif
--#ifndef HAVE_MMX1
-- /* PREFETCH has effect even for MOVSB instruction ;) */
-- __asm__ __volatile__ (
-- PREFETCH" (%0)\n"
-- PREFETCH" 64(%0)\n"
-- PREFETCH" 128(%0)\n"
-- PREFETCH" 192(%0)\n"
-- PREFETCH" 256(%0)\n"
-- : : "r" (from) );
--#endif
-- if(len >= MIN_LEN)
-- {
-- register unsigned long int delta;
-- /* Align destinition to MMREG_SIZE -boundary */
-- delta = ((unsigned long int)to)&(MMREG_SIZE-1);
-- if(delta)
-- {
-- delta=MMREG_SIZE-delta;
-- len -= delta;
-- small_memcpy(to, from, delta);
-- }
-- i = len >> 6; /* len/64 */
-- len&=63;
-- /*
-- This algorithm is top effective when the code consequently
-- reads and writes blocks which have size of cache line.
-- Size of cache line is processor-dependent.
-- It will, however, be a minimum of 32 bytes on any processors.
-- It would be better to have a number of instructions which
-- perform reading and writing to be multiple to a number of
-- processor's decoders, but it's not always possible.
-- */
--#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
-- if(((unsigned long)from) & 15)
-- /* if SRC is misaligned */
-- for(; i>0; i--)
-- {
-- __asm__ __volatile__ (
-- PREFETCH" 320(%0)\n"
-- "movups (%0), %%xmm0\n"
-- "movups 16(%0), %%xmm1\n"
-- "movups 32(%0), %%xmm2\n"
-- "movups 48(%0), %%xmm3\n"
-- "movntps %%xmm0, (%1)\n"
-- "movntps %%xmm1, 16(%1)\n"
-- "movntps %%xmm2, 32(%1)\n"
-- "movntps %%xmm3, 48(%1)\n"
-- :: "r" (from), "r" (to) : "memory");
-- ((const unsigned char *)from)+=64;
-- ((unsigned char *)to)+=64;
-- }
-- else
-- /*
-- Only if SRC is aligned on 16-byte boundary.
-- It allows to use movaps instead of movups, which required data
-- to be aligned or a general-protection exception (#GP) is generated.
-- */ -- for(; i>0; i--) -- { -- __asm__ __volatile__ ( -- PREFETCH" 320(%0)\n" -- "movaps (%0), %%xmm0\n" -- "movaps 16(%0), %%xmm1\n" -- "movaps 32(%0), %%xmm2\n" -- "movaps 48(%0), %%xmm3\n" -- "movntps %%xmm0, (%1)\n" -- "movntps %%xmm1, 16(%1)\n" -- "movntps %%xmm2, 32(%1)\n" -- "movntps %%xmm3, 48(%1)\n" -- :: "r" (from), "r" (to) : "memory"); -- ((const unsigned char *)from)+=64; -- ((unsigned char *)to)+=64; -- } --#else -- /* Align destination at BLOCK_SIZE boundary */ -- for(; ((ptrdiff_t)to & (BLOCK_SIZE-1)) && i>0; i--) -- { -- __asm__ __volatile__ ( --#ifndef HAVE_MMX1 -- PREFETCH" 320(%0)\n" --#endif -- "movq (%0), %%mm0\n" -- "movq 8(%0), %%mm1\n" -- "movq 16(%0), %%mm2\n" -- "movq 24(%0), %%mm3\n" -- "movq 32(%0), %%mm4\n" -- "movq 40(%0), %%mm5\n" -- "movq 48(%0), %%mm6\n" -- "movq 56(%0), %%mm7\n" -- MOVNTQ" %%mm0, (%1)\n" -- MOVNTQ" %%mm1, 8(%1)\n" -- MOVNTQ" %%mm2, 16(%1)\n" -- MOVNTQ" %%mm3, 24(%1)\n" -- MOVNTQ" %%mm4, 32(%1)\n" -- MOVNTQ" %%mm5, 40(%1)\n" -- MOVNTQ" %%mm6, 48(%1)\n" -- MOVNTQ" %%mm7, 56(%1)\n" -- :: "r" (from), "r" (to) : "memory"); -- from = (const void *) (((const unsigned char *)from)+64); -- to = (void *) (((unsigned char *)to)+64); -- } -- --/* printf(" %p %p\n", (ptrdiff_t)from&1023, (ptrdiff_t)to&1023); */ -- /* Pure Assembly cuz gcc is a bit unpredictable ;) */ --# if 0 -- if(i>=BLOCK_SIZE/64) -- asm volatile( -- "xorl %%eax, %%eax \n\t" -- ".balign 16 \n\t" -- "1: \n\t" -- "movl (%0, %%eax), %%ebx \n\t" -- "movl 32(%0, %%eax), %%ebx \n\t" -- "movl 64(%0, %%eax), %%ebx \n\t" -- "movl 96(%0, %%eax), %%ebx \n\t" -- "addl $128, %%eax \n\t" -- "cmpl %3, %%eax \n\t" -- " jb 1b \n\t" -- -- "xorl %%eax, %%eax \n\t" -- -- ".balign 16 \n\t" -- "2: \n\t" -- "movq (%0, %%eax), %%mm0\n" -- "movq 8(%0, %%eax), %%mm1\n" -- "movq 16(%0, %%eax), %%mm2\n" -- "movq 24(%0, %%eax), %%mm3\n" -- "movq 32(%0, %%eax), %%mm4\n" -- "movq 40(%0, %%eax), %%mm5\n" -- "movq 48(%0, %%eax), %%mm6\n" -- "movq 56(%0, %%eax), %%mm7\n" -- MOVNTQ" %%mm0, (%1, %%eax)\n" -- MOVNTQ" %%mm1, 8(%1, %%eax)\n" -- MOVNTQ" %%mm2, 16(%1, %%eax)\n" -- MOVNTQ" %%mm3, 24(%1, %%eax)\n" -- MOVNTQ" %%mm4, 32(%1, %%eax)\n" -- MOVNTQ" %%mm5, 40(%1, %%eax)\n" -- MOVNTQ" %%mm6, 48(%1, %%eax)\n" -- MOVNTQ" %%mm7, 56(%1, %%eax)\n" -- "addl $64, %%eax \n\t" -- "cmpl %3, %%eax \n\t" -- "jb 2b \n\t" -- --#if CONFUSION_FACTOR > 0 -- /* a few percent speedup on out of order executing CPUs */ -- "movl %5, %%eax \n\t" -- "2: \n\t" -- "movl (%0), %%ebx \n\t" -- "movl (%0), %%ebx \n\t" -- "movl (%0), %%ebx \n\t" -- "movl (%0), %%ebx \n\t" -- "decl %%eax \n\t" -- " jnz 2b \n\t" --#endif -- -- "xorl %%eax, %%eax \n\t" -- "addl %3, %0 \n\t" -- "addl %3, %1 \n\t" -- "subl %4, %2 \n\t" -- "cmpl %4, %2 \n\t" -- " jae 1b \n\t" -- : "+r" (from), "+r" (to), "+r" (i) -- : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR) -- : "%eax", "%ebx" -- ); --#endif -- -- for(; i>0; i--) -- { -- __asm__ __volatile__ ( --#ifndef HAVE_MMX1 -- PREFETCH" 320(%0)\n" --#endif -- "movq (%0), %%mm0\n" -- "movq 8(%0), %%mm1\n" -- "movq 16(%0), %%mm2\n" -- "movq 24(%0), %%mm3\n" -- "movq 32(%0), %%mm4\n" -- "movq 40(%0), %%mm5\n" -- "movq 48(%0), %%mm6\n" -- "movq 56(%0), %%mm7\n" -- MOVNTQ" %%mm0, (%1)\n" -- MOVNTQ" %%mm1, 8(%1)\n" -- MOVNTQ" %%mm2, 16(%1)\n" -- MOVNTQ" %%mm3, 24(%1)\n" -- MOVNTQ" %%mm4, 32(%1)\n" -- MOVNTQ" %%mm5, 40(%1)\n" -- MOVNTQ" %%mm6, 48(%1)\n" -- MOVNTQ" %%mm7, 56(%1)\n" -- :: "r" (from), "r" (to) : "memory"); -- from = (const void *) (((const unsigned char *)from)+64); -- to = (void *) (((unsigned char 
*)to)+64);
-- }
--
--#endif /* Have SSE */
--#ifdef HAVE_MMX2
-- /* since movntq is weakly-ordered, a "sfence"
-- * is needed to become ordered again. */
-- __asm__ __volatile__ ("sfence":::"memory");
--#endif
--#ifndef HAVE_SSE
-- /* enables to use FPU */
-- __asm__ __volatile__ (EMMS:::"memory");
--#endif
-- }
-- /*
-- * Now do the tail of the block
-- */
-- if(len) small_memcpy(to, from, len);
-- return retval;
--}
--
--
--#endif /* #if defined( HAVE_MMX2 ) || defined( HAVE_3DNOW ) || defined( HAVE_MMX ) */
--
--#endif
-diff --git a/xbmc/utils/fastmemcpy.h b/xbmc/utils/fastmemcpy.h
-deleted file mode 100644
-index 43f5904..0000000
---- a/xbmc/utils/fastmemcpy.h
-+++ /dev/null
-@@ -1,35 +0,0 @@
--/*
-- * Copyright (C) 2005-2013 Team XBMC
-- * http://xbmc.org
-- *
-- * This Program is free software; you can redistribute it and/or modify
-- * it under the terms of the GNU General Public License as published by
-- * the Free Software Foundation; either version 2, or (at your option)
-- * any later version.
-- *
-- * This Program is distributed in the hope that it will be useful,
-- * but WITHOUT ANY WARRANTY; without even the implied warranty of
-- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- * GNU General Public License for more details.
-- *
-- * You should have received a copy of the GNU General Public License
-- * along with XBMC; see the file COPYING. If not, see
-- * <http://www.gnu.org/licenses/>.
-- *
-- */
--#pragma once
--
--#ifdef __cplusplus
--extern "C" {
--#endif
--
--#if !defined(TARGET_WINDOWS) && !defined(__ppc__) && !defined(__powerpc__) && !defined(__mips__) && !defined(TARGET_ANDROID) && !defined(TARGET_DARWIN_IOS)
--void * fast_memcpy(void * to, const void * from, size_t len);
--//#define fast_memcpy memcpy
--#else
--#define fast_memcpy memcpy
--#endif
--
--#ifdef __cplusplus
--}
--#endif
-diff --git a/xbmc/utils/test/Makefile b/xbmc/utils/test/Makefile
-index 8fa0526..3a467ad 100644
---- a/xbmc/utils/test/Makefile
-+++ b/xbmc/utils/test/Makefile
-@@ -11,7 +11,6 @@ SRCS= \
- TestCryptThreading.cpp \
- TestDatabaseUtils.cpp \
- TestEndianSwap.cpp \
-- Testfastmemcpy.cpp \
- TestFileOperationJob.cpp \
- TestFileUtils.cpp \
- Testfstrcmp.cpp \
-diff --git a/xbmc/utils/test/Testfastmemcpy.cpp b/xbmc/utils/test/Testfastmemcpy.cpp
-deleted file mode 100644
-index 93a9bb0..0000000
---- a/xbmc/utils/test/Testfastmemcpy.cpp
-+++ /dev/null
-@@ -1,39 +0,0 @@
--/*
-- * Copyright (C) 2005-2013 Team XBMC
-- * http://xbmc.org
-- *
-- * This Program is free software; you can redistribute it and/or modify
-- * it under the terms of the GNU General Public License as published by
-- * the Free Software Foundation; either version 2, or (at your option)
-- * any later version.
-- *
-- * This Program is distributed in the hope that it will be useful,
-- * but WITHOUT ANY WARRANTY; without even the implied warranty of
-- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-- * GNU General Public License for more details.
-- *
-- * You should have received a copy of the GNU General Public License
-- * along with XBMC; see the file COPYING. If not, see
-- * <http://www.gnu.org/licenses/>.
-- *
-- */
--
--#include <stddef.h> // TODO: This should go in fastmemcpy.h instead.
--#include "utils/fastmemcpy.h" -- --#include "gtest/gtest.h" -- --static const char refdata[] = "\x01\x02\x03\x04\x05\x06\x07\x08" -- "\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10" -- "\x11\x12\x13\x14\x15\x16\x17\x18" -- "\x19\x1a\x1b\x1c\x1d\x1e\x1f\x20" -- "\x21\x22\x23\x24\x25\x26\x27\x28" -- "\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30"; -- --TEST(Testfastmemcpy, General) --{ -- char vardata[sizeof(refdata)]; -- memset(vardata, 0, sizeof(vardata)); -- EXPECT_NE(nullptr, fast_memcpy(vardata, refdata, sizeof(refdata))); -- EXPECT_EQ(0, memcmp(refdata, vardata, sizeof(refdata))); --}