From bca6ed0ccc353de666412720f90ae747bc67c4f6 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 19 Dec 2024 11:36:57 -0800 Subject: [PATCH] llama.cpp subdirs --- CMakeLists.txt | 21 +++++- Dockerfile2 | 66 +++++++++++++++++++ Makefile2 | 48 ++++++++++++++ llama/json-schema-to-grammar.h | 34 ---------- llama/llama.cpp/.rsync-filter | 24 +++++++ llama/llama.cpp/LICENSE | 21 ++++++ llama/{ => llama.cpp/common}/base64.hpp | 0 llama/{ => llama.cpp/common}/common.cpp | 26 -------- llama/llama.cpp/common/common.go | 6 ++ llama/{ => llama.cpp/common}/common.h | 26 -------- .../common}/json-schema-to-grammar.cpp | 28 +------- .../llama.cpp/common/json-schema-to-grammar.h | 8 +++ llama/{ => llama.cpp/common}/json.hpp | 0 llama/{ => llama.cpp/common}/log.cpp | 26 -------- llama/{ => llama.cpp/common}/log.h | 26 -------- llama/{ => llama.cpp/common}/sampling.cpp | 26 -------- llama/{ => llama.cpp/common}/sampling.h | 26 -------- llama/{ => llama.cpp/common}/stb_image.h | 0 llama/{ => llama.cpp/examples/llava}/clip.cpp | 26 -------- llama/{ => llama.cpp/examples/llava}/clip.h | 26 -------- .../{ => llama.cpp/examples/llava}/llava.cpp | 26 -------- llama/llama.cpp/examples/llava/llava.go | 6 ++ llama/{ => llama.cpp/examples/llava}/llava.h | 26 -------- llama/{ => llama.cpp/include}/llama.h | 26 -------- llama/{ => llama.cpp/src}/llama-grammar.cpp | 26 -------- llama/{ => llama.cpp/src}/llama-grammar.h | 26 -------- llama/{ => llama.cpp/src}/llama-impl.h | 26 -------- llama/{ => llama.cpp/src}/llama-sampling.cpp | 26 -------- llama/{ => llama.cpp/src}/llama-sampling.h | 26 -------- llama/{ => llama.cpp/src}/llama-vocab.cpp | 26 -------- llama/{ => llama.cpp/src}/llama-vocab.h | 26 -------- llama/{ => llama.cpp/src}/llama.cpp | 26 -------- llama/llama.cpp/src/llama.go | 7 ++ llama/{ => llama.cpp/src}/unicode-data.cpp | 26 -------- llama/llama.cpp/src/unicode-data.h | 20 ++++++ llama/{ => llama.cpp/src}/unicode.cpp | 26 -------- llama/{ => llama.cpp/src}/unicode.h | 26 -------- llama/llama.go | 15 +++-- llama/patches/0001-cuda.patch | 7 +- llama/patches/0002-pretokenizer.patch | 7 +- llama/patches/0003-embeddings.patch | 7 +- llama/patches/0004-clip-unicode.patch | 7 +- llama/patches/0005-solar-pro.patch | 7 +- llama/patches/0006-conditional-fattn.patch | 7 +- ...rt.patch => 0007-add-mllama-support.patch} | 7 +- llama/patches/0007-blas.patch | 29 -------- ...or.patch => 0008-add-unpad-operator.patch} | 7 +- ... 
=> 0009-fix-deepseek-deseret-regex.patch} | 7 +- ...tain-ordering-for-rules-for-grammar.patch} | 7 +- llama/unicode-data.h | 46 ------------- ml/backend/ggml/ggml.go | 53 ++------------- ml/backend/ggml/ggml/.rsync-filter | 21 ++++++ .../ggml/ggml/src/ggml-blas/ggml-blas.cpp | 4 -- ml/backend/ggml/ggml/src/ggml-cpu/cpu.go | 4 +- .../ggml/src/ggml-cpu/llamafile/llamafile.go | 5 ++ ml/backend/ggml/ggml/src/ggml.go | 61 ++++++++++++++++- 56 files changed, 352 insertions(+), 817 deletions(-) create mode 100644 Dockerfile2 create mode 100644 Makefile2 delete mode 100644 llama/json-schema-to-grammar.h create mode 100644 llama/llama.cpp/.rsync-filter create mode 100644 llama/llama.cpp/LICENSE rename llama/{ => llama.cpp/common}/base64.hpp (100%) rename llama/{ => llama.cpp/common}/common.cpp (98%) create mode 100644 llama/llama.cpp/common/common.go rename llama/{ => llama.cpp/common}/common.h (95%) rename llama/{ => llama.cpp/common}/json-schema-to-grammar.cpp (97%) create mode 100644 llama/llama.cpp/common/json-schema-to-grammar.h rename llama/{ => llama.cpp/common}/json.hpp (100%) rename llama/{ => llama.cpp/common}/log.cpp (89%) rename llama/{ => llama.cpp/common}/log.h (77%) rename llama/{ => llama.cpp/common}/sampling.cpp (93%) rename llama/{ => llama.cpp/common}/sampling.h (78%) rename llama/{ => llama.cpp/common}/stb_image.h (100%) rename llama/{ => llama.cpp/examples/llava}/clip.cpp (98%) rename llama/{ => llama.cpp/examples/llava}/clip.h (72%) rename llama/{ => llama.cpp/examples/llava}/llava.cpp (95%) create mode 100644 llama/llama.cpp/examples/llava/llava.go rename llama/{ => llama.cpp/examples/llava}/llava.h (59%) rename llama/{ => llama.cpp/include}/llama.h (98%) rename llama/{ => llama.cpp/src}/llama-grammar.cpp (97%) rename llama/{ => llama.cpp/src}/llama-grammar.h (78%) rename llama/{ => llama.cpp/src}/llama-impl.h (79%) rename llama/{ => llama.cpp/src}/llama-sampling.cpp (98%) rename llama/{ => llama.cpp/src}/llama-sampling.h (54%) rename llama/{ => llama.cpp/src}/llama-vocab.cpp (98%) rename llama/{ => llama.cpp/src}/llama-vocab.h (83%) rename llama/{ => llama.cpp/src}/llama.cpp (99%) create mode 100644 llama/llama.cpp/src/llama.go rename llama/{ => llama.cpp/src}/unicode-data.cpp (99%) create mode 100644 llama/llama.cpp/src/unicode-data.h rename llama/{ => llama.cpp/src}/unicode.cpp (96%) rename llama/{ => llama.cpp/src}/unicode.h (63%) rename llama/patches/{0008-add-mllama-support.patch => 0007-add-mllama-support.patch} (99%) delete mode 100644 llama/patches/0007-blas.patch rename llama/patches/{0009-add-unpad-operator.patch => 0008-add-unpad-operator.patch} (99%) rename llama/patches/{0010-fix-deepseek-deseret-regex.patch => 0009-fix-deepseek-deseret-regex.patch} (96%) rename llama/patches/{0011-Maintain-ordering-for-rules-for-grammar.patch => 0010-Maintain-ordering-for-rules-for-grammar.patch} (84%) delete mode 100644 llama/unicode-data.h create mode 100644 ml/backend/ggml/ggml/.rsync-filter create mode 100644 ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go diff --git a/CMakeLists.txt b/CMakeLists.txt index 21d687b81..d6369a55d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,10 +9,15 @@ find_package(Threads REQUIRED) set(CMAKE_BUILD_TYPE Release) set(BUILD_SHARED_LIBS ON) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + set(GGML_CCACHE ON) set(GGML_SCHED_MAX_COPIES 4) set(GGML_CPU_ALL_VARIANTS ON) set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128) +set(GGML_LLAMAFILE ON) add_compile_definitions(GGML_BUILD) 
add_compile_definitions(GGML_SHARED) @@ -24,9 +29,21 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx) +function(set_target_output_directory _target) + if(TARGET ${_target}) + set_target_properties(${_target} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib + ) + endif() +endfunction() + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src EXCLUDE_FROM_ALL) +set_target_output_directory(ggml-base) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu) +set_target_output_directory(ggml-cpu) find_package(BLAS) if(NOT BLAS_VENDOR) @@ -36,14 +53,16 @@ else() endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-blas) -target_compile_features(ggml-blas PRIVATE cxx_std_11) +set_target_output_directory(ggml-blas) check_language(CUDA) if(CMAKE_CUDA_COMPILER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda) + set_target_output_directory(ggml-cuda) endif() check_language(HIP) if(CMAKE_HIP_COMPILER) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip) + set_target_output_directory(ggml-hip) endif() diff --git a/Dockerfile2 b/Dockerfile2 new file mode 100644 index 000000000..1a584488d --- /dev/null +++ b/Dockerfile2 @@ -0,0 +1,66 @@ +ARG CUDA_11_VERSION=11.3 +ARG CUDA_12_VERSION=12.4 +ARG ROCM_VERSION=6.1.2 +ARG JETPACK_5_VERSION=r35.4.1 +ARG JETPACK_6_VERSION=r36.2.0 +ARG CMAKE_VERSION=3.31.2 + +FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS base +ARG CMAKE_VERSION +RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz | tar xz -C /usr --strip-components 1 +RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \ + && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo + +# FROM --platform=linux/arm64 rockylinux:8 AS base +# ARG CMAKE_VERSION +# RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1 +# RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo + +FROM base AS amd64 +ARG CUDA_11_VERSION +ARG CUDA_12_VERSION +RUN yum install -y cuda-toolkit-${CUDA_11_VERSION//./-} \ + && yum install -y cuda-toolkit-${CUDA_12_VERSION//./-} +COPY CMakeLists.txt CMakeLists.txt +COPY ml/backend/ggml/ggml ml/backend/ggml/ggml + +FROM --platform=linux/amd64 amd64 AS cuda_11 +ENV PATH=/usr/local/cuda-${CUDA_11_VERSION}/bin:$PATH +RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" +RUN cmake --build build --target ggml-cuda -j + +FROM --platform=linux/amd64 amd64 AS cuda_12 +ENV PATH=/usr/local/cuda-${CUDA_12_VERSION}/bin:$PATH +RUN cmake -S . -B build -DCMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" +RUN cmake --build build --target ggml-cuda -j + +FROM --platform=linux/amd64 amd64 AS rocm +RUN cmake -S . 
-B build -DCMAKE_HIP_ARCHITECTURES="gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" +RUN cmake --build build --target ggml-hip -j + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5_VERSION} AS jetpack_5 +ARG CMAKE_VERSION +RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1 +COPY CMakeLists.txt . +COPY ml/backend/ggml/ggml . +RUN cmake -S . -B build \ + -DCMAKE_CUDA_ARCHITECTURES="72;87" +RUN cmake --build build --target ggml-cuda + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6_VERSION} AS jetpack_6 +ARG CMAKE_VERSION +RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1 +COPY CMakeLists.txt . +COPY ml/backend/ggml/ggml . +RUN cmake -S . -B build \ + -DCMAKE_CUDA_ARCHITECTURES="87" +RUN cmake --build build --target ggml-cuda + +FROM --platform=linux/amd64 golang:1.23 +COPY --from=cuda_11 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-11.so +COPY --from=cuda_12 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-12.so +COPY --from=rocm build/ml/backend/ggml/ggml/src/ggml-hip/libggml-hip.so libggml-hip.so + +# FROM --platform=linux/arm64 golang:1.23 +# COPY --from=jetpack_5 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-5.so +# COPY --from=jetpack_6 build/ml/backend/ggml/ggml/src/ggml-cuda/libggml-cuda.so libggml-cuda-jetpack-6.so diff --git a/Makefile2 b/Makefile2 new file mode 100644 index 000000000..3fbff0e79 --- /dev/null +++ b/Makefile2 @@ -0,0 +1,48 @@ +UPSTREAM=https://github.com/ggerganov/llama.cpp.git +WORKDIR=llama/vendor +FETCH_HEAD=40c6d79fb52f995f47507fedfeaae2ac05d9b35c + +all: sync + +.PHONY: sync +sync: llama/llama.cpp ml/backend/ggml/ggml + +.PHONY: llama/llama.cpp +llama/llama.cpp: llama/vendor apply_patches + rsync -arvzc --delete -f "merge $@/.rsync-filter" $< $@ + rsync -arvzc --delete --include LICENSE --exclude $(WORKDIR) $@ + +.PHONY: ml/backend/ggml/ggml apply_patches +ml/backend/ggml/ggml: llama/vendor/ggml apply_patches + rsync -arvzc --delete -f "merge $@/.rsync-filter" $< $@ + rsync -arvzc --delete --include LICENSE --exclude $(WORKDIR) $@ + +PATCHES=$(wildcard llama/patches/*.patch) + +.PHONY: apply_patches +.NOTPARALLEL: +apply_patches: $(addsuffix ed, $(PATCHES)) + +%.patched: %.patch + if git -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi + +.PHONY: checkout +checkout: $(WORKDIR) + git -C $(WORKDIR) fetch + git -C $(WORKDIR) checkout -f $(FETCH_HEAD) + +$(WORKDIR): + git clone $(UPSTREAM) $(WORKDIR) + +.PHONE: format_patches +format_patches: llama/patches + git -C $(WORKDIR) format-patch \ + --no-signature \ + --no-numbered \ + --zero-commit \ + -o $(realpath $<) \ + $(FETCH_HEAD) + +.PHONE: clean +clean: checkout + $(RM) $(addsuffix ed, $(PATCHES)) diff --git a/llama/json-schema-to-grammar.h b/llama/json-schema-to-grammar.h deleted file mode 100644 index bfff9bc58..000000000 --- a/llama/json-schema-to-grammar.h +++ /dev/null @@ -1,34 +0,0 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the 
"Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "ggml.h" -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" - -std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/llama/llama.cpp/.rsync-filter b/llama/llama.cpp/.rsync-filter new file mode 100644 index 000000000..bedb86972 --- /dev/null +++ b/llama/llama.cpp/.rsync-filter @@ -0,0 +1,24 @@ +protect **/*.go +include common/ +include common/base64.* +include common/common.* +include common/json-schema-to-grammar.* +include common/json.* +include common/log.* +include common/sampling.* +include common/stb_image.* +include include/ +include include/llama.* +include examples/ +include examples/llava/ +include examples/llava/clip.* +include examples/llava/llava.* +include src/ +include src/llama-grammar.* +include src/llama-impl.* +include src/llama-sampling.* +include src/llama-vocab.* +include src/llama.* +include src/unicode-data.* +include src/unicode.* +exclude * diff --git a/llama/llama.cpp/LICENSE b/llama/llama.cpp/LICENSE new file mode 100644 index 000000000..acb96ce78 --- /dev/null +++ b/llama/llama.cpp/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023-2024 The ggml authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/llama/base64.hpp b/llama/llama.cpp/common/base64.hpp similarity index 100% rename from llama/base64.hpp rename to llama/llama.cpp/common/base64.hpp diff --git a/llama/common.cpp b/llama/llama.cpp/common/common.cpp similarity index 98% rename from llama/common.cpp rename to llama/llama.cpp/common/common.cpp index 1b90fb445..6143516d2 100644 --- a/llama/common.cpp +++ b/llama/llama.cpp/common/common.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #if defined(_MSC_VER) #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif diff --git a/llama/llama.cpp/common/common.go b/llama/llama.cpp/common/common.go new file mode 100644 index 000000000..ebbb738f2 --- /dev/null +++ b/llama/llama.cpp/common/common.go @@ -0,0 +1,6 @@ +package common + +// #cgo CXXFLAGS: -std=c++11 +// #cgo CPPFLAGS: -I${SRCDIR}/../include +// #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include +import "C" diff --git a/llama/common.h b/llama/llama.cpp/common/common.h similarity index 95% rename from llama/common.h rename to llama/llama.cpp/common/common.h index 03d60aaff..0373fd3ea 100644 --- a/llama/common.h +++ b/llama/llama.cpp/common/common.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - // Various helper functions and utilities #pragma once diff --git a/llama/json-schema-to-grammar.cpp b/llama/llama.cpp/common/json-schema-to-grammar.cpp similarity index 97% rename from llama/json-schema-to-grammar.cpp rename to llama/llama.cpp/common/json-schema-to-grammar.cpp index e5191967b..2a8dbd22d 100644 --- a/llama/json-schema-to-grammar.cpp +++ b/llama/llama.cpp/common/json-schema-to-grammar.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #include "json-schema-to-grammar.h" #include #include @@ -417,7 +391,7 @@ class SchemaConverter { private: std::function _fetch_json; bool _dotall; - std::map _rules; + std::unordered_map _rules; std::unordered_map _refs; std::unordered_set _refs_being_resolved; std::vector _errors; diff --git a/llama/llama.cpp/common/json-schema-to-grammar.h b/llama/llama.cpp/common/json-schema-to-grammar.h new file mode 100644 index 000000000..41623b346 --- /dev/null +++ b/llama/llama.cpp/common/json-schema-to-grammar.h @@ -0,0 +1,8 @@ +#pragma once + +#include "ggml.h" +// Change JSON_ASSERT from assert() to GGML_ASSERT: +#define JSON_ASSERT GGML_ASSERT +#include "json.hpp" + +std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/llama/json.hpp b/llama/llama.cpp/common/json.hpp similarity index 100% rename from llama/json.hpp rename to llama/llama.cpp/common/json.hpp diff --git a/llama/log.cpp b/llama/llama.cpp/common/log.cpp similarity index 89% rename from llama/log.cpp rename to llama/llama.cpp/common/log.cpp index 1a98ff726..04c7c0ed1 100644 --- a/llama/log.cpp +++ b/llama/llama.cpp/common/log.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "log.h" #include diff --git a/llama/log.h b/llama/llama.cpp/common/log.h similarity index 77% rename from llama/log.h rename to llama/llama.cpp/common/log.h index 951d0c21d..66605cc69 100644 --- a/llama/log.h +++ b/llama/llama.cpp/common/log.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "ggml.h" // for ggml_log_level diff --git a/llama/sampling.cpp b/llama/llama.cpp/common/sampling.cpp similarity index 93% rename from llama/sampling.cpp rename to llama/llama.cpp/common/sampling.cpp index 616555f06..0c4699a89 100644 --- a/llama/sampling.cpp +++ b/llama/llama.cpp/common/sampling.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "sampling.h" #include "common.h" diff --git a/llama/sampling.h b/llama/llama.cpp/common/sampling.h similarity index 78% rename from llama/sampling.h rename to llama/llama.cpp/common/sampling.h index 38a5f2b22..348911b18 100644 --- a/llama/sampling.h +++ b/llama/llama.cpp/common/sampling.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "llama.h" diff --git a/llama/stb_image.h b/llama/llama.cpp/common/stb_image.h similarity index 100% rename from llama/stb_image.h rename to llama/llama.cpp/common/stb_image.h diff --git a/llama/clip.cpp b/llama/llama.cpp/examples/llava/clip.cpp similarity index 98% rename from llama/clip.cpp rename to llama/llama.cpp/examples/llava/clip.cpp index dd88a6e90..427d5e020 100644 --- a/llama/clip.cpp +++ b/llama/llama.cpp/examples/llava/clip.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - // NOTE: This is modified from clip.cpp only for LLaVA, // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it diff --git a/llama/clip.h b/llama/llama.cpp/examples/llava/clip.h similarity index 72% rename from llama/clip.h rename to llama/llama.cpp/examples/llava/clip.h index 2af04bf53..78588bdf1 100644 --- a/llama/clip.h +++ b/llama/llama.cpp/examples/llava/clip.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #ifndef CLIP_H #define CLIP_H diff --git a/llama/llava.cpp b/llama/llama.cpp/examples/llava/llava.cpp similarity index 95% rename from llama/llava.cpp rename to llama/llama.cpp/examples/llava/llava.cpp index d1d00eab5..d56644a89 100644 --- a/llama/llava.cpp +++ b/llama/llama.cpp/examples/llava/llava.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "clip.h" #include "llava.h" diff --git a/llama/llama.cpp/examples/llava/llava.go b/llama/llama.cpp/examples/llava/llava.go new file mode 100644 index 000000000..37b031cb7 --- /dev/null +++ b/llama/llama.cpp/examples/llava/llava.go @@ -0,0 +1,6 @@ +package llava + +// #cgo CXXFLAGS: -std=c++11 +// #cgo CPPFLAGS: -I${SRCDIR}/../../include -I${SRCDIR}/../../common +// #cgo CPPFLAGS: -I${SRCDIR}/../../../../ml/backend/ggml/ggml/include +import "C" diff --git a/llama/llava.h b/llama/llama.cpp/examples/llava/llava.h similarity index 59% rename from llama/llava.h rename to llama/llama.cpp/examples/llava/llava.h index 3acd9f615..b6feb3027 100644 --- a/llama/llava.h +++ b/llama/llama.cpp/examples/llava/llava.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #ifndef LLAVA_H #define LLAVA_H diff --git a/llama/llama.h b/llama/llama.cpp/include/llama.h similarity index 98% rename from llama/llama.h rename to llama/llama.cpp/include/llama.h index 4ff8c8621..aba85f860 100644 --- a/llama/llama.h +++ b/llama/llama.cpp/include/llama.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #ifndef LLAMA_H #define LLAMA_H diff --git a/llama/llama-grammar.cpp b/llama/llama.cpp/src/llama-grammar.cpp similarity index 97% rename from llama/llama-grammar.cpp rename to llama/llama.cpp/src/llama-grammar.cpp index d9e4839f5..74e9f64b3 100644 --- a/llama/llama-grammar.cpp +++ b/llama/llama.cpp/src/llama-grammar.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-grammar.h" #include "llama-vocab.h" diff --git a/llama/llama-grammar.h b/llama/llama.cpp/src/llama-grammar.h similarity index 78% rename from llama/llama-grammar.h rename to llama/llama.cpp/src/llama-grammar.h index 9052dd2f9..f529ce351 100644 --- a/llama/llama-grammar.h +++ b/llama/llama.cpp/src/llama-grammar.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include "llama-impl.h" diff --git a/llama/llama-impl.h b/llama/llama.cpp/src/llama-impl.h similarity index 79% rename from llama/llama-impl.h rename to llama/llama.cpp/src/llama-impl.h index 3a33cf331..70f16b61c 100644 --- a/llama/llama-impl.h +++ b/llama/llama.cpp/src/llama-impl.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include "llama.h" diff --git a/llama/llama-sampling.cpp b/llama/llama.cpp/src/llama-sampling.cpp similarity index 98% rename from llama/llama-sampling.cpp rename to llama/llama.cpp/src/llama-sampling.cpp index 154cc40eb..fd8ca8a9e 100644 --- a/llama/llama-sampling.cpp +++ b/llama/llama.cpp/src/llama-sampling.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-sampling.h" #include "llama-vocab.h" diff --git a/llama/llama-sampling.h b/llama/llama.cpp/src/llama-sampling.h similarity index 54% rename from llama/llama-sampling.h rename to llama/llama.cpp/src/llama-sampling.h index af63bb885..919f6fdfc 100644 --- a/llama/llama-sampling.h +++ b/llama/llama.cpp/src/llama-sampling.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ? 
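Besides relocating the sources under src/, the patch adds a cgo stub for that directory (src/llama.go, below) which blank-imports the ggml backend package, and BackendInit in llama/llama.go gains a ggml.OnceLoad() call so the ggml backend libraries are loaded before llama_backend_init. A short usage sketch under those assumptions — the main package here is hypothetical; only BackendInit and OnceLoad come from this patch:

// main.go: hypothetical consumer of the relocated packages.
package main

import (
	// Importing the llama package transitively pulls in the cgo stub
	// packages under llama/llama.cpp/... and the ggml sources they need.
	"github.com/ollama/ollama/llama"
)

func main() {
	// BackendInit (patched further below) calls ggml.OnceLoad to load the
	// ggml backend libraries before initializing llama.cpp itself.
	llama.BackendInit()
}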
diff --git a/llama/llama-vocab.cpp b/llama/llama.cpp/src/llama-vocab.cpp similarity index 98% rename from llama/llama-vocab.cpp rename to llama/llama.cpp/src/llama-vocab.cpp index 1a6c84fbf..05ef0e71f 100644 --- a/llama/llama-vocab.cpp +++ b/llama/llama.cpp/src/llama-vocab.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-vocab.h" #include "unicode.h" diff --git a/llama/llama-vocab.h b/llama/llama.cpp/src/llama-vocab.h similarity index 83% rename from llama/llama-vocab.h rename to llama/llama.cpp/src/llama-vocab.h index ec7329eb2..4bb16d2e4 100644 --- a/llama/llama-vocab.h +++ b/llama/llama.cpp/src/llama-vocab.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - #pragma once #include "llama-impl.h" diff --git a/llama/llama.cpp b/llama/llama.cpp/src/llama.cpp similarity index 99% rename from llama/llama.cpp rename to llama/llama.cpp/src/llama.cpp index 181525f4b..468816428 100644 --- a/llama/llama.cpp +++ b/llama/llama.cpp/src/llama.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #include "llama-impl.h" #include "llama-vocab.h" #include "llama-sampling.h" diff --git a/llama/llama.cpp/src/llama.go b/llama/llama.cpp/src/llama.go new file mode 100644 index 000000000..29385ccc0 --- /dev/null +++ b/llama/llama.cpp/src/llama.go @@ -0,0 +1,7 @@ +package llama + +// #cgo CXXFLAGS: -std=c++11 +// #cgo CPPFLAGS: -I${SRCDIR}/../include +// #cgo CPPFLAGS: -I${SRCDIR}/../../../ml/backend/ggml/ggml/include +import "C" +import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src" diff --git a/llama/unicode-data.cpp b/llama/llama.cpp/src/unicode-data.cpp similarity index 99% rename from llama/unicode-data.cpp rename to llama/llama.cpp/src/unicode-data.cpp index 4b3a8dec9..04dcd7fcf 100644 --- a/llama/unicode-data.cpp +++ b/llama/llama.cpp/src/unicode-data.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - // generated with scripts/gen-unicode-data.py #include "unicode-data.h" diff --git a/llama/llama.cpp/src/unicode-data.h b/llama/llama.cpp/src/unicode-data.h new file mode 100644 index 000000000..f6973ebd2 --- /dev/null +++ b/llama/llama.cpp/src/unicode-data.h @@ -0,0 +1,20 @@ +#pragma once + +#include <cstdint> +#include <vector> +#include <unordered_map> +#include <unordered_set> + +struct range_nfd { + uint32_t first; + uint32_t last; + uint32_t nfd; +}; + +static const uint32_t MAX_CODEPOINTS = 0x110000; + +extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags; +extern const std::unordered_set<uint32_t> unicode_set_whitespace; +extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase; +extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase; +extern const std::initializer_list<range_nfd> unicode_ranges_nfd; diff --git a/llama/unicode.cpp b/llama/llama.cpp/src/unicode.cpp similarity index 96% rename from llama/unicode.cpp rename to llama/llama.cpp/src/unicode.cpp index d9cedd420..51dd81fba 100644 --- a/llama/unicode.cpp +++ b/llama/llama.cpp/src/unicode.cpp @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ - #if defined(_MSC_VER) #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING #endif diff --git a/llama/unicode.h b/llama/llama.cpp/src/unicode.h similarity index 63% rename from llama/unicode.h rename to llama/llama.cpp/src/unicode.h index c6752ee0f..008532a24 100644 --- a/llama/unicode.h +++ b/llama/llama.cpp/src/unicode.h @@ -1,29 +1,3 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - #pragma once #include diff --git a/llama/llama.go b/llama/llama.go index ab6da37aa..c48f2708c 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -1,18 +1,19 @@ package llama -//go:generate make -j 8 - /* #cgo CFLAGS: -std=c11 #cgo CXXFLAGS: -std=c++11 +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/include +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/common +#cgo CPPFLAGS: -I${SRCDIR}/llama.cpp/examples/llava #cgo CPPFLAGS: -I${SRCDIR}/../ml/backend/ggml/ggml/include -#cgo darwin,arm64 CPPFLAGS: -DGGML_USE_METAL #include +#include "ggml.h" #include "llama.h" #include "clip.h" -#include "ggml.h" #include "llava.h" + #include "mllama.h" #include "sampling_ext.h" @@ -47,10 +48,14 @@ import ( "sync/atomic" "unsafe" - _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src" + _ "github.com/ollama/ollama/llama/llama.cpp/common" + _ "github.com/ollama/ollama/llama/llama.cpp/examples/llava" + _ "github.com/ollama/ollama/llama/llama.cpp/src" + "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ) func BackendInit() { + ggml.OnceLoad() C.llama_backend_init() } diff --git a/llama/patches/0001-cuda.patch b/llama/patches/0001-cuda.patch index c74885269..3f9ac4d33 100644 --- a/llama/patches/0001-cuda.patch +++ b/llama/patches/0001-cuda.patch @@ -1,7 +1,7 @@ -From 702ee500b229e910e3e6cd3c84d87763c51fb411 Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Thu, 6 Jun 2024 23:55:47 -0700 -Subject: [PATCH 01/11] cuda +Subject: [PATCH] cuda --- ggml/src/ggml-backend.cpp | 2 +- @@ -53,6 +53,3 @@ index 093ae900..a0cf4ba4 100644 } static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { --- -2.46.0 - diff --git a/llama/patches/0002-pretokenizer.patch b/llama/patches/0002-pretokenizer.patch index 72e4b268c..c87d1e1a6 100644 --- a/llama/patches/0002-pretokenizer.patch +++ b/llama/patches/0002-pretokenizer.patch @@ -1,7 +1,7 @@ -From 
67eb186ccf062100835d413b1c3e2a0fc58e1c0f Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Sep 2024 15:53:13 -0700 -Subject: [PATCH 02/11] pretokenizer +Subject: [PATCH] pretokenizer --- src/llama.cpp | 14 +++----------- @@ -39,6 +39,3 @@ index 6a6f4c2a..fa09f3b3 100644 } } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; --- -2.46.0 - diff --git a/llama/patches/0003-embeddings.patch b/llama/patches/0003-embeddings.patch index 74832a2ec..996f8dbe6 100644 --- a/llama/patches/0003-embeddings.patch +++ b/llama/patches/0003-embeddings.patch @@ -1,7 +1,7 @@ -From a9a7820ae111d70e24d4f7004378b5321e8a29c7 Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Sep 2024 15:53:14 -0700 -Subject: [PATCH 03/11] embeddings +Subject: [PATCH] embeddings --- src/llama.cpp | 9 ++++++--- @@ -45,6 +45,3 @@ index fa09f3b3..d1791af0 100644 // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); --- -2.46.0 - diff --git a/llama/patches/0004-clip-unicode.patch b/llama/patches/0004-clip-unicode.patch index 73bde706b..13e945c37 100644 --- a/llama/patches/0004-clip-unicode.patch +++ b/llama/patches/0004-clip-unicode.patch @@ -1,7 +1,7 @@ -From aa5ad04094458943643df789c5b7fd7d4c68dafb Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Sep 2024 15:53:15 -0700 -Subject: [PATCH 04/11] clip-unicode +Subject: [PATCH] clip-unicode --- examples/llava/clip.cpp | 40 +++++++++++++++++++++++++++++++++++++++- @@ -74,6 +74,3 @@ index d7c94352..427d5e02 100644 } // vision model --- -2.46.0 - diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index f69ed943d..35b8c55d8 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -1,7 +1,7 @@ -From 226de4f71ce73a87a805dc83484b32f9f9d9c24d Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Sep 2024 15:53:16 -0700 -Subject: [PATCH 05/11] solar-pro +Subject: [PATCH] solar-pro solar-pro introduces block skip connections where blocks are connected to other, non-sequential blocks with a scale multiple @@ -404,6 +404,3 @@ index d1791af0..b01770d0 100644 return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 --- -2.46.0 - diff --git a/llama/patches/0006-conditional-fattn.patch b/llama/patches/0006-conditional-fattn.patch index c80864f17..9eb37d346 100644 --- a/llama/patches/0006-conditional-fattn.patch +++ b/llama/patches/0006-conditional-fattn.patch @@ -1,7 +1,7 @@ -From b9d893b5c7c3dcff42bce378ea26587a6c7d1113 Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 9 Oct 2024 17:26:23 -0700 -Subject: [PATCH 06/11] conditional-fattn +Subject: [PATCH] conditional-fattn --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++ @@ -23,6 +23,3 @@ index a2fcfe5d..5eed90da 100644 case GGML_OP_CROSS_ENTROPY_LOSS: ggml_cuda_cross_entropy_loss(ctx, dst); break; --- -2.46.0 - diff --git a/llama/patches/0008-add-mllama-support.patch b/llama/patches/0007-add-mllama-support.patch similarity index 99% rename from llama/patches/0008-add-mllama-support.patch rename to 
llama/patches/0007-add-mllama-support.patch index 4ed259fac..ae8b80177 100644 --- a/llama/patches/0008-add-mllama-support.patch +++ b/llama/patches/0007-add-mllama-support.patch @@ -1,7 +1,7 @@ -From c2f0b1c0eda94eea785a1de9098df9eb29d64eb5 Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Thu, 17 Oct 2024 15:18:22 -0700 -Subject: [PATCH 08/11] add mllama support +Subject: [PATCH] add mllama support mllama adds cross-attention layers to the standard llama architecture it also requires a way to input a new tensor: cross_attention_state @@ -784,6 +784,3 @@ index b01770d0..46881642 100644 } else { batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } --- -2.46.0 - diff --git a/llama/patches/0007-blas.patch b/llama/patches/0007-blas.patch deleted file mode 100644 index d0c3eed22..000000000 --- a/llama/patches/0007-blas.patch +++ /dev/null @@ -1,29 +0,0 @@ -From 9a5a9479d9cdf2032ff989fd297e50490f53e4c2 Mon Sep 17 00:00:00 2001 -From: Jesse Gross -Date: Mon, 30 Sep 2024 16:31:04 -0700 -Subject: [PATCH 07/11] blas - ---- - ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp -index ec158dfa..b3ac1fa4 100644 ---- a/ggml/src/ggml-blas/ggml-blas.cpp -+++ b/ggml/src/ggml-blas/ggml-blas.cpp -@@ -1,3 +1,5 @@ -+#ifdef GGML_USE_BLAS -+ - #include "ggml-impl.h" - #include "ggml-blas.h" - #include "ggml-backend-impl.h" -@@ -515,3 +517,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) { - } - - GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) -+ -+#endif // GGML_USE_BLAS -\ No newline at end of file --- -2.46.0 - diff --git a/llama/patches/0009-add-unpad-operator.patch b/llama/patches/0008-add-unpad-operator.patch similarity index 99% rename from llama/patches/0009-add-unpad-operator.patch rename to llama/patches/0008-add-unpad-operator.patch index 470b8b427..d30b5e041 100644 --- a/llama/patches/0009-add-unpad-operator.patch +++ b/llama/patches/0008-add-unpad-operator.patch @@ -1,7 +1,7 @@ -From 8e07a88fa87f31b6f2245c02a89a4a367ed6013c Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 17 Oct 2024 17:19:25 -0700 -Subject: [PATCH 09/11] add unpad operator +Subject: [PATCH] add unpad operator --- ggml/include/ggml.h | 10 +++++ @@ -394,6 +394,3 @@ index 1a9a7efa..ea2b259b 100644 // ggml_arange struct ggml_tensor * ggml_arange( --- -2.46.0 - diff --git a/llama/patches/0010-fix-deepseek-deseret-regex.patch b/llama/patches/0009-fix-deepseek-deseret-regex.patch similarity index 96% rename from llama/patches/0010-fix-deepseek-deseret-regex.patch rename to llama/patches/0009-fix-deepseek-deseret-regex.patch index 5e8a2e216..9ea501d06 100644 --- a/llama/patches/0010-fix-deepseek-deseret-regex.patch +++ b/llama/patches/0009-fix-deepseek-deseret-regex.patch @@ -1,7 +1,7 @@ -From 4236c07fc90fb758b89921fa7ef39dc0482c4bea Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 25 Oct 2024 16:25:18 -0700 -Subject: [PATCH 10/11] fix deepseek deseret regex +Subject: [PATCH] fix deepseek deseret regex On windows compiled with gcc the c++ regex library failed to handle the characters @@ -70,6 +70,3 @@ index 3d459263..51dd81fb 100644 } static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) { --- -2.46.0 - diff --git 
a/llama/patches/0011-Maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch similarity index 84% rename from llama/patches/0011-Maintain-ordering-for-rules-for-grammar.patch rename to llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch index ccb6fce96..33b504ec1 100644 --- a/llama/patches/0011-Maintain-ordering-for-rules-for-grammar.patch +++ b/llama/patches/0010-Maintain-ordering-for-rules-for-grammar.patch @@ -1,7 +1,7 @@ -From 7752556d7922e92b455ed92d22a3bfa9725f4458 Mon Sep 17 00:00:00 2001 +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: ParthSareen Date: Wed, 11 Dec 2024 15:37:32 -0800 -Subject: [PATCH 11/11] Maintain ordering for rules for grammar +Subject: [PATCH] Maintain ordering for rules for grammar --- common/json-schema-to-grammar.cpp | 2 +- @@ -20,6 +20,3 @@ index dadc18c8..2a8dbd22 100644 std::unordered_map _refs; std::unordered_set _refs_being_resolved; std::vector _errors; --- -2.46.0 - diff --git a/llama/unicode-data.h b/llama/unicode-data.h deleted file mode 100644 index 393ea0bd4..000000000 --- a/llama/unicode-data.h +++ /dev/null @@ -1,46 +0,0 @@ -/** - * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#pragma once - -#include <cstdint> -#include <vector> -#include <unordered_map> -#include <unordered_set> - -struct range_nfd { - uint32_t first; - uint32_t last; - uint32_t nfd; -}; - -static const uint32_t MAX_CODEPOINTS = 0x110000; - -extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags; -extern const std::unordered_set<uint32_t> unicode_set_whitespace; -extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase; -extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase; -extern const std::initializer_list<range_nfd> unicode_ranges_nfd; diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index 1fb057eee..108631266 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -15,18 +15,15 @@ import ( "io" "log/slog" "os" - "path/filepath" - "runtime" - "strings" "sync" "unsafe" "github.com/ollama/ollama/format" - "github.com/ollama/ollama/fs/ggml" + fs "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/ml" "golang.org/x/sync/errgroup" - _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src" + "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ) type device struct { @@ -67,45 +64,7 @@ func (d device) LogValue() slog.Value { } var devices = sync.OnceValue(func() []device { - var lib struct{ name, pattern, defaultValue string } - if runtime.GOOS == "windows" { - lib.name = "PATH" - lib.pattern = "ggml-*.dll" - lib.defaultValue = "." - } else if runtime.GOOS == "linux" { - lib.name = "LD_LIBRARY_PATH" - lib.pattern = "libggml-*.so" - lib.defaultValue = "/usr/local/lib:/usr/lib" - } - - if lib.name != "" { - paths, ok := os.LookupEnv(lib.name) - if !ok { - paths = lib.defaultValue - } - - for _, path := range filepath.SplitList(paths) { - matches, err := filepath.Glob(filepath.Join(path, lib.pattern)) - if err != nil { - slog.Error("failed to glob", "path", path, "error", err) - continue - } - - for _, match := range matches { - if base := filepath.Base(match); strings.HasPrefix(base, "ggml-base") || - strings.HasPrefix(base, "libggml-base") { - continue - } - - func() { - cmatch := C.CString(match) - defer C.free(unsafe.Pointer(cmatch)) - - C.ggml_backend_load(cmatch) - }() - } - } - } + ggml.OnceLoad() s := make([]device, C.ggml_backend_dev_count()) for i := range s { @@ -116,13 +75,13 @@ var devices = sync.OnceValue(func() []device { }) type Backend struct { - meta *ggml.GGML + meta *fs.GGML cpus, gpus []Context tensors map[string]*Context } func New(r *os.File) (ml.Backend, error) { - meta, n, err := ggml.Decode(r, -1) + meta, n, err := fs.Decode(r, -1) if err != nil { return nil, err } @@ -170,7 +129,7 @@ func New(r *os.File) (ml.Backend, error) { return nil, fmt.Errorf("no devices available") } - tensors := make(map[*ggml.Tensor]*Context, len(meta.Tensors().Items())) + tensors := make(map[*fs.Tensor]*Context, len(meta.Tensors().Items())) for _, t := range meta.Tensors().Items() { c, err := ctxFunc(append(gpus, cpus...)) if err != nil { diff --git a/ml/backend/ggml/ggml/.rsync-filter b/ml/backend/ggml/ggml/.rsync-filter new file mode 100644 index 000000000..147398174 --- /dev/null +++ b/ml/backend/ggml/ggml/.rsync-filter @@ -0,0 +1,21 @@ +include include/ +include src/ +include src/ggml-blas/ +include src/ggml-cpu/ +include src/ggml-cpu/amx/ +include src/ggml-cpu/llamafile/ +include src/ggml-cuda/ +include src/ggml-cuda/template-instances/ +include src/ggml-hip/ +include src/ggml-metal/ +protect **/*.go +protect **/*-embed.* +include **/CMakeLists.txt +include **/*.c +include **/*.h +include **/*.cpp +include **/*.cu +include **/*.cuh +include **/*.m +include **/*.metal +exclude * diff --git 
a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp index b3ac1fa45..ec158dfac 100644 --- a/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp @@ -1,5 +1,3 @@ -#ifdef GGML_USE_BLAS - #include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" @@ -517,5 +515,3 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) { } GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) - -#endif // GGML_USE_BLAS \ No newline at end of file diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go index b0cd99780..55915df98 100644 --- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go +++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go @@ -1,8 +1,10 @@ package cpu // #cgo CXXFLAGS: -std=c++11 -// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/.. -I${SRCDIR}/../../include +// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include +// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE // #cgo linux CPPFLAGS: -D_GNU_SOURCE // #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 // #cgo darwin,arm64 LDFLAGS: -framework Accelerate import "C" +import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu/llamafile" diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go new file mode 100644 index 000000000..09b002ce5 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/llamafile.go @@ -0,0 +1,5 @@ +package llamafile + +// #cgo CXXFLAGS: -std=c++17 +// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../.. -I${SRCDIR}/../../../include +import "C" diff --git a/ml/backend/ggml/ggml/src/ggml.go b/ml/backend/ggml/ggml/src/ggml.go index f554b4550..f8f490484 100644 --- a/ml/backend/ggml/ggml/src/ggml.go +++ b/ml/backend/ggml/ggml/src/ggml.go @@ -3,5 +3,64 @@ package ggml // #cgo CXXFLAGS: -std=c++17 // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU // #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu +// #include <stdlib.h> +// #include "ggml-backend.h" import "C" -import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu" +import ( + "log/slog" + "os" + "path/filepath" + "runtime" + "strings" + "sync" + "unsafe" + + _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu" +) + +var OnceLoad = sync.OnceFunc(func() { + var lib struct{ name, pattern, defaultValue string } + switch runtime.GOOS { + case "darwin": + lib.name = "LD_LIBRARY_PATH" + lib.pattern = "libggml-*.dylib" + lib.defaultValue = "/usr/local/lib:/usr/lib" + case "linux": + lib.name = "LD_LIBRARY_PATH" + lib.pattern = "libggml-*.so" + lib.defaultValue = "/usr/local/lib:/usr/lib" + case "windows": + lib.name = "PATH" + lib.pattern = "ggml-*.dll" + lib.defaultValue = "." + default: + return + } + + paths, ok := os.LookupEnv(lib.name) + if !ok { + paths = lib.defaultValue + } + + for _, path := range filepath.SplitList(paths) { + matches, err := filepath.Glob(filepath.Join(path, lib.pattern)) + if err != nil { + slog.Error("failed to glob", "path", path, "error", err) + continue + } + + for _, match := range matches { + if base := filepath.Base(match); strings.HasPrefix(base, "ggml-base") || + strings.HasPrefix(base, "libggml-base") { + continue + } + + func() { + cmatch := C.CString(match) + defer C.free(unsafe.Pointer(cmatch)) + + C.ggml_backend_load(cmatch) + }() + } + } +})
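
Usage sketch (hypothetical caller, not introduced by this patch): discovery and loading of the ggml-* shared libraries is now centralized in OnceLoad in ml/backend/ggml/ggml/src (package ggml), and both llama.BackendInit and the device enumeration in ml/backend/ggml call it, so sync.OnceFunc guarantees the libraries are registered exactly once regardless of which entry point runs first. A minimal Go caller, assuming the module layout above, could look like:

    package main

    import "github.com/ollama/ollama/llama"

    func main() {
        // BackendInit runs ggml.OnceLoad() before C.llama_backend_init(),
        // so any ggml-* libraries found on PATH / LD_LIBRARY_PATH are
        // registered via ggml_backend_load before the first backend call.
        llama.BackendInit()
    }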