diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..21d687b81 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,49 @@ +cmake_minimum_required(VERSION 3.21) + +project(Ollama C CXX) + +include(CheckLanguage) + +find_package(Threads REQUIRED) + +set(CMAKE_BUILD_TYPE Release) +set(BUILD_SHARED_LIBS ON) + +set(GGML_CCACHE ON) +set(GGML_SCHED_MAX_COPIES 4) +set(GGML_CPU_ALL_VARIANTS ON) +set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128) + +add_compile_definitions(GGML_BUILD) +add_compile_definitions(GGML_SHARED) +add_compile_definitions(GGML_BACKEND_DL) +add_compile_definitions(GGML_BACKEND_SHARED) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx) + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src EXCLUDE_FROM_ALL) + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu) + +find_package(BLAS) +if(NOT BLAS_VENDOR) + set(GGML_BLAS_VENDOR "Generic") +else() + set(GGML_BLAS_VENDOR ${BLAS_VENDOR}) +endif() + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-blas) +target_compile_features(ggml-blas PRIVATE cxx_std_11) + +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda) +endif() + +check_language(HIP) +if(CMAKE_HIP_COMPILER) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip) +endif() diff --git a/Makefile b/Makefile deleted file mode 100644 index 26dc418d2..000000000 --- a/Makefile +++ /dev/null @@ -1,105 +0,0 @@ -# top level makefile for Ollama -include make/common-defs.make - - -# Determine which if any GPU runners we should build -include make/cuda-v11-defs.make -include make/cuda-v12-defs.make -include make/rocm-defs.make - -ifeq ($(CUSTOM_CPU_FLAGS),) -ifneq ($(OS),darwin) -ifeq ($(ARCH),amd64) - RUNNER_TARGETS=cpu -endif -endif -# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present -ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),) -ifneq ($(CUDA_11_COMPILER),) - RUNNER_TARGETS += cuda_v11 -endif -ifneq ($(CUDA_12_COMPILER),) - RUNNER_TARGETS += cuda_v12 -endif -endif -else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected -ifneq ($(CUDA_12_COMPILER),) - RUNNER_TARGETS += cuda_v12 -else ifneq ($(CUDA_11_COMPILER),) - RUNNER_TARGETS += cuda_v11 -endif -endif - -ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),) -ifneq ($(HIP_COMPILER),) - RUNNER_TARGETS += rocm -endif -endif - - -all: runners exe - -dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe - -dist_%: - @$(MAKE) --no-print-directory -f make/Makefile.$* dist - -runners: $(RUNNER_TARGETS) - -$(RUNNER_TARGETS): - @$(MAKE) --no-print-directory -f make/Makefile.$@ - -exe dist_exe: - @$(MAKE) --no-print-directory -f make/Makefile.ollama $@ - -help-sync apply-patches create-patches sync sync-clean: - @$(MAKE) --no-print-directory -f make/Makefile.sync $@ - -test integration lint: - @$(MAKE) --no-print-directory -f make/Makefile.test $@ - -clean: - rm -rf $(BUILD_DIR) $(DIST_LIB_DIR) $(OLLAMA_EXE) $(DIST_OLLAMA_EXE) - go clean -cache - -help: - @echo "The following make targets will help you build Ollama" - @echo "" - @echo " make all # (default target) Build Ollama llm subprocess runners, and the primary ollama executable" - @echo " make runners # 
Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama exectuable" - @echo " make # Build specific runners. Enabled: '$(RUNNER_TARGETS)'" - @echo " make dist # Build the runners and primary ollama executable for distribution" - @echo " make help-sync # Help information on vendor update targets" - @echo " make help-runners # Help information on runner targets" - @echo "" - @echo "The following make targets will help you test Ollama" - @echo "" - @echo " make test # Run unit tests" - @echo " make integration # Run integration tests. You must 'make all' first" - @echo " make lint # Run lint and style tests" - @echo "" - @echo "For more information see 'docs/development.md'" - @echo "" - - -help-runners: - @echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'" - @echo "" - @echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)' (Override with CUSTOM_CPU_FLAGS)" - @echo "" - @echo "# CUDA_PATH sets the location where CUDA toolkits are present" - @echo "CUDA_PATH=$(CUDA_PATH)" - @echo " CUDA_11_PATH=$(CUDA_11_PATH)" - @echo " CUDA_11_COMPILER=$(CUDA_11_COMPILER)" - @echo " CUDA_12_PATH=$(CUDA_12_PATH)" - @echo " CUDA_12_COMPILER=$(CUDA_12_COMPILER)" - @echo "" - @echo "# HIP_PATH sets the location where the ROCm toolkit is present" - @echo "HIP_PATH=$(HIP_PATH)" - @echo " HIP_COMPILER=$(HIP_COMPILER)" - -.PHONY: all exe dist help help-sync help-runners test integration lint runners clean $(RUNNER_TARGETS) - -# Handy debugging for make variables -print-%: - @echo '$*=$($*)' diff --git a/Makefile2 b/Makefile2 deleted file mode 100644 index ce87e912e..000000000 --- a/Makefile2 +++ /dev/null @@ -1,112 +0,0 @@ -export GOOS?=$(shell go env GOOS) -export GOARCH?=$(shell go env GOARCH) - -build: llama/build/$(GOOS)-$(GOARCH) - -export GOFLAGS=-trimpath - -llama/build/%/runners/metal: GOFLAGS+=-tags=metal -llama/build/%/runners/cpu_avx: GOFLAGS+=-tags=avx -llama/build/%/runners/cpu_avx2: GOFLAGS+=-tags=avx2 -llama/build/%/runners/cuda_v11: GOFLAGS+=-tags=cuda,cuda_v11 -llama/build/%/runners/cuda_v12: GOFLAGS+=-tags=cuda,cuda_v12 -llama/build/%/runners/rocm: GOFLAGS+=-tags=cuda,rocm - -.PHONY: llama/build/darwin-amd64 llama/build/darwin-arm64 -llama/build/darwin-amd64: llama/build/darwin-amd64/runners/cpu_avx -llama/build/darwin-arm64: llama/build/darwin-arm64/runners/metal - -.PHONY: llama/build/linux-amd64 llama/build/linux-arm64 -llama/build/linux-amd64: llama/build/linux-amd64/runners/cpu_avx -llama/build/linux-amd64: llama/build/linux-amd64/runners/cpu_avx2 -llama/build/linux-arm64: llama/build/linux-arm64/runners/cpu_avx -llama/build/linux-arm64: llama/build/linux-arm64/runners/cpu_avx2 - -.PHONY: llama/build/windows-amd64 linux/build/windows-arm64 -llama/build/windows-amd64: llama/build/windows-amd64/runners/cpu_avx -llama/build/windows-amd64: llama/build/windows-amd64/runners/cpu_avx2 -llama/build/windows-arm64: llama/build/windows-arm64/runners/cpu_avx -llama/build/windows-arm64: llama/build/windows-arm64/runners/cpu_avx2 - -.PHONY: cuda_v11 cuda_v12 -cuda_v11 cuda_v12 rocm: - $(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $@ - -ifeq ($(GOOS),linux) -NVCC11=$(shell command -v /usr/local/cuda-11/bin/nvcc) -NVCC12=$(shell command -v /usr/local/cuda-12/bin/nvcc) -HIPCC=$(shell command -v hipcc) -else ifeq ($(GOOS),windows) -NVCC11=$(shell ls "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.?\\bin\\nvcc.exe") -NVCC12=$(shell ls "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.?\\bin\\nvcc.exe") 
-HIPCC=$(shell command -v hipcc) -endif - -ifneq ($(NVCC11),) -ifeq ($(OLLAMA_SKIP_GENERATE_CUDA_11),) -CUDA_V11_TARGETS= \ - llama/build/linux-amd64/runners/cuda_v11 \ - llama/build/linux-arm64/runners/cuda_v11 \ - llama/build/windows-amd64/runners/cuda_v11 \ - llama/build/windows-arm64/runners/cuda_v11 \ - -$(CUDA_V11_TARGETS): cuda_v11 -cuda_v11: export NVCC=$(NVCC11) - -llama/build/linux-amd64: llama/build/linux-amd64/runners/cuda_v11 -llama/build/linux-arm64: llama/build/linux-arm64/runners/cuda_v11 -llama/build/windows-amd64: llama/build/windows-amd64/runners/cuda_v11 -llama/build/windows-arm64: llama/build/windows-arm64/runners/cuda_v11 -endif -endif - -ifneq ($(NVCC12),) -ifeq ($(OLLAMA_SKIP_GENERATE_CUDA_12),) -CUDA_V12_TARGETS= \ - llama/build/linux-amd64/runners/cuda_v12 \ - llama/build/linux-arm64/runners/cuda_v12 \ - llama/build/windows-amd64/runners/cuda_v12 \ - llama/build/windows-arm64/runners/cuda_v12 \ - -$(CUDA_V12_TARGETS): cuda_v12 -cuda_v12: export NVCC=$(NVCC12) - -llama/build/linux-amd64: llama/build/linux-amd64/runners/cuda_v12 -llama/build/linux-arm64: llama/build/linux-arm64/runners/cuda_v12 -llama/build/windows-amd64: llama/build/windows-amd64/runners/cuda_v12 -llama/build/windows-arm64: llama/build/windows-arm64/runners/cuda_v12 -endif -endif - -ifneq ($(HIPCC),) -ifeq ($(OLLAMA_SKIP_GENERATE_ROCM),) -ROCM_TARGETS= \ - llama/build/linux-amd64/runners/rocm \ - llama/build/linux-arm64/runners/rocm \ - llama/build/windows-amd64/runners/rocm \ - llama/build/windows-arm64/runners/rocm \ - -$(ROCM_TARGETS): rocm -rocm: export NVCC=$(HIPCC) - -llama/build/linux-amd64: llama/build/linux-amd64/runners/rocm -llama/build/linux-arm64: llama/build/linux-arm64/runners/rocm -llama/build/windows-amd64: llama/build/windows-amd64/runners/rocm -llama/build/windows-arm64: llama/build/windows-arm64/runners/rocm -endif -endif - -export CGO_ENABLED=1 -export CGO_CPPFLAGS_ALLOW=-mfma|-mf16c - -llama/build/%: cmd/runner always - mkdir -p $@; go build -o $@ ./$< - -.PHONY: always -always: - -clean: - $(RM) -r llama/build - -realclean: clean - $(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $< diff --git a/fs/ggml/ggml.go b/fs/ggml/ggml.go index 781fb118e..326b3004e 100644 --- a/fs/ggml/ggml.go +++ b/fs/ggml/ggml.go @@ -134,17 +134,38 @@ func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, } type Tensors struct { - Items []*Tensor + items []*Tensor Offset uint64 } +func (s Tensors) Items(prefix ...string) []*Tensor { + if len(prefix) == 0 { + return s.items + } + + var items []*Tensor + for _, t := range s.items { + if strings.HasPrefix(t.Name, prefix[0]) { + items = append(items, t) + } + } + + return items +} + func (ts Tensors) Layers() map[string]Layer { layers := make(map[string]Layer) - for _, t := range ts.Items { + for _, t := range ts.items { parts := strings.Split(t.Name, ".") - if parts[0] == "blk" { - // join first and second part, e.g. blk.%d - parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...) + if i := slices.Index(parts, "blk"); i > 0 { + parts = append([]string{ + strings.Join(parts[:i], "."), + strings.Join(parts[i:i+2], "."), + }, parts[i+2:]...) + } else if i == 0 { + parts = append([]string{ + strings.Join(parts[i:i+2], "."), + }, parts[i+2:]...) 
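+ // e.g. "blk.0.attn_norm.weight" is grouped under the layer key "blk.0"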
} if _, ok := layers[parts[0]]; !ok { diff --git a/fs/ggml/gguf.go b/fs/ggml/gguf.go index f26b8a0c9..8954cb37d 100644 --- a/fs/ggml/gguf.go +++ b/fs/ggml/gguf.go @@ -111,7 +111,7 @@ func (llm *gguf) KV() KV { func (llm *gguf) Tensors() Tensors { return Tensors{ - Items: llm.tensors, + items: llm.tensors, Offset: llm.tensorOffset, } } diff --git a/llama/llama.go b/llama/llama.go index 54a3c49fc..ab6da37aa 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -47,7 +47,7 @@ import ( "sync/atomic" "unsafe" - _ "github.com/ollama/ollama/ml/backend/ggml/ggml" + _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ) func BackendInit() { diff --git a/llama/patches/0001-cuda.patch b/llama/patches/0001-cuda.patch index 3cddfd6cf..c74885269 100644 --- a/llama/patches/0001-cuda.patch +++ b/llama/patches/0001-cuda.patch @@ -1,42 +1,58 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From 702ee500b229e910e3e6cd3c84d87763c51fb411 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Thu, 6 Jun 2024 23:55:47 -0700 -Subject: [PATCH] cuda +Subject: [PATCH 01/11] cuda --- - ggml/src/ggml-backend.cpp | 5 +++++ - ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++++ - 2 files changed, 9 insertions(+) + ggml/src/ggml-backend.cpp | 2 +- + ggml/src/ggml-cuda/ggml-cuda.cu | 1 + + ggml/src/ggml-metal/ggml-metal.m | 1 + + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp -index fdb4b986..9b80fe07 100644 +index fdb4b986..731e4078 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp -@@ -106,7 +106,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { +@@ -106,7 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } -+ -+// TODO: this needs to be freed in cuda and hip backends because -+// the cuda backend implementation compiled with msvc -+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP) - delete buffer; -+#endif +- delete buffer; } size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { -diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index d6e4bfdd..52aec229 100644 ---- a/ggml/src/ggml-cuda/ggml-cuda.cu -+++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context { - static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - delete ctx; -+ -+ // TODO: this needs to be freed in cuda and hipblas backends because -+ // the cuda backend implementation compiled with msvc +@@ -1862,6 +1861,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { + + static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_aligned_free(buffer->context, buffer->size); + free(buffer); } + static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index d6e4bfdd..a2fcfe5d 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -424,6 +424,7 @@ struct ggml_backend_cuda_buffer_context { + static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + delete ctx; ++ 
delete buffer; + } + static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { +diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m +index 093ae900..a0cf4ba4 100644 +--- a/ggml/src/ggml-metal/ggml-metal.m ++++ b/ggml/src/ggml-metal/ggml-metal.m +@@ -4035,6 +4035,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) + } + + free(ctx); ++ free(buffer); + } + + static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +-- +2.46.0 + diff --git a/llama/patches/0002-pretokenizer.patch b/llama/patches/0002-pretokenizer.patch index c87d1e1a6..72e4b268c 100644 --- a/llama/patches/0002-pretokenizer.patch +++ b/llama/patches/0002-pretokenizer.patch @@ -1,7 +1,7 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From 67eb186ccf062100835d413b1c3e2a0fc58e1c0f Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Sep 2024 15:53:13 -0700 -Subject: [PATCH] pretokenizer +Subject: [PATCH 02/11] pretokenizer --- src/llama.cpp | 14 +++----------- @@ -39,3 +39,6 @@ index 6a6f4c2a..fa09f3b3 100644 } } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; +-- +2.46.0 + diff --git a/llama/patches/0003-embeddings.patch b/llama/patches/0003-embeddings.patch index 996f8dbe6..74832a2ec 100644 --- a/llama/patches/0003-embeddings.patch +++ b/llama/patches/0003-embeddings.patch @@ -1,7 +1,7 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From a9a7820ae111d70e24d4f7004378b5321e8a29c7 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Sep 2024 15:53:14 -0700 -Subject: [PATCH] embeddings +Subject: [PATCH 03/11] embeddings --- src/llama.cpp | 9 ++++++--- @@ -45,3 +45,6 @@ index fa09f3b3..d1791af0 100644 // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); +-- +2.46.0 + diff --git a/llama/patches/0004-clip-unicode.patch b/llama/patches/0004-clip-unicode.patch index 13e945c37..73bde706b 100644 --- a/llama/patches/0004-clip-unicode.patch +++ b/llama/patches/0004-clip-unicode.patch @@ -1,7 +1,7 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From aa5ad04094458943643df789c5b7fd7d4c68dafb Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Sep 2024 15:53:15 -0700 -Subject: [PATCH] clip-unicode +Subject: [PATCH 04/11] clip-unicode --- examples/llava/clip.cpp | 40 +++++++++++++++++++++++++++++++++++++++- @@ -74,3 +74,6 @@ index d7c94352..427d5e02 100644 } // vision model +-- +2.46.0 + diff --git a/llama/patches/0005-solar-pro.patch b/llama/patches/0005-solar-pro.patch index 35b8c55d8..f69ed943d 100644 --- a/llama/patches/0005-solar-pro.patch +++ b/llama/patches/0005-solar-pro.patch @@ -1,7 +1,7 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From 226de4f71ce73a87a805dc83484b32f9f9d9c24d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 16 Sep 2024 15:53:16 -0700 -Subject: [PATCH] solar-pro +Subject: [PATCH 05/11] solar-pro solar-pro introduces block skip connections where blocks are connected to other, non-sequential blocks with a scale multiple @@ -404,3 +404,6 @@ index d1791af0..b01770d0 100644 return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 +-- +2.46.0 + diff --git a/llama/patches/0006-conditional-fattn.patch b/llama/patches/0006-conditional-fattn.patch index 57211a8d9..c80864f17 100644 --- 
a/llama/patches/0006-conditional-fattn.patch +++ b/llama/patches/0006-conditional-fattn.patch @@ -1,17 +1,17 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From b9d893b5c7c3dcff42bce378ea26587a6c7d1113 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 9 Oct 2024 17:26:23 -0700 -Subject: [PATCH] conditional-fattn +Subject: [PATCH 06/11] conditional-fattn --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index 52aec229..cbf4fddf 100644 +index a2fcfe5d..5eed90da 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2159,9 +2159,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; @@ -23,3 +23,6 @@ index 52aec229..cbf4fddf 100644 case GGML_OP_CROSS_ENTROPY_LOSS: ggml_cuda_cross_entropy_loss(ctx, dst); break; +-- +2.46.0 + diff --git a/llama/patches/0007-blas.patch b/llama/patches/0007-blas.patch index 121a1cd95..d0c3eed22 100644 --- a/llama/patches/0007-blas.patch +++ b/llama/patches/0007-blas.patch @@ -1,7 +1,7 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From 9a5a9479d9cdf2032ff989fd297e50490f53e4c2 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 30 Sep 2024 16:31:04 -0700 -Subject: [PATCH] blas +Subject: [PATCH 07/11] blas --- ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++ @@ -24,3 +24,6 @@ index ec158dfa..b3ac1fa4 100644 + +#endif // GGML_USE_BLAS \ No newline at end of file +-- +2.46.0 + diff --git a/llama/patches/0008-add-mllama-support.patch b/llama/patches/0008-add-mllama-support.patch index ae8b80177..4ed259fac 100644 --- a/llama/patches/0008-add-mllama-support.patch +++ b/llama/patches/0008-add-mllama-support.patch @@ -1,7 +1,7 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From c2f0b1c0eda94eea785a1de9098df9eb29d64eb5 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Thu, 17 Oct 2024 15:18:22 -0700 -Subject: [PATCH] add mllama support +Subject: [PATCH 08/11] add mllama support mllama adds cross-attention layers to the standard llama architecture it also requires a way to input a new tensor: cross_attention_state @@ -784,3 +784,6 @@ index b01770d0..46881642 100644 } else { batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc); } +-- +2.46.0 + diff --git a/llama/patches/0009-add-unpad-operator.patch b/llama/patches/0009-add-unpad-operator.patch index f7fd86737..470b8b427 100644 --- a/llama/patches/0009-add-unpad-operator.patch +++ b/llama/patches/0009-add-unpad-operator.patch @@ -1,7 +1,7 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From 8e07a88fa87f31b6f2245c02a89a4a367ed6013c Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 17 Oct 2024 17:19:25 -0700 -Subject: [PATCH] add unpad operator +Subject: [PATCH 09/11] add unpad operator --- ggml/include/ggml.h | 10 +++++ @@ -125,10 +125,10 @@ index 23ae2e10..111ff3b0 100644 case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu -index cbf4fddf..9ca6cb77 100644 +index 5eed90da..053e392a 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu -@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2082,6 
+2082,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; @@ -138,7 +138,7 @@ index cbf4fddf..9ca6cb77 100644 case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; -@@ -3012,6 +3015,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g +@@ -3009,6 +3012,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_GROUP_NORM: case GGML_OP_UPSCALE: case GGML_OP_PAD: @@ -210,10 +210,10 @@ index 8fd386b0..e2ededc3 100644 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m -index 093ae900..cb9a1307 100644 +index a0cf4ba4..346dfb5b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m -@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte +@@ -310,6 +310,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, @@ -221,7 +221,7 @@ index 093ae900..cb9a1307 100644 GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, -@@ -877,6 +878,7 @@ @implementation GGMLMetalClass +@@ -877,6 +878,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); @@ -394,3 +394,6 @@ index 1a9a7efa..ea2b259b 100644 // ggml_arange struct ggml_tensor * ggml_arange( +-- +2.46.0 + diff --git a/llama/patches/0010-fix-deepseek-deseret-regex.patch b/llama/patches/0010-fix-deepseek-deseret-regex.patch index 9ea501d06..5e8a2e216 100644 --- a/llama/patches/0010-fix-deepseek-deseret-regex.patch +++ b/llama/patches/0010-fix-deepseek-deseret-regex.patch @@ -1,7 +1,7 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From 4236c07fc90fb758b89921fa7ef39dc0482c4bea Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 25 Oct 2024 16:25:18 -0700 -Subject: [PATCH] fix deepseek deseret regex +Subject: [PATCH 10/11] fix deepseek deseret regex On windows compiled with gcc the c++ regex library failed to handle the characters @@ -70,3 +70,6 @@ index 3d459263..51dd81fb 100644 } static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) { +-- +2.46.0 + diff --git a/llama/patches/0011-Maintain-ordering-for-rules-for-grammar.patch b/llama/patches/0011-Maintain-ordering-for-rules-for-grammar.patch new file mode 100644 index 000000000..ccb6fce96 --- /dev/null +++ b/llama/patches/0011-Maintain-ordering-for-rules-for-grammar.patch @@ -0,0 +1,25 @@ +From 7752556d7922e92b455ed92d22a3bfa9725f4458 Mon Sep 17 00:00:00 2001 +From: ParthSareen +Date: Wed, 11 Dec 2024 15:37:32 -0800 +Subject: [PATCH 11/11] Maintain ordering for rules for grammar + +--- + common/json-schema-to-grammar.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp +index dadc18c8..2a8dbd22 100644 +--- a/common/json-schema-to-grammar.cpp ++++ b/common/json-schema-to-grammar.cpp +@@ -391,7 +391,7 
@@ class SchemaConverter { + private: + std::function _fetch_json; + bool _dotall; +- std::map _rules; ++ std::unordered_map _rules; + std::unordered_map _refs; + std::unordered_set _refs_being_resolved; + std::vector _errors; +-- +2.46.0 + diff --git a/llama/patches/0011-relative-include-paths.patch b/llama/patches/0011-relative-include-paths.patch deleted file mode 100644 index c25518d48..000000000 --- a/llama/patches/0011-relative-include-paths.patch +++ /dev/null @@ -1,64 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: jmorganca -Date: Tue, 3 Dec 2024 21:30:51 -0800 -Subject: [PATCH] relative include paths - ---- - ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +- - ggml/src/ggml-cpu/ggml-cpu.c | 2 +- - ggml/src/ggml-cpu/ggml-cpu.cpp | 2 +- - ggml/src/ggml-quants.c | 2 +- - 4 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c -index 11152385..bbf8934e 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c -+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c -@@ -4,7 +4,7 @@ - #include "ggml-quants.h" - #include "ggml-impl.h" - #include "ggml-cpu.h" --#include "ggml-cpu/ggml-cpu-impl.h" -+#include "ggml-cpu-impl.h" - - #include - #include -diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c -index 111ff3b0..df0bd3c6 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu.c -+++ b/ggml/src/ggml-cpu/ggml-cpu.c -@@ -10,7 +10,7 @@ - #include "ggml-quants.h" - #include "ggml-cpu-quants.h" - #include "ggml-threading.h" --#include "amx/amx.h" -+#include "amx.h" - #include "ggml.h" - - #if defined(_MSC_VER) || defined(__MINGW32__) -diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp -index 77e5d87a..91476ad0 100644 ---- a/ggml/src/ggml-cpu/ggml-cpu.cpp -+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp -@@ -3,7 +3,7 @@ - #include "ggml-cpu.h" - #include "ggml-cpu-aarch64.h" - #include "ggml-impl.h" --#include "amx/amx.h" -+#include "amx.h" - #include - #include - #include -diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c -index 7301a9c6..49ab3daf 100644 ---- a/ggml/src/ggml-quants.c -+++ b/ggml/src/ggml-quants.c -@@ -3,7 +3,7 @@ - - #include "ggml-quants.h" - #include "ggml-impl.h" --#include "ggml-cpu/ggml-cpu-impl.h" -+#include "ggml-cpu-impl.h" - #include "ggml-cpu.h" - - #include diff --git a/ml/backend/ggml/ggml.go b/ml/backend/ggml/ggml.go index ebaafb8d3..1fb057eee 100644 --- a/ml/backend/ggml/ggml.go +++ b/ml/backend/ggml/ggml.go @@ -10,74 +10,195 @@ import "C" import ( "bytes" + "encoding/binary" "fmt" "io" "log/slog" "os" + "path/filepath" + "runtime" + "strings" + "sync" "unsafe" - "golang.org/x/sync/errgroup" - "github.com/ollama/ollama/format" "github.com/ollama/ollama/fs/ggml" "github.com/ollama/ollama/ml" + "golang.org/x/sync/errgroup" - _ "github.com/ollama/ollama/ml/backend/ggml/ggml" + _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src" ) -func newCPUBackend() *C.struct_ggml_backend { - return C.ggml_backend_cpu_init() +type device struct { + d *C.struct_ggml_backend_device } -type Backend struct { - c *C.struct_ggml_context - b *C.struct_ggml_backend - bb *C.struct_ggml_backend_buffer +func (d device) name() string { + return C.GoString(C.ggml_backend_dev_name(d.d)) +} - ggml.KV - ggml.Tensors +func (d device) kind() string { + switch C.ggml_backend_dev_type(d.d) { + case C.GGML_BACKEND_DEVICE_TYPE_CPU: + return "cpu" + case C.GGML_BACKEND_DEVICE_TYPE_GPU: + return "gpu" + case C.GGML_BACKEND_DEVICE_TYPE_ACCEL: + return "accel" + default: 
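+ // unrecognized device types are reported as "unknown"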
+ return "unknown" + } +} + +func (d device) memory() (total uint64, free uint64) { + C.ggml_backend_dev_memory(d.d, (*C.size_t)(&free), (*C.size_t)(&total)) + return +} + +func (d device) LogValue() slog.Value { + free, total := d.memory() + return slog.GroupValue( + slog.String("name", C.GoString(C.ggml_backend_dev_name(d.d))), + slog.String("description", C.GoString(C.ggml_backend_dev_description(d.d))), + slog.String("kind", d.kind()), + slog.String("free", format.HumanBytes2(free)), + slog.String("total", format.HumanBytes2(total)), + ) +} + +var devices = sync.OnceValue(func() []device { + var lib struct{ name, pattern, defaultValue string } + if runtime.GOOS == "windows" { + lib.name = "PATH" + lib.pattern = "ggml-*.dll" + lib.defaultValue = "." + } else if runtime.GOOS == "linux" { + lib.name = "LD_LIBRARY_PATH" + lib.pattern = "libggml-*.so" + lib.defaultValue = "/usr/local/lib:/usr/lib" + } + + if lib.name != "" { + paths, ok := os.LookupEnv(lib.name) + if !ok { + paths = lib.defaultValue + } + + for _, path := range filepath.SplitList(paths) { + matches, err := filepath.Glob(filepath.Join(path, lib.pattern)) + if err != nil { + slog.Error("failed to glob", "path", path, "error", err) + continue + } + + for _, match := range matches { + if base := filepath.Base(match); strings.HasPrefix(base, "ggml-base") || + strings.HasPrefix(base, "libggml-base") { + continue + } + + func() { + cmatch := C.CString(match) + defer C.free(unsafe.Pointer(cmatch)) + + C.ggml_backend_load(cmatch) + }() + } + } + } + + s := make([]device, C.ggml_backend_dev_count()) + for i := range s { + s[i] = device{C.ggml_backend_dev_get(C.size_t(i))} + } + + return s +}) + +type Backend struct { + meta *ggml.GGML + cpus, gpus []Context + tensors map[string]*Context } func New(r *os.File) (ml.Backend, error) { - f, _, err := ggml.Decode(r, -1) + meta, n, err := ggml.Decode(r, -1) if err != nil { return nil, err } slog.Info( "", - "architecture", f.KV().Architecture(), - "file_type", f.KV().FileType(), - "name", f.KV().String("general.name"), - "description", f.KV().String("general.description"), - "num_tensors", len(f.Tensors().Items), - "num_key_values", len(f.KV()), + "architecture", meta.KV().Architecture(), + "file_type", meta.KV().FileType(), + "name", meta.KV().String("general.name"), + "description", meta.KV().String("general.description"), + "num_tensors", len(meta.Tensors().Items()), + "num_key_values", len(meta.KV()), ) - c := C.ggml_init(C.struct_ggml_init_params{ - mem_size: C.size_t(len(f.Tensors().Items)) * C.ggml_tensor_overhead(), - mem_buffer: nil, - no_alloc: true, - }) + var cpus, gpus []Context + for _, d := range devices() { + switch C.ggml_backend_dev_type(d.d) { + case C.GGML_BACKEND_DEVICE_TYPE_CPU, + C.GGML_BACKEND_DEVICE_TYPE_ACCEL: + slog.Info("cpu", "device", d) + cpus = append(cpus, Context{ + ctx: C.ggml_init(C.struct_ggml_init_params{ + mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)), + no_alloc: true, + }), + backend: C.ggml_backend_dev_init(d.d, nil), + }) + case C.GGML_BACKEND_DEVICE_TYPE_GPU: + slog.Info("gpu", "device", d) + gpus = append(gpus, Context{ + ctx: C.ggml_init(C.struct_ggml_init_params{ + mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)), + no_alloc: true, + }), + backend: C.ggml_backend_dev_init(d.d, nil), + }) + } + } + + ctxFunc := func(s []Context) (*Context, error) { + for _, e := range s { + return &e, nil + } + + return 
nil, fmt.Errorf("no devices available") + } + + tensors := make(map[*ggml.Tensor]*Context, len(meta.Tensors().Items())) + for _, t := range meta.Tensors().Items() { + c, err := ctxFunc(append(gpus, cpus...)) + if err != nil { + return nil, err + } - for _, t := range f.Tensors().Items { func() { + tt := C.ggml_new_tensor(c.ctx, t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0]))) + cname := C.CString(t.Name) defer C.free(unsafe.Pointer(cname)) - - tt := C.ggml_new_tensor(c, t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0]))) C.ggml_set_name(tt, cname) + + tensors[t] = c }() } - b := newBackend() - bb := C.ggml_backend_alloc_ctx_tensors(c, b) + for _, b := range append(gpus, cpus...) { + C.ggml_backend_alloc_ctx_tensors(b.ctx, b.backend) + } + + sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset)) var g errgroup.Group - for _, t := range f.Tensors().Items { + for t, c := range tensors { g.Go(func() error { var b bytes.Buffer - n, err := io.Copy(&b, io.NewSectionReader(r, int64(f.Tensors().Offset+t.Offset), int64(t.Size()))) + n, err := io.Copy(&b, io.NewSectionReader(sr, int64(t.Offset), int64(t.Size()))) if err != nil { return err } @@ -89,10 +210,12 @@ func New(r *os.File) (ml.Backend, error) { cname := C.CString(t.Name) defer C.free(unsafe.Pointer(cname)) + tt := C.ggml_get_tensor(c.ctx, cname) + cbytes := C.CBytes(b.Bytes()) defer C.free(cbytes) - C.ggml_backend_tensor_set(C.ggml_get_tensor(c, cname), cbytes, 0, C.size_t(n)) + C.ggml_backend_tensor_set(tt, cbytes, 0, C.size_t(n)) return nil }) } @@ -101,7 +224,11 @@ func New(r *os.File) (ml.Backend, error) { return nil, err } - return &Backend{c, b, bb, f.KV(), f.Tensors()}, nil + return &Backend{ + meta: meta, + cpus: cpus, + gpus: gpus, + }, nil } func init() { @@ -109,55 +236,78 @@ func init() { } func (b *Backend) Config() ml.Config { - return b.KV + return b.meta.KV() } func (b *Backend) Get(name string) ml.Tensor { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) - if t := C.ggml_get_tensor(b.c, cname); t != nil { - return &Tensor{t} + + for _, c := range append(b.gpus, b.cpus...) { + if t := C.ggml_get_tensor(c.ctx, cname); t != nil { + return &Tensor{t: t} + } } return nil } func (b *Backend) NewContext() ml.Context { - n := max(8192, len(b.Tensors.Items)*5) - bts := make([]byte, C.size_t(n)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(n), false)) + nodes := max(8192, len(b.meta.Tensors().Items())*5) + bts := make([]byte, C.size_t(nodes)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(nodes), false)) c := C.ggml_init(C.struct_ggml_init_params{ mem_buffer: unsafe.Pointer(&bts[0]), mem_size: C.size_t(len(bts)), no_alloc: true, }) + + backends := make([]*C.struct_ggml_backend, len(b.gpus)+len(b.cpus)) + bufts := make([]*C.struct_ggml_backend_buffer_type, len(b.gpus)+len(b.cpus)) + for i, c := range append(b.gpus, b.cpus...) 
{ + backends[i] = c.backend + bufts[i] = C.ggml_backend_get_default_buffer_type(c.backend) + } + return &Context{ - b: b.b, - c: c, - g: C.ggml_new_graph_custom(c, C.size_t(n), false), + ctx: c, + backend: backends[0], + nodes: nodes, + sched: C.ggml_backend_sched_new( + (*C.ggml_backend_t)(unsafe.Pointer(&backends[0])), + (*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])), + C.int(len(backends)), + C.size_t(nodes), + true, + ), } } type Context struct { - b *C.struct_ggml_backend - c *C.struct_ggml_context - g *C.struct_ggml_cgraph + ctx *C.struct_ggml_context + backend *C.struct_ggml_backend + + sched *C.struct_ggml_backend_sched + graph *C.struct_ggml_cgraph + nodes int } func (c *Context) Forward(t ml.Tensor) { - C.ggml_build_forward_expand(c.g, t.(*Tensor).t) + if c.graph == nil { + c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.nodes), false) + } + + C.ggml_build_forward_expand(c.graph, t.(*Tensor).t) } func (c *Context) Compute(t ml.Tensor) ml.Tensor { c.Forward(t) + C.ggml_backend_sched_graph_compute_async(c.sched, c.graph) - a := C.ggml_gallocr_new(C.ggml_backend_get_default_buffer_type(c.b)) - C.ggml_gallocr_alloc_graph(a, c.g) - slog.Debug("compute graph memory", "require", format.HumanBytes2(uint64(C.ggml_gallocr_get_buffer_size(a, 0)))) + backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t) - C.ggml_backend_graph_compute(c.b, c.g) - return &Tensor{ - C.ggml_graph_node(c.g, C.ggml_graph_n_nodes(c.g)-1), - } + t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t)) + C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t)) + return t } func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor { @@ -174,17 +324,17 @@ func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor { var t *C.struct_ggml_tensor switch dtype { case ml.DTypeF32: - t = C.ggml_new_tensor(c.c, C.GGML_TYPE_F32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0]))) + t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0]))) case ml.DTypeI32: - t = C.ggml_new_tensor(c.c, C.GGML_TYPE_I32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0]))) + t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_I32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0]))) default: panic("unsupported dtype") } - b := C.ggml_backend_alloc_buffer(c.b, C.ggml_nbytes(t)) + b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t)) C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b)) - C.ggml_set_f32(t, 0.) 
- return &Tensor{t} + C.ggml_set_zero(t) + return &Tensor{t: t} } func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) { @@ -197,11 +347,11 @@ func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype u return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s)) } - t := C.ggml_new_tensor(ctx.c, dtype, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0]))) - b := C.ggml_backend_alloc_buffer(ctx.b, C.ggml_nbytes(t)) + t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0]))) + b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t)) C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b)) C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t)) - return &Tensor{t}, nil + return &Tensor{t: t}, nil } func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) { @@ -213,12 +363,14 @@ func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) { } func (c *Context) Close() error { - C.ggml_free(c.c) + C.ggml_backend_sched_free(c.sched) + C.ggml_free(c.ctx) return nil } type Tensor struct { - t *C.struct_ggml_tensor + t *C.struct_ggml_tensor + data []byte } func (t *Tensor) LogValue() slog.Value { @@ -254,17 +406,13 @@ func (t *Tensor) Bytes() []byte { return nil } -func (t *Tensor) Floats() []float32 { - if s := C.ggml_get_data_f32(t.t); s != nil { - f32s := make([]float32, C.ggml_nelements(t.t)) - for i, v := range unsafe.Slice(s, C.ggml_nelements(t.t)) { - f32s[i] = float32(v) - } - - return f32s +func (t *Tensor) Floats() (f32s []float32) { + if t.data != nil { + f32s = make([]float32, C.ggml_nelements(t.t)) + _ = binary.Read(bytes.NewReader(t.data), binary.LittleEndian, f32s) } - return nil + return } func (t *Tensor) DType() ml.DType { @@ -280,7 +428,7 @@ func (t *Tensor) DType() ml.DType { func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ - C.ggml_add(ctx.(*Context).c, t.t, t2.(*Tensor).t), + t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t), } } @@ -294,37 +442,37 @@ func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor { func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor { return &Tensor{ - C.ggml_concat(ctx.(*Context).c, t.t, t2.(*Tensor).t, C.int(dim)), + t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)), } } func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor { return &Tensor{ - C.ggml_cont(ctx.(*Context).c, t.t), + t: C.ggml_cont(ctx.(*Context).ctx, t.t), } } func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ - C.ggml_mul(ctx.(*Context).c, t.t, t2.(*Tensor).t), + t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t), } } func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ - C.ggml_mul_mat(ctx.(*Context).c, t.t, t2.(*Tensor).t), + t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t), } } func (t *Tensor) Norm(ctx ml.Context, eps float32) ml.Tensor { return &Tensor{ - C.ggml_norm(ctx.(*Context).c, t.t, (C.float)(eps)), + t: C.ggml_norm(ctx.(*Context).ctx, t.t, (C.float)(eps)), } } func (t *Tensor) RMSNorm(ctx ml.Context, eps float32) ml.Tensor { return &Tensor{ - C.ggml_rms_norm(ctx.(*Context).c, t.t, C.float(eps)), + t: C.ggml_rms_norm(ctx.(*Context).ctx, t.t, C.float(eps)), } } @@ -334,7 +482,7 @@ func (t *Tensor) Pad(ctx ml.Context, shape ...int64) ml.Tensor { } return &Tensor{ - C.ggml_pad(ctx.(*Context).c, t.t, 
C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), + t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), } } @@ -344,19 +492,19 @@ func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor { } return &Tensor{ - C.ggml_permute(ctx.(*Context).c, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), + t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), } } func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ - C.ggml_get_rows(ctx.(*Context).c, t.t, t2.(*Tensor).t), + t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t), } } func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return &Tensor{ - C.ggml_cpy(ctx.(*Context).c, t.t, t2.(*Tensor).t), + t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t), } } @@ -364,19 +512,19 @@ func (t *Tensor) Reshape(ctx ml.Context, shape ...int64) ml.Tensor { switch len(shape) { case 1: return &Tensor{ - C.ggml_reshape_1d(ctx.(*Context).c, t.t, C.int64_t(shape[0])), + t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])), } case 2: return &Tensor{ - C.ggml_reshape_2d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])), + t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])), } case 3: return &Tensor{ - C.ggml_reshape_3d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])), + t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])), } case 4: return &Tensor{ - C.ggml_reshape_4d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])), + t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])), } default: panic("unsupported number of dimensions") @@ -385,19 +533,19 @@ func (t *Tensor) Reshape(ctx ml.Context, shape ...int64) ml.Tensor { func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor { return &Tensor{ - C.ggml_scale(ctx.(*Context).c, t.t, (C.float)(s)), + t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)), } } func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor { return &Tensor{ - C.ggml_soft_max(ctx.(*Context).c, t.t), + t: C.ggml_soft_max(ctx.(*Context).ctx, t.t), } } func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor { return &Tensor{ - C.ggml_tanh_inplace(ctx.(*Context).c, t.t), + t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t), } } @@ -407,7 +555,7 @@ func (t *Tensor) Unpad(ctx ml.Context, shape ...int64) ml.Tensor { } return &Tensor{ - C.ggml_unpad(ctx.(*Context).c, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), + t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])), } } @@ -415,25 +563,25 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor { switch len(shape) { case 1: return &Tensor{ - C.ggml_view_1d(ctx.(*Context).c, t.t, C.int64_t(shape[0]), C.size_t(offset)), + t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)), } case 3: return &Tensor{ - C.ggml_view_2d(ctx.(*Context).c, t.t, + t: C.ggml_view_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[2]), C.size_t(shape[1]), C.size_t(offset)), } case 5: return &Tensor{ - C.ggml_view_3d(ctx.(*Context).c, t.t, + t: C.ggml_view_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), 
C.int64_t(shape[2]), C.int64_t(shape[4]), C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(offset)), } case 7: return &Tensor{ - C.ggml_view_4d(ctx.(*Context).c, t.t, + t: C.ggml_view_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]), C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]), C.size_t(offset)), @@ -449,8 +597,8 @@ const ( func (t *Tensor) Rope(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor { return &Tensor{ - C.ggml_rope_ext( - ctx.(*Context).c, t.t, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t, + t: C.ggml_rope_ext( + ctx.(*Context).ctx, t.t, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t, C.int(ropeDim), 131072, // YaRN n_ctx_train ropeTypeNorm, // ROPE_TYPE_NORM @@ -466,18 +614,18 @@ func (t *Tensor) Rope(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi func (t *Tensor) GELU(ctx ml.Context) ml.Tensor { return &Tensor{ - C.ggml_gelu_inplace(ctx.(*Context).c, t.t), + t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t), } } func (t *Tensor) SILU(ctx ml.Context) ml.Tensor { return &Tensor{ - C.ggml_silu_inplace(ctx.(*Context).c, t.t), + t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t), } } func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor { return &Tensor{ - C.ggml_conv_2d(ctx.(*Context).c, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)), + t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)), } } diff --git a/ml/backend/ggml/ggml/ggml-blas/blas.go b/ml/backend/ggml/ggml/ggml-blas/blas.go deleted file mode 100644 index 75b75d890..000000000 --- a/ml/backend/ggml/ggml/ggml-blas/blas.go +++ /dev/null @@ -1,3 +0,0 @@ -package blas - -import "C" diff --git a/ml/backend/ggml/ggml/ggml-cpu/amx/amx.go b/ml/backend/ggml/ggml/ggml-cpu/amx/amx.go deleted file mode 100644 index b55d2ca7d..000000000 --- a/ml/backend/ggml/ggml/ggml-cpu/amx/amx.go +++ /dev/null @@ -1,5 +0,0 @@ -package amx - -// #cgo CXXFLAGS: -std=c++11 -// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../.. -I${SRCDIR}/../../include -import "C" diff --git a/ml/backend/ggml/ggml/ggml-cpu/cpu.go b/ml/backend/ggml/ggml/ggml-cpu/cpu.go deleted file mode 100644 index a8e12a28d..000000000 --- a/ml/backend/ggml/ggml/ggml-cpu/cpu.go +++ /dev/null @@ -1,13 +0,0 @@ -package cpu - -// #cgo CXXFLAGS: -std=c++11 -// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../include -I${SRCDIR}/amx -// #cgo CPPFLAGS: -D_GNU_SOURCE -// #cgo amd64,avx CPPFLAGS: -mavx -// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -mf16c -// #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -import "C" -import ( - _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cpu/amx" - _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cpu/llamafile" -) diff --git a/ml/backend/ggml/ggml/ggml-cpu/llamafile/llamafile.go b/ml/backend/ggml/ggml/ggml-cpu/llamafile/llamafile.go deleted file mode 100644 index 7e5f18b85..000000000 --- a/ml/backend/ggml/ggml/ggml-cpu/llamafile/llamafile.go +++ /dev/null @@ -1,9 +0,0 @@ -package llamafile - -// #cgo CXXFLAGS: -std=c++11 -// #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../.. 
-I${SRCDIR}/../../include -// #cgo amd64,avx CPPFLAGS: -mavx -// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -// #cgo amd64,f16c CPPFLAGS: -mf16c -// #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -import "C" diff --git a/ml/backend/ggml/ggml/ggml-cuda/.gitignore b/ml/backend/ggml/ggml/ggml-cuda/.gitignore deleted file mode 100644 index 5761abcfd..000000000 --- a/ml/backend/ggml/ggml/ggml-cuda/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.o diff --git a/ml/backend/ggml/ggml/ggml-cuda/Makefile b/ml/backend/ggml/ggml/ggml-cuda/Makefile deleted file mode 100644 index 170eaf7d2..000000000 --- a/ml/backend/ggml/ggml/ggml-cuda/Makefile +++ /dev/null @@ -1,64 +0,0 @@ -NVCC?=nvcc - -NVCC_PREPEND_FLAGS= \ - -t 2 \ - -split-compile 0 \ - -std=c++17 \ - -I.. \ - -I../include \ - $(foreach ARCH,$(subst ;, ,$(CUDA_ARCHS)),--generate-code=arch=compute_$(ARCH),code=sm_$(ARCH)) \ - -NVCC_APPEND_FLAGS= \ - # -DGGML_CUDA_USE_GRAPHS=1 \ - -ALL_SOURCES=$(wildcard *.cu) -FATTN_SOURCES=$(wildcard fattn*.cu) - -SOURCES= \ - $(filter-out $(FATTN_SOURCES),$(ALL_SOURCES)) \ - $(wildcard template-instances/mmq*.cu) \ - -ifneq ($(OLLAMA_FAST_BUILD),) -NVCC_APPEND_FLAGS+=-DGGML_DISABLE_FLASH_ATTN -else -SOURCES+= \ - $(FATTN_SOURCES) \ - $(wildcard template-instances/fattn-wmma*.cu) \ - $(wildcard template-instances/fattn-vec*q4_0-q4_0.cu) \ - $(wildcard template-instances/fattn-vec*q8_0-q8_0.cu) \ - $(wildcard template-instances/fattn-vec*f16-f16.cu) -endif - -all: cuda_v11 cuda_v12 - -cuda_v11: CUDA_ARCHS?=50;52;53;60;61;62;70;72;75;80;86 -cuda_v11: OBJECTS=$(patsubst %.cu,%.v11.o,$(SOURCES)) - -cuda_v12: CUDA_ARCHS?=60;61;62;70;72;75;80;86;87;89;90;90a -cuda_v12: OBJECTS=$(patsubst %.cu,%.v12.o,$(SOURCES)) - -rocm: CPPFLAGS+=-DGGML_USE_HIP -rocm: OBJECTS=$(patsubst %.cu,%.rocm.o,$(SOURCES)) - -ifeq ($(OS),Windows_NT) -cuda_v11: libggml_cuda_v11.dll -cuda_v12: libggml_cuda_v12.dll -rocm: libggml_rocm.dll -else -cuda_v11: libggml_cuda_v11.a -cuda_v12: libggml_cuda_v12.a -rocm: libggml_rocm.a -endif - -clean: - $(RM) *.a *.o template-instances/*.o - -%.v11.o %.v12.o %.rocm.o: %.cu - $(NVCC) $(NVCC_PREPEND_FLAGS) -c $< -o $@ $(NVCC_APPEND_FLAGS) - -.SECONDEXPANSION: -%.a: $$(OBJECTS) - $(AR) rcs $@ $^ - -%.dll: $$(OBJECTS) - $(NVCC) -shared -o $@ $^ diff --git a/ml/backend/ggml/ggml/ggml-cuda/cuda.go b/ml/backend/ggml/ggml/ggml-cuda/cuda.go deleted file mode 100644 index 529e0e388..000000000 --- a/ml/backend/ggml/ggml/ggml-cuda/cuda.go +++ /dev/null @@ -1,7 +0,0 @@ -package cuda - -// #cgo cuda_v11 LDFLAGS: -L. -lggml_cuda_v11 -// #cgo cuda_v12 LDFLAGS: -L. -lggml_cuda_v12 -// #cgo cuda_v11 cuda_v12 LDFLAGS: -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublasLt -// #cgo rocm LDFLAGS: -L. -lggml_rocm -L/opt/rocm/lib -lhipblas -lamdhip64 -lrocblas -import "C" diff --git a/ml/backend/ggml/ggml/ggml-metal/metal.go b/ml/backend/ggml/ggml/ggml-metal/metal.go deleted file mode 100644 index 74166ad7f..000000000 --- a/ml/backend/ggml/ggml/ggml-metal/metal.go +++ /dev/null @@ -1,7 +0,0 @@ -package metal - -// #cgo CPPFLAGS: -I${SRCDIR}/.. 
-I${SRCDIR}/../include -// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -// #cgo LDFLAGS: -framework Metal -framework MetalKit -framework Accelerate -import "C" -import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-blas" diff --git a/ml/backend/ggml/ggml/ggml.go b/ml/backend/ggml/ggml/ggml.go deleted file mode 100644 index a3c1ed1fc..000000000 --- a/ml/backend/ggml/ggml/ggml.go +++ /dev/null @@ -1,11 +0,0 @@ -package ggml - -// #cgo CXXFLAGS: -std=c++17 -// #cgo CPPFLAGS: -I${SRCDIR} -I${SRCDIR}/include -I${SRCDIR}/ggml-cpu -// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU -// #cgo darwin LDFLAGS: -framework Foundation -// #cgo amd64,avx CPPFLAGS: -mavx -// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -mf16c -// #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA -import "C" -import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cpu" diff --git a/ml/backend/ggml/ggml/ggml_cuda.go b/ml/backend/ggml/ggml/ggml_cuda.go deleted file mode 100644 index 09f685f84..000000000 --- a/ml/backend/ggml/ggml/ggml_cuda.go +++ /dev/null @@ -1,8 +0,0 @@ -//go:build cuda - -package ggml - -// #cgo CPPFLAGS: -DGGML_USE_CUDA -// #cgo rocm CPPFLAGS: -DGGML_USE_HIP -import "C" -import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cuda" diff --git a/ml/backend/ggml/ggml/ggml_darwin_arm64.go b/ml/backend/ggml/ggml/ggml_darwin_arm64.go deleted file mode 100644 index 61380411f..000000000 --- a/ml/backend/ggml/ggml/ggml_darwin_arm64.go +++ /dev/null @@ -1,5 +0,0 @@ -package ggml - -// #cgo CPPFLAGS: -DGGML_USE_METAL -import "C" -import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-metal" diff --git a/ml/backend/ggml/ggml/include/ggml-cann.h b/ml/backend/ggml/ggml/include/ggml-cann.h new file mode 100644 index 000000000..b469e228d --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-cann.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Maximum number of CANN devices supported. + */ +#define GGML_CANN_MAX_DEVICES 16 + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void); + +/** + * @brief Initializes the CANN backend for a specified device. + * + * This function initializes the CANN backend for the given device. + * It verifies the device index, allocates a context, and creates a backend + * instance. 
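+ * Release the returned backend with ggml_backend_free() when it is no longer needed.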
+ * + * @param device The index of the device to initialize. + * @return A pointer to the initialized backend instance, or nullptr on failure. + */ +GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device); + +/** + * @brief Checks if a given backend is a CANN backend. + * + * This function verifies if the provided backend is a CANN backend by comparing + * its GUID with the CANN backend's GUID. + * + * @param backend The backend instance to check. + * @return True if the backend is a CANN backend, false otherwise. + */ +GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend); + +/** + * @brief Retrieves the CANN buffer type for a specified device. + * + * This function initializes and returns the buffer type interface associated + * with the given device. It ensures thread-safe access using a mutex. + * + * @param device The device index for which to retrieve the buffer type. + * @return A pointer to the buffer type interface for the specified device, or + * nullptr if the device index is out of range. + */ +GGML_BACKEND_API ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device); + +/** + * @brief Retrieves the number of CANN devices available. + * + * This function returns the number of CANN devices available based on + * information obtained from `ggml_cann_info()`. + * + * @return The number of CANN devices available. + */ +GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void); + +/** + * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. + * + * @return A pointer to the host buffer type interface. + */ +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); + +/** + * @brief Retrieves the description of a specific CANN device. + * + * This function sets the specified device, retrieves the SoC name, + * and writes it into the provided description buffer. + * + * @param device The device index to retrieve the description for. + * @param description Pointer to a buffer where the description will be written. + * @param description_size Size of the description buffer. + */ +GGML_BACKEND_API void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size); + +/** + * @brief Retrieves the memory information of a specific CANN device. + * + * This function sets the specified device, retrieves the free and total + * memory information of the specified type (ACL_HBM_MEM), and stores them + * in the provided pointers. + * + * @param device The device index to retrieve memory information for. + * @param free Pointer to a variable where the free memory size will be stored. + * @param total Pointer to a variable where the total memory size will be + * stored. 
+ */ +GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/include/ggml-kompute.h b/ml/backend/ggml/ggml/include/ggml-kompute.h new file mode 100644 index 000000000..154aa56a7 --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-kompute.h @@ -0,0 +1,50 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_KOMPUTE_MAX_DEVICES 16 + +struct ggml_vk_device { + int index; + int type; // same as VkPhysicalDeviceType + size_t heapSize; + const char * name; + const char * vendor; + int subgroupSize; + uint64_t bufferAlignment; + uint64_t maxAlloc; +}; + +struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count); +bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name); +bool ggml_vk_has_vulkan(void); +bool ggml_vk_has_device(void); +struct ggml_vk_device ggml_vk_current_device(void); + +// +// backend API +// + +// forward declaration +typedef struct ggml_backend * ggml_backend_t; + +GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device); + +GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/include/ggml-rpc.h b/ml/backend/ggml/ggml/include/ggml-rpc.h new file mode 100644 index 000000000..ade6c3b0e --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-rpc.h @@ -0,0 +1,28 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_RPC_MAX_SERVERS 16 + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); +GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); + +GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); + +GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void); + +GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint); + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/include/ggml-sycl.h b/ml/backend/ggml/ggml/include/ggml-sycl.h new file mode 100644 index 000000000..5ce349a88 --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-sycl.h @@ -0,0 +1,49 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#define GGML_SYCL_NAME "SYCL" +#define GGML_SYCL_MAX_DEVICES 48 + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device); + +GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend); + +// device buffer +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); +
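+// Illustrative usage of the SYCL device APIs declared in this header (an +// editorial sketch, not upstream documentation): +//   int n = ggml_backend_sycl_get_device_count(); +//   if (n > 0) { +//       ggml_backend_t backend = ggml_backend_sycl_init(0); +//       size_t free_mem = 0, total_mem = 0; +//       ggml_backend_sycl_get_device_memory(0, &free_mem, &total_mem); +//       ggml_backend_free(backend); +//   } +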
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); + +GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void); +GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len); +GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device, + char *description, + size_t description_size); +GGML_BACKEND_API int ggml_backend_sycl_get_device_count(); +GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); + +// SYCL doesn't support registering host memory, keep here for reference +// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); +// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/include/ggml-vulkan.h b/ml/backend/ggml/ggml/include/ggml-vulkan.h new file mode 100644 index 000000000..53cdba072 --- /dev/null +++ b/ml/backend/ggml/ggml/include/ggml-vulkan.h @@ -0,0 +1,31 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_VK_NAME "Vulkan" +#define GGML_VK_MAX_DEVICES 16 + +GGML_BACKEND_API void ggml_vk_instance_init(void); + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); + +GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend); +GGML_BACKEND_API int ggml_backend_vk_get_device_count(void); +GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ml/backend/ggml/ggml/src/CMakeLists.txt b/ml/backend/ggml/ggml/src/CMakeLists.txt new file mode 100644 index 000000000..19289f32b --- /dev/null +++ b/ml/backend/ggml/ggml/src/CMakeLists.txt @@ -0,0 +1,308 @@ +include(CheckCXXCompilerFlag) + +add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) + +# enable libstdc++ assertions for debug builds +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>) +endif() + +if (NOT MSVC) + if (GGML_SANITIZE_THREAD) + add_compile_options(-fsanitize=thread) + link_libraries (-fsanitize=thread) + endif() + + if (GGML_SANITIZE_ADDRESS) + add_compile_options(-fsanitize=address -fno-omit-frame-pointer) + link_libraries (-fsanitize=address) + endif() + + if (GGML_SANITIZE_UNDEFINED) + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() +endif() + +function(ggml_get_flags CCID CCVER) + set(C_FLAGS "") + set(CXX_FLAGS "") + + if (CCID MATCHES "Clang") + set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) + set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) + + if ( + (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR + (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) + ) +
list(APPEND C_FLAGS -Wdouble-promotion) + endif() + elseif (CCID STREQUAL "GNU") + set(C_FLAGS -Wdouble-promotion) + set(CXX_FLAGS -Wno-array-bounds) + + if (CCVER VERSION_GREATER_EQUAL 8.1.0) + list(APPEND CXX_FLAGS -Wextra-semi) + endif() + endif() + + set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) + set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) +endfunction() + +if (GGML_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() +endif() + +if (GGML_ALL_WARNINGS) + if (NOT MSVC) + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) + + ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) + + add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>" + "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>") + else() + # todo : msvc + set(C_FLAGS "") + set(CXX_FLAGS "") + endif() +endif() + +if (GGML_LTO) + include(CheckIPOSupported) + check_ipo_supported(RESULT result OUTPUT output) + if (result) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + else() + message(WARNING "IPO is not supported: ${output}") + endif() +endif() + +if (GGML_CCACHE) + find_program(GGML_CCACHE_FOUND ccache) + + if (GGML_CCACHE_FOUND) + # TODO: should not be set globally + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set(ENV{CCACHE_SLOPPINESS} time_macros) + message(STATUS "ccache found, compilation results will be cached.
Disable with GGML_CCACHE=OFF.") + else() + message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with GGML_CCACHE=OFF") + endif () +endif() + +# this version of Apple ld64 is buggy +execute_process( + COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v + ERROR_VARIABLE output + OUTPUT_QUIET +) + +if (output MATCHES "dyld-1015\.7") + add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) +endif() + +# architecture specific +# TODO: probably these flags need to be tweaked on some architectures +# feel free to update the Makefile for your architecture and send a pull request or issue +message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") +if (MSVC) + string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) + message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") +else () + set(CMAKE_GENERATOR_PLATFORM_LWR "") +endif () + +if (NOT MSVC) + if (GGML_STATIC) + add_link_options(-static) + if (MINGW) + add_link_options(-static-libgcc -static-libstdc++) + endif() + endif() + if (GGML_GPROF) + add_compile_options(-pg) + endif() +endif() + +if (MINGW) + # Target Windows 8 for PrefetchVirtualMemory + add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) +endif() + +# +# POSIX conformance +# + +# clock_gettime came in POSIX.1b (1993) +# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional +# posix_memalign came in POSIX.1-2001 / SUSv3 +# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985) + +# Somehow in OpenBSD whenever POSIX conformance is specified +# some string functions rely on locale_t availability, +# which was introduced in POSIX.1-2008, forcing us to go higher +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_XOPEN_SOURCE=700) +else() + add_compile_definitions(_XOPEN_SOURCE=600) +endif() + +# Data types, macros and functions related to controlling CPU affinity and +# some memory allocation are available on Linux through GNU extensions in libc +if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android") + add_compile_definitions(_GNU_SOURCE) +endif() + +# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, +# and on macOS its availability depends on enabling Darwin extensions +# similarly on DragonFly, enabling BSD extensions is necessary +if ( + CMAKE_SYSTEM_NAME MATCHES "Darwin" OR + CMAKE_SYSTEM_NAME MATCHES "iOS" OR + CMAKE_SYSTEM_NAME MATCHES "tvOS" OR + CMAKE_SYSTEM_NAME MATCHES "DragonFly" +) + add_compile_definitions(_DARWIN_C_SOURCE) +endif() + +# alloca is a non-standard interface that is not visible on BSDs when +# POSIX conformance is specified, but not all of them provide a clean way +# to enable it in such cases +if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + add_compile_definitions(__BSD_VISIBLE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "NetBSD") + add_compile_definitions(_NETBSD_SOURCE) +endif() +if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD") + add_compile_definitions(_BSD_SOURCE) +endif() + +if (WIN32) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + + if (BUILD_SHARED_LIBS) + # TODO: should not use this + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + endif() +endif() + +# ggml + +if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS) + message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS") +endif() + +add_library(ggml-base + ../include/ggml.h + ../include/ggml-alloc.h + ../include/ggml-backend.h + ../include/ggml-cpp.h + ../include/ggml-opt.h + ggml.c + ggml-alloc.c + ggml-backend.cpp + ggml-opt.cpp + 
ggml-threading.cpp + ggml-threading.h + ggml-quants.c + ggml-quants.h + ggml-aarch64.c + ggml-aarch64.h) + +target_include_directories(ggml-base PRIVATE .) + +add_library(ggml + ggml-backend-reg.cpp) + +target_link_libraries(ggml PUBLIC ggml-base) + +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_libraries(ggml PRIVATE dl) +endif() + +function(ggml_add_backend_library backend) + if (GGML_BACKEND_DL) + add_library(${backend} MODULE ${ARGN}) + # write the shared library to the output directory + set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) + else() + add_library(${backend} ${ARGN}) + target_link_libraries(ggml PUBLIC ${backend}) + install(TARGETS ${backend} LIBRARY) + endif() + + target_link_libraries(${backend} PRIVATE ggml-base) + target_include_directories(${backend} PRIVATE ..) + + if (${BUILD_SHARED_LIBS}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD) + target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED) + endif() +endfunction() + +function(ggml_add_backend backend) + string(TOUPPER "GGML_${backend}" backend_id) + if (${backend_id}) + string(TOLOWER "ggml-${backend}" backend_target) + add_subdirectory(${backend_target}) + message(STATUS "Including ${backend} backend") + if (NOT GGML_BACKEND_DL) + string(TOUPPER "GGML_USE_${backend}" backend_use) + target_compile_definitions(ggml PUBLIC ${backend_use}) + endif() + endif() +endfunction() + +ggml_add_backend(CPU) +ggml_add_backend(BLAS) +ggml_add_backend(CANN) +ggml_add_backend(CUDA) +ggml_add_backend(HIP) +ggml_add_backend(Kompute) +ggml_add_backend(METAL) +ggml_add_backend(MUSA) +ggml_add_backend(RPC) +ggml_add_backend(SYCL) +ggml_add_backend(Vulkan) + +foreach (target ggml-base ggml) + target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) + target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump +endforeach() + +target_link_libraries(ggml-base PRIVATE Threads::Threads) + +find_library(MATH_LIBRARY m) +if (MATH_LIBRARY) + if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT}) + target_link_libraries(ggml-base PRIVATE m) + endif() +endif() + +if (CMAKE_SYSTEM_NAME MATCHES "Android") + target_link_libraries(ggml-base PRIVATE dl) +endif() + +if (BUILD_SHARED_LIBS) + foreach (target ggml-base ggml) + set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(${target} PRIVATE GGML_BUILD) + target_compile_definitions(${target} PUBLIC GGML_SHARED) + endforeach() +endif() diff --git a/ml/backend/ggml/ggml/ggml-aarch64.c b/ml/backend/ggml/ggml/src/ggml-aarch64.c similarity index 100% rename from ml/backend/ggml/ggml/ggml-aarch64.c rename to ml/backend/ggml/ggml/src/ggml-aarch64.c diff --git a/ml/backend/ggml/ggml/ggml-aarch64.h b/ml/backend/ggml/ggml/src/ggml-aarch64.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-aarch64.h rename to ml/backend/ggml/ggml/src/ggml-aarch64.h diff --git a/ml/backend/ggml/ggml/ggml-alloc.c b/ml/backend/ggml/ggml/src/ggml-alloc.c similarity index 100% rename from ml/backend/ggml/ggml/ggml-alloc.c rename to ml/backend/ggml/ggml/src/ggml-alloc.c diff --git a/ml/backend/ggml/ggml/ggml-backend-impl.h b/ml/backend/ggml/ggml/src/ggml-backend-impl.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-backend-impl.h rename to ml/backend/ggml/ggml/src/ggml-backend-impl.h diff --git a/ml/backend/ggml/ggml/ggml-backend-reg.cpp b/ml/backend/ggml/ggml/src/ggml-backend-reg.cpp similarity index
100% rename from ml/backend/ggml/ggml/ggml-backend-reg.cpp rename to ml/backend/ggml/ggml/src/ggml-backend-reg.cpp diff --git a/ml/backend/ggml/ggml/ggml-backend.cpp b/ml/backend/ggml/ggml/src/ggml-backend.cpp similarity index 99% rename from ml/backend/ggml/ggml/ggml-backend.cpp rename to ml/backend/ggml/ggml/src/ggml-backend.cpp index 9b80fe07d..731e40789 100644 --- a/ml/backend/ggml/ggml/ggml-backend.cpp +++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp @@ -106,12 +106,6 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } - -// TODO: this needs to be freed in cuda and hip backends because -// the cuda backend implementation compiled with msvc -#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP) - delete buffer; -#endif } size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { @@ -1867,6 +1861,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_aligned_free(buffer->context, buffer->size); + free(buffer); } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { diff --git a/ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt new file mode 100644 index 000000000..0bf3c05d9 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-blas/CMakeLists.txt @@ -0,0 +1,87 @@ +if (GGML_STATIC) + set(BLA_STATIC ON) +endif() +#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) +# set(BLA_SIZEOF_INTEGER 8) +#endif() + +set(BLA_VENDOR ${GGML_BLAS_VENDOR}) +find_package(BLAS) + +if (BLAS_FOUND) + message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") + + ggml_add_backend_library(ggml-blas + ggml-blas.cpp + ) + + if (${GGML_BLAS_VENDOR} MATCHES "Apple") + add_compile_definitions(ACCELERATE_NEW_LAPACK) + add_compile_definitions(ACCELERATE_LAPACK_ILP64) + add_compile_definitions(GGML_BLAS_USE_ACCELERATE) + elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "") + # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. 
+ # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 + find_package(PkgConfig REQUIRED) + if (${GGML_BLAS_VENDOR} MATCHES "Generic") + pkg_check_modules(DepBLAS blas) + elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS") + # As of openblas v0.3.22, the 64-bit build is named openblas64.pc + pkg_check_modules(DepBLAS openblas64) + if (NOT DepBLAS_FOUND) + pkg_check_modules(DepBLAS openblas) + endif() + elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME") + add_compile_definitions(GGML_BLAS_USE_BLIS) + pkg_check_modules(DepBLAS blis) + elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS") + pkg_check_modules(DepBLAS blas-atlas) + elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS") + pkg_check_modules(DepBLAS flexiblas_api) + elseif (${GGML_BLAS_VENDOR} MATCHES "Intel") + add_compile_definitions(GGML_BLAS_USE_MKL) + # all Intel* libraries share the same include path + pkg_check_modules(DepBLAS mkl-sdl) + elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC") + # this vendor does not provide pkg-config files; + # assign BLAS_INCLUDE_DIRS manually + if ("${NVHPC_VERSION}" STREQUAL "") + message(WARNING "Consider setting NVHPC_VERSION") + else() + set(DepBLAS_FOUND ON) + set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include") + endif() + endif() + if (DepBLAS_FOUND) + set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS}) + else() + message(WARNING "BLAS_INCLUDE_DIRS was neither provided nor automatically" + " detected by pkg-config; trying to find cblas.h in likely paths...") + find_path(BLAS_INCLUDE_DIRS + NAMES cblas.h + HINTS + /usr/include + /usr/local/include + /usr/include/openblas + /opt/homebrew/opt/openblas/include + /usr/local/opt/openblas/include + /usr/include/x86_64-linux-gnu/openblas/include + ) + endif() + endif() + + message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") + + target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS}) + + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) + add_compile_definitions(GGML_BLAS_USE_MKL) + endif() + + target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES}) + target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS}) +else() + message(ERROR "BLAS not found, please refer to " + "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" + " to set a correct GGML_BLAS_VENDOR") +endif() diff --git a/ml/backend/ggml/ggml/src/ggml-blas/blas.go b/ml/backend/ggml/ggml/src/ggml-blas/blas.go new file mode 100644 index 000000000..c6a976d29 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-blas/blas.go @@ -0,0 +1,8 @@ +package blas + +// #cgo CXXFLAGS: -std=c++11 +// #cgo CPPFLAGS: -DGGML_USE_BLAS +// #cgo CPPFLAGS: -I${SRCDIR}/..
-I${SRCDIR}/../../include +// #cgo darwin,arm64 CPPFLAGS: -DGGML_BLAS_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 +// #cgo darwin,arm64 LDFLAGS: -framework Accelerate +import "C" diff --git a/ml/backend/ggml/ggml/ggml-blas/ggml-blas.cpp b/ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp similarity index 100% rename from ml/backend/ggml/ggml/ggml-blas/ggml-blas.cpp rename to ml/backend/ggml/ggml/src/ggml-blas/ggml-blas.cpp diff --git a/ml/backend/ggml/ggml/ggml-common.h b/ml/backend/ggml/ggml/src/ggml-common.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-common.h rename to ml/backend/ggml/ggml/src/ggml-common.h diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt new file mode 100644 index 000000000..5df63884c --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/CMakeLists.txt @@ -0,0 +1,319 @@ +ggml_add_backend_library(ggml-cpu) + +list (APPEND GGML_CPU_SOURCES + ggml-cpu.c + ggml-cpu.cpp + ggml-cpu-aarch64.c + ggml-cpu-aarch64.h + ggml-cpu-quants.c + ggml-cpu-quants.h + amx/amx.cpp + amx/amx.h + amx/mmq.cpp + amx/mmq.h + ggml-cpu-impl.h + ) + +target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17) +target_include_directories(ggml-cpu PRIVATE .) + +if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE) + target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK) + target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64) + + target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) + else() + message(WARNING "Accelerate framework not found") + endif() +endif() + +if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + message(STATUS "OpenMP found") + + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP) + + target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() +endif() + +if (GGML_LLAMAFILE) + message(STATUS "Using llamafile") + + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE) + + list(APPEND GGML_CPU_SOURCES + llamafile/sgemm.cpp + llamafile/sgemm.h) +endif() + +if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) + + message(STATUS "Using memkind for CPU HBM") + + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM) + + target_link_libraries(ggml-cpu PUBLIC memkind) +endif() + +if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND + NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + + message(STATUS "ARM detected") + + if (MSVC) + list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead + list(APPEND ARCH_DEFINITIONS __ARM_NEON) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) + + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + + check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) + + message(STATUS "ARM feature DOTPROD enabled") + endif () + + check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b);
return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) + + message(STATUS "ARM feature MATMUL_INT8 enabled") + endif () + + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + + message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") + endif () + + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) + elseif (APPLE) + if (GGML_NATIVE) + set(USER_PROVIDED_MARCH FALSE) + foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) + if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") + set(USER_PROVIDED_MARCH TRUE) + break() + endif() + endforeach() + + if (NOT USER_PROVIDED_MARCH) + set(MARCH_FLAGS "-march=armv8.2a") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) + + message(STATUS "ARM feature DOTPROD enabled") + endif () + + set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") + + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) + + message(STATUS "ARM feature MATMUL_INT8 enabled") + endif () + + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") + endif () + endif () + else() + check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + list(APPEND ARCH_FLAGS -mfp16-format=ieee) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a + # Raspberry Pi 3, 4, Zero 2 (32-bit) + list(APPEND ARCH_FLAGS -mno-unaligned-access) + endif() + if (GGML_SVE) + list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) + endif() + endif() +elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) + message(STATUS "x86 detected") + if (MSVC) + # instruction set detection for MSVC only + if (GGML_NATIVE) + include(cmake/FindSIMD.cmake) + endif () + if (GGML_AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros 
corresponding to the extensions. + # Do it manually. + if (GGML_AVX512_VBMI) + list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_DEFINITIONS __AVX512VNNI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_DEFINITIONS __AVX512BF16__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_DEFINITIONS __AMX_TILE__) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_DEFINITIONS __AMX_INT8__) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_DEFINITIONS __AMX_BF16__) + endif() + elseif (GGML_AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) + elseif (GGML_AVX) + list(APPEND ARCH_FLAGS /arch:AVX) + endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_DEFINITIONS __AVXVNNI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavxvnni) + endif() + endif() + else() + if (GGML_NATIVE) + list(APPEND ARCH_FLAGS -march=native) + endif() + if (GGML_F16C) + list(APPEND ARCH_FLAGS -mf16c) + endif() + if (GGML_FMA) + list(APPEND ARCH_FLAGS -mfma) + endif() + if (GGML_AVX) + list(APPEND ARCH_FLAGS -mavx) + endif() + if (GGML_AVX2) + list(APPEND ARCH_FLAGS -mavx2) + endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_FLAGS -mavxvnni) + endif() + if (GGML_AVX512) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512dq) + list(APPEND ARCH_FLAGS -mavx512bw) + endif() + if (GGML_AVX512_VBMI) + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_FLAGS -mamx-tile) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_FLAGS -mamx-int8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_FLAGS -mamx-bf16) + endif() + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PowerPC detected") + execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) + string(FIND "${POWER10_M}" "POWER10" substring_index) + if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") + set(substring_index -1) + endif() + + if (${substring_index} GREATER_EQUAL 0) + list(APPEND ARCH_FLAGS -mcpu=power10) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + message(STATUS "loongarch64 detected") + + list(APPEND ARCH_FLAGS -march=loongarch64) + if (GGML_LASX) + list(APPEND ARCH_FLAGS -mlasx) + endif() + if (GGML_LSX) + list(APPEND ARCH_FLAGS -mlsx) + endif() +elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") + message(STATUS "RISC-V detected") + if (GGML_RVV) + list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) + endif() +else() + message(STATUS "Unknown architecture") +endif() + +if (GGML_CPU_AARCH64) + message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64) +endif() + +target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) 
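+# Illustrative configure invocation (an editorial sketch, not part of this +# file): the GGML_* options tested above are ordinary CMake cache variables, +# e.g. on x86: +#   cmake -B build -DGGML_AVX2=ON -DGGML_FMA=ON -DGGML_F16C=ON +# The resulting ARCH_FLAGS/ARCH_DEFINITIONS are applied per source file below.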
+set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") +set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") + +# the feature detection code must be compiled without any architecture flags +target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp) +# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection + +if (EMSCRIPTEN) + set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") +endif() diff --git a/ml/backend/ggml/ggml/ggml-cpu/amx/amx.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/amx/amx.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.cpp diff --git a/ml/backend/ggml/ggml/ggml-cpu/amx/amx.h b/ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/amx/amx.h rename to ml/backend/ggml/ggml/src/ggml-cpu/amx/amx.h diff --git a/ml/backend/ggml/ggml/ggml-cpu/amx/common.h b/ml/backend/ggml/ggml/src/ggml-cpu/amx/common.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/amx/common.h rename to ml/backend/ggml/ggml/src/ggml-cpu/amx/common.h diff --git a/ml/backend/ggml/ggml/ggml-cpu/amx/mmq.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.cpp similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/amx/mmq.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.cpp diff --git a/ml/backend/ggml/ggml/ggml-cpu/amx/mmq.h b/ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/amx/mmq.h rename to ml/backend/ggml/ggml/src/ggml-cpu/amx/mmq.h diff --git a/ml/backend/ggml/ggml/ggml-cpu/cpu-feats-x86.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/cpu-feats-x86.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/cpu-feats-x86.cpp diff --git a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go new file mode 100644 index 000000000..b0cd99780 --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go @@ -0,0 +1,8 @@ +package cpu + +// #cgo CXXFLAGS: -std=c++11 +// #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/.. 
-I${SRCDIR}/../../include +// #cgo linux CPPFLAGS: -D_GNU_SOURCE +// #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 +// #cgo darwin,arm64 LDFLAGS: -framework Accelerate +import "C" diff --git a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-aarch64.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.c similarity index 99% rename from ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-aarch64.c rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.c index bbf8934e6..11152385e 100644 --- a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-aarch64.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.c @@ -4,7 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" -#include "ggml-cpu-impl.h" +#include "ggml-cpu/ggml-cpu-impl.h" #include <math.h> #include <string.h> diff --git a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-aarch64.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-aarch64.h rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-aarch64.h diff --git a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-impl.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-impl.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-impl.h rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-impl.h diff --git a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-quants.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-quants.c rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.c diff --git a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-quants.h b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/ggml-cpu-quants.h rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu-quants.h diff --git a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu.c b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c similarity index 99% rename from ml/backend/ggml/ggml/ggml-cpu/ggml-cpu.c rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c index df0bd3c6b..111ff3b0f 100644 --- a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu.c +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.c @@ -10,7 +10,7 @@ #include "ggml-quants.h" #include "ggml-cpu-quants.h" #include "ggml-threading.h" -#include "amx.h" +#include "amx/amx.h" #include "ggml.h" #if defined(_MSC_VER) || defined(__MINGW32__) diff --git a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp similarity index 99% rename from ml/backend/ggml/ggml/ggml-cpu/ggml-cpu.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp index 91476ad01..77e5d87a8 100644 --- a/ml/backend/ggml/ggml/ggml-cpu/ggml-cpu.cpp +++ b/ml/backend/ggml/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -3,7 +3,7 @@ #include "ggml-cpu.h" #include "ggml-cpu-aarch64.h" #include "ggml-impl.h" -#include "amx.h" +#include "amx/amx.h" #include <cctype> #include <string> #include <vector> diff --git a/ml/backend/ggml/ggml/ggml-cpu/llamafile/sgemm.cpp b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/llamafile/sgemm.cpp rename to ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.cpp diff --git a/ml/backend/ggml/ggml/ggml-cpu/llamafile/sgemm.h b/ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.h similarity index 100% rename from ml/backend/ggml/ggml/ggml-cpu/llamafile/sgemm.h rename to ml/backend/ggml/ggml/src/ggml-cpu/llamafile/sgemm.h diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt
b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt new file mode 100644 index 000000000..14761650f --- /dev/null +++ b/ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt @@ -0,0 +1,152 @@ +cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES + +find_package(CUDAToolkit) + +if (CUDAToolkit_FOUND) + message(STATUS "CUDA Toolkit found") + + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # native == GPUs available at build time + # 52 == Maxwell, lowest CUDA 12 standard + # 60 == P100, FP16 CUDA intrinsics + # 61 == Pascal, __dp4a instruction (per-byte integer dot product) + # 70 == V100, FP16 tensor cores + # 75 == Turing, int8 tensor cores + if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24") + set(CMAKE_CUDA_ARCHITECTURES "native") + elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") + else() + set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75") + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + + enable_language(CUDA) + + file(GLOB GGML_HEADERS_CUDA "*.cuh") + list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h") + + file(GLOB GGML_SOURCES_CUDA "*.cu") + file(GLOB SRCS "template-instances/fattn-wmma*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + + if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) + else() + file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) + endif() + + ggml_add_backend_library(ggml-cuda + ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_CUDA} + ) + + add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + + if (GGML_CUDA_GRAPHS) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) + endif() + + if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() + + if (GGML_CUDA_FORCE_CUBLAS) + add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) + endif() + + if (GGML_CUDA_NO_VMM) + add_compile_definitions(GGML_CUDA_NO_VMM) + endif() + + if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16) + add_compile_definitions(GGML_CUDA_F16) + endif() + + if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() + + if (GGML_STATIC) + if (WIN32) + # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + else () + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + endif() + else() + target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() + + if (GGML_CUDA_NO_VMM) + # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) + else() + target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver) + endif() + + set(CUDA_CXX_FLAGS "") + + set(CUDA_FLAGS -use_fast_math) + + if (GGML_FATAL_WARNINGS) + list(APPEND CUDA_FLAGS -Werror all-warnings) + endif() + + if (GGML_ALL_WARNINGS AND NOT MSVC) + set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) + if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") + list(APPEND NVCC_CMD 
-ccbin ${CMAKE_CUDA_HOST_COMPILER}) + endif() + + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler --version + OUTPUT_VARIABLE CUDA_CCFULLVER + ERROR_QUIET + ) + + if (NOT CUDA_CCFULLVER MATCHES clang) + set(CUDA_CCID "GNU") + execute_process( + COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion" + OUTPUT_VARIABLE CUDA_CCVER + ERROR_QUIET + ) + else() + if (CUDA_CCFULLVER MATCHES Apple) + set(CUDA_CCID "AppleClang") + else() + set(CUDA_CCID "Clang") + endif() + string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER}) + endif() + + message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") + + ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER}) + list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later + endif() + + if (NOT MSVC) + list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) + endif() + + list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument + + if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") + list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) + endif() + + target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>") +else() + message(FATAL_ERROR "CUDA Toolkit not found") +endif() diff --git a/ml/backend/ggml/ggml/ggml-cuda/acc.cu b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/acc.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/acc.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/acc.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/acc.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/acc.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/arange.cu b/ml/backend/ggml/ggml/src/ggml-cuda/arange.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/arange.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/arange.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/arange.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/arange.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/arange.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/argmax.cu b/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/argmax.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/argmax.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/argmax.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/argmax.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/argmax.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/argsort.cu b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/argsort.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/argsort.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/argsort.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/argsort.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/argsort.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/argsort.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/binbcast.cu b/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/binbcast.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/binbcast.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/binbcast.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/binbcast.cuh diff
--git a/ml/backend/ggml/ggml/ggml-cuda/clamp.cu b/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/clamp.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/clamp.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/clamp.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/clamp.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/clamp.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/common.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/common.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/common.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/concat.cu b/ml/backend/ggml/ggml/src/ggml-cuda/concat.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/concat.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/concat.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/concat.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/concat.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/concat.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/conv-transpose-1d.cu b/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/conv-transpose-1d.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/conv-transpose-1d.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/conv-transpose-1d.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/conv-transpose-1d.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/convert.cu b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/convert.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/convert.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/convert.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/convert.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/convert.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/count-equal.cu b/ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/count-equal.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/count-equal.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/count-equal.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/count-equal.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/cpy.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/cpy.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/cpy.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/cpy.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/cpy.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/cpy.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/cross-entropy-loss.cu b/ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/cross-entropy-loss.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/cross-entropy-loss.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cuh similarity index 
100% rename from ml/backend/ggml/ggml/ggml-cuda/cross-entropy-loss.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/cross-entropy-loss.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/dequantize.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/dequantize.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/dequantize.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/dequantize.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/diagmask.cu b/ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/diagmask.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/diagmask.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/diagmask.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/diagmask.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn-common.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn-common.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-common.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn-tile-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn-tile-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn-tile-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn-tile-f16.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f16.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn-tile-f32.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn-tile-f32.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn-tile-f32.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn-tile-f32.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-tile-f32.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn-vec-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn-vec-f16.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f16.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn-vec-f32.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn-vec-f32.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-vec-f32.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn-wmma-f16.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn-wmma-f16.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn-wmma-f16.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn.cu b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/fattn.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/fattn.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/fattn.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/fattn.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/getrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu 
similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/getrows.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/getrows.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/getrows.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/getrows.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/getrows.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu similarity index 99% rename from ml/backend/ggml/ggml/ggml-cuda/ggml-cuda.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 9ca6cb774..053e392ac 100644 --- a/ml/backend/ggml/ggml/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -424,10 +424,7 @@ struct ggml_backend_cuda_buffer_context { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; delete ctx; - - // TODO: this needs to be freed in cuda and hipblas backends because - // the cuda backend implementation compiled with msvc - free(buffer); + delete buffer; } static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { diff --git a/ml/backend/ggml/ggml/ggml-cuda/im2col.cu b/ml/backend/ggml/ggml/src/ggml-cuda/im2col.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/im2col.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/im2col.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/im2col.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/im2col.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/im2col.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/im2col.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/mma.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mma.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/mma.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/mma.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/mmq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/mmq.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/mmq.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/mmq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/mmq.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/mmq.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/mmv.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/mmv.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/mmv.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/mmv.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/mmv.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/mmv.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/mmvq.cu b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/mmvq.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/mmvq.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/mmvq.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/mmvq.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/norm.cu b/ml/backend/ggml/ggml/src/ggml-cuda/norm.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/norm.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/norm.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/norm.cuh 
b/ml/backend/ggml/ggml/src/ggml-cuda/norm.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/norm.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/norm.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/opt-step-adamw.cu b/ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/opt-step-adamw.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/opt-step-adamw.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/opt-step-adamw.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/opt-step-adamw.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/out-prod.cu b/ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/out-prod.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/out-prod.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/out-prod.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/out-prod.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/pad.cu b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/pad.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/pad.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/pad.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/pad.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/pad.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/pool2d.cu b/ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/pool2d.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/pool2d.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/pool2d.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/pool2d.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/quantize.cu b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/quantize.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/quantize.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/quantize.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/quantize.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/quantize.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/rope.cu b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/rope.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/rope.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/rope.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/rope.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/rope.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/rope.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/scale.cu b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/scale.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/scale.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/scale.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/scale.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/scale.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/scale.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/softmax.cu 
b/ml/backend/ggml/ggml/src/ggml-cuda/softmax.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/softmax.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/softmax.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/softmax.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/softmax.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/softmax.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/softmax.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/sum.cu b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/sum.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/sum.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/sum.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/sum.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/sum.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/sum.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/sumrows.cu b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/sumrows.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/sumrows.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/sumrows.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/sumrows.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu diff --git 
a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu diff --git 
a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu diff --git 
a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu diff --git 
a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu diff --git 
a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu 
b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu 
b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu 
b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu 
b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu 
b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu similarity index 100% rename 
from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/generate_cu_files.py b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/generate_cu_files.py rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq1_s.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq1_s.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq2_s.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq2_s.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu 
b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq3_s.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq3_s.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q2_k.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q3_k.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q4_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q4_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu diff --git 
a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q4_k.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q5_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q5_1.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q5_k.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q6_k.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/template-instances/mmq-instance-q8_0.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/tsembd.cu b/ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/tsembd.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/tsembd.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/tsembd.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/tsembd.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/unary.cu b/ml/backend/ggml/ggml/src/ggml-cuda/unary.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/unary.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/unary.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/unary.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/unary.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/unary.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/unary.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/upscale.cu b/ml/backend/ggml/ggml/src/ggml-cuda/upscale.cu similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/upscale.cu rename to ml/backend/ggml/ggml/src/ggml-cuda/upscale.cu diff --git a/ml/backend/ggml/ggml/ggml-cuda/upscale.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/upscale.cuh similarity index 100% rename from ml/backend/ggml/ggml/ggml-cuda/upscale.cuh rename to ml/backend/ggml/ggml/src/ggml-cuda/upscale.cuh diff --git a/ml/backend/ggml/ggml/ggml-cuda/vecdotq.cuh 
b/ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-cuda/vecdotq.cuh
rename to ml/backend/ggml/ggml/src/ggml-cuda/vecdotq.cuh
diff --git a/ml/backend/ggml/ggml/ggml-cuda/vendors/cuda.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/cuda.h
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-cuda/vendors/cuda.h
rename to ml/backend/ggml/ggml/src/ggml-cuda/vendors/cuda.h
diff --git a/ml/backend/ggml/ggml/ggml-cuda/vendors/hip.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-cuda/vendors/hip.h
rename to ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h
diff --git a/ml/backend/ggml/ggml/ggml-cuda/vendors/musa.h b/ml/backend/ggml/ggml/src/ggml-cuda/vendors/musa.h
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-cuda/vendors/musa.h
rename to ml/backend/ggml/ggml/src/ggml-cuda/vendors/musa.h
diff --git a/ml/backend/ggml/ggml/ggml-cuda/wkv6.cu b/ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cu
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-cuda/wkv6.cu
rename to ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cu
diff --git a/ml/backend/ggml/ggml/ggml-cuda/wkv6.cuh b/ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cuh
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-cuda/wkv6.cuh
rename to ml/backend/ggml/ggml/src/ggml-cuda/wkv6.cuh
diff --git a/ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt
new file mode 100644
index 000000000..b15fbd24d
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-hip/CMakeLists.txt
@@ -0,0 +1,104 @@
+if (NOT EXISTS $ENV{ROCM_PATH})
+    if (NOT EXISTS /opt/rocm)
+        set(ROCM_PATH /usr)
+    else()
+        set(ROCM_PATH /opt/rocm)
+    endif()
+else()
+    set(ROCM_PATH $ENV{ROCM_PATH})
+endif()
+
+list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
+
+# CMake on Windows doesn't support the HIP language yet
+if (WIN32)
+    set(CXX_IS_HIPCC TRUE)
+else()
+    string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
+endif()
+
+if (CXX_IS_HIPCC)
+    if (LINUX)
+        if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+            message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+        endif()
+
+        message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
+                " Prefer setting the HIP compiler directly. See README for details.")
+    endif()
+else()
+    # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+    if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+        set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+    endif()
+    cmake_minimum_required(VERSION 3.21)
+    enable_language(HIP)
+endif()
+
+find_package(hip REQUIRED)
+find_package(hipblas REQUIRED)
+find_package(rocblas REQUIRED)
+
+message(STATUS "HIP and hipBLAS found")
+
+file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
+list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
+
+file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
+file(GLOB SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
+list(APPEND GGML_SOURCES_ROCM ${SRCS})
+file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
+list(APPEND GGML_SOURCES_ROCM ${SRCS})
+
+if (GGML_CUDA_FA_ALL_QUANTS)
+    file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+else()
+    file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+    file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+    list(APPEND GGML_SOURCES_ROCM ${SRCS})
+endif()
+
+ggml_add_backend_library(ggml-hip
+    ${GGML_HEADERS_ROCM}
+    ${GGML_SOURCES_ROCM}
+    )
+
+# TODO: do not use CUDA definitions for HIP
+target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
+
+add_compile_definitions(GGML_USE_HIP)
+
+if (GGML_HIP_UMA)
+    add_compile_definitions(GGML_HIP_UMA)
+endif()
+
+if (GGML_CUDA_FORCE_MMQ)
+    add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+endif()
+
+if (GGML_CUDA_FORCE_CUBLAS)
+    add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+endif()
+
+if (GGML_CUDA_NO_PEER_COPY)
+    add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+endif()
+
+if (CXX_IS_HIPCC)
+    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
+    target_link_libraries(ggml-hip PRIVATE hip::device)
+else()
+    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
+endif()
+
+if (GGML_STATIC)
+    message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+endif()
+
+target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)
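Worth noting about the new ggml-hip/CMakeLists.txt above: it carries no kernel sources of its own. It globs ../ggml-cuda/*.cu and compiles those files as HIP (or as C++ under hipcc), relying on headers such as vendors/hip.h, renamed earlier in this diff, to remap CUDA runtime names onto HIP. A hand-written sketch of that single-source pattern follows; the macro subset here is illustrative, and the real mapping table is much longer:

#include <cstddef>

#if defined(GGML_USE_HIP)
#include <hip/hip_runtime.h>
// remap a few CUDA names onto their HIP equivalents (illustrative subset)
#define cudaStream_t             hipStream_t
#define cudaMemcpyAsync          hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#else
#include <cuda_runtime.h>
#endif

// the same helper now compiles against either runtime
static void copy_d2d_async(void *dst, const void *src, size_t n, cudaStream_t stream) {
    cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream);
}

This sharing is also why the file still sets GGML_USE_CUDA for the HIP build; its "TODO: do not use CUDA definitions for HIP" comment acknowledges the coupling.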
diff --git a/ml/backend/ggml/ggml/ggml-impl.h b/ml/backend/ggml/ggml/src/ggml-impl.h
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-impl.h
rename to ml/backend/ggml/ggml/src/ggml-impl.h
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/CMakeLists.txt b/ml/backend/ggml/ggml/src/ggml-metal/CMakeLists.txt
new file mode 100644
index 000000000..1bad27206
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-metal/CMakeLists.txt
@@ -0,0 +1,105 @@
+find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+find_library(METAL_FRAMEWORK Metal REQUIRED)
+find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+
+message(STATUS "Metal framework found")
+
+ggml_add_backend_library(ggml-metal
+    ggml-metal.m
+    )
+
+target_link_libraries(ggml-metal PRIVATE
+    ${FOUNDATION_LIBRARY}
+    ${METAL_FRAMEWORK}
+    ${METALKIT_FRAMEWORK}
+    )
+
+if (GGML_METAL_NDEBUG)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+endif()
+
+if (GGML_METAL_USE_BF16)
+    add_compile_definitions(GGML_METAL_USE_BF16)
+endif()
+
+# copy metal files to bin directory
+configure_file(../ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
+configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
+configure_file(ggml-metal-impl.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal-impl.h COPYONLY)
+
+if (GGML_METAL_EMBED_LIBRARY)
+    enable_language(ASM)
+
+    add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
+
+    set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/../ggml-common.h")
+    set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+    set(METALLIB_IMPL "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal-impl.h")
+
+    file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+
+    # merge ggml-common.h and ggml-metal.metal into a single file
+    set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s")
+    set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal")
+    set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
+
+    add_custom_command(
+        OUTPUT ${METALLIB_EMBED_ASM}
+        COMMAND echo "Embedding Metal library"
+        COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP}
+        COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED}
+        COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM}
+        COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM}
+        COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM}
+        COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM}
+        COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM}
+        COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM}
+        DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
+        COMMENT "Generate assembly for embedded Metal library"
+    )
+
+    target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM})
+else()
+    if (GGML_METAL_SHADER_DEBUG)
+        # custom command to do the following:
+        #   xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+        #   xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
+        #
+        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
+        #       disabling fast math is needed in order to pass tests/test-backend-ops
+        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
+        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
+        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
+        set(XC_FLAGS -fno-fast-math -fno-inline -g)
+    else()
+        set(XC_FLAGS -O3)
+    endif()
+
+    # Append macOS metal versioning flags
+    if (GGML_METAL_MACOSX_VERSION_MIN)
+        message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
+        list (APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
+    endif()
+
+    if (GGML_METAL_STD)
+        message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation")
+        list (APPEND XC_FLAGS -std=${GGML_METAL_STD})
+    endif()
+
+    add_custom_command(
+        OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+        COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
+        COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
+        DEPENDS ggml-metal.metal ggml-common.h
+        COMMENT "Compiling Metal kernels"
+    )
+
+    # FIXME: only add to the ggml-metal target?
+    add_custom_target(
+        ggml-metal-lib ALL
+        DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+    )
+endif() # GGML_METAL_EMBED_LIBRARY
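The embedded-library branch above generates an assembly stub whose only job is to .incbin the merged Metal source into a __ggml_metallib data section, bracketed by the globals _ggml_metallib_start and _ggml_metallib_end. A sketch of how such a blob is typically recovered at runtime (assumed consumer code, not the verbatim ggml-metal.m; the leading underscore in the .s file is Mach-O decoration for plain C symbols):

#include <string>

extern "C" {
    extern char ggml_metallib_start;  // _ggml_metallib_start in the generated .s
    extern char ggml_metallib_end;    // _ggml_metallib_end in the generated .s
}

// the bytes between the two symbols are the merged .metal source text
static std::string metallib_source() {
    return std::string(&ggml_metallib_start,
                       &ggml_metallib_end - &ggml_metallib_start);
}

With GGML_METAL_EMBED_LIBRARY defined (the metal.go cgo flags later in this diff set it), the backend can compile its kernels from this in-memory source instead of locating default.metallib on disk.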
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal-embed.metal
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal.s b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal.s
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal-impl.h b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal-impl.h
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
similarity index 99%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal.m
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
index cb9a13070..346dfb5bf 100644
--- a/ml/backend/ggml/ggml/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -4068,6 +4068,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     }
 
     free(ctx);
+    free(buffer);
 }
 
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal.metal
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
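The ggml-metal.m hunk above is the counterpart of the earlier CUDA change: teardown previously released only the backend context, so the wrapper object itself appears to have leaked on each buffer free. This path is C/Objective-C, and the added call uses free(), which assumes the wrapper is malloc-allocated here. A minimal sketch of the ownership rule, with illustrative names:

#include <cstdlib>

struct metal_buffer_ctx { /* backend-specific state */ };

struct backend_buffer {
    struct metal_buffer_ctx *context;
};

void buffer_free(struct backend_buffer *buffer) {
    free(buffer->context);  // was already released here
    free(buffer);           // newly added: release the wrapper too
}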
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal-embed.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal-embed.metal
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.metal
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal.s b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal.s
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal-impl.h b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal-impl.h
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-impl.h
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal.m b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
similarity index 99%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal.m
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
index cb9a13070..346dfb5bf 100644
--- a/ml/backend/ggml/ggml/ggml-metal/ggml-metal.m
+++ b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m
@@ -4068,6 +4068,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     }
 
     free(ctx);
+    free(buffer);
 }
 
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ml/backend/ggml/ggml/ggml-metal/ggml-metal.metal b/ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-metal/ggml-metal.metal
rename to ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.metal
diff --git a/ml/backend/ggml/ggml/src/ggml-metal/metal.go b/ml/backend/ggml/ggml/src/ggml-metal/metal.go
new file mode 100644
index 000000000..379d18b32
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml-metal/metal.go
@@ -0,0 +1,5 @@
+package metal
+
+// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I${SRCDIR}/.. -I${SRCDIR}/../../include
+// #cgo LDFLAGS: -framework Metal -framework MetalKit
+import "C"
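The new metal.go file is the entire Go side of the backend: its #cgo directives hand go build the macros, include paths, and frameworks needed to compile the Objective-C sources in the same directory, and the blank import "C" activates them. The same shim pattern would apply to any other backend directory; a hypothetical sketch, with the package name, macro, and library invented for illustration:

    // Hypothetical sketch of the same cgo-shim pattern for some other
    // backend. A file like this holds no Go logic: it exists so that
    // `go build` compiles the C sources in its directory with these flags
    // and links the required vendor library.
    package somebackend

    // #cgo CPPFLAGS: -DGGML_USE_SOMEBACKEND -I${SRCDIR}/.. -I${SRCDIR}/../../include
    // #cgo LDFLAGS: -lsomevendor
    import "C"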
diff --git a/ml/backend/ggml/ggml/ggml-opt.cpp b/ml/backend/ggml/ggml/src/ggml-opt.cpp
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-opt.cpp
rename to ml/backend/ggml/ggml/src/ggml-opt.cpp
diff --git a/ml/backend/ggml/ggml/ggml-quants.c b/ml/backend/ggml/ggml/src/ggml-quants.c
similarity index 99%
rename from ml/backend/ggml/ggml/ggml-quants.c
rename to ml/backend/ggml/ggml/src/ggml-quants.c
index 49ab3dafc..7301a9c6c 100644
--- a/ml/backend/ggml/ggml/ggml-quants.c
+++ b/ml/backend/ggml/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@
 
 #include "ggml-quants.h"
 #include "ggml-impl.h"
-#include "ggml-cpu-impl.h"
+#include "ggml-cpu/ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 
 #include <math.h>
diff --git a/ml/backend/ggml/ggml/ggml-quants.h b/ml/backend/ggml/ggml/src/ggml-quants.h
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-quants.h
rename to ml/backend/ggml/ggml/src/ggml-quants.h
diff --git a/ml/backend/ggml/ggml/ggml-threading.cpp b/ml/backend/ggml/ggml/src/ggml-threading.cpp
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-threading.cpp
rename to ml/backend/ggml/ggml/src/ggml-threading.cpp
diff --git a/ml/backend/ggml/ggml/ggml-threading.h b/ml/backend/ggml/ggml/src/ggml-threading.h
similarity index 100%
rename from ml/backend/ggml/ggml/ggml-threading.h
rename to ml/backend/ggml/ggml/src/ggml-threading.h
diff --git a/ml/backend/ggml/ggml/ggml.c b/ml/backend/ggml/ggml/src/ggml.c
similarity index 100%
rename from ml/backend/ggml/ggml/ggml.c
rename to ml/backend/ggml/ggml/src/ggml.c
diff --git a/ml/backend/ggml/ggml/src/ggml.go b/ml/backend/ggml/ggml/src/ggml.go
new file mode 100644
index 000000000..f554b4550
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml.go
@@ -0,0 +1,7 @@
+package ggml
+
+// #cgo CXXFLAGS: -std=c++17
+// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU
+// #cgo CPPFLAGS: -I${SRCDIR}/../include -I${SRCDIR}/ggml-cpu
+import "C"
+import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-cpu"
diff --git a/ml/backend/ggml/ggml/src/ggml_darwin_arm64.go b/ml/backend/ggml/ggml/src/ggml_darwin_arm64.go
new file mode 100644
index 000000000..173cea2bf
--- /dev/null
+++ b/ml/backend/ggml/ggml/src/ggml_darwin_arm64.go
@@ -0,0 +1,9 @@
+package ggml
+
+// #cgo CPPFLAGS: -DGGML_USE_METAL -DGGML_USE_BLAS
+// #cgo LDFLAGS: -framework Foundation
+import "C"
+import (
+	_ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-blas"
+	_ "github.com/ollama/ollama/ml/backend/ggml/ggml/src/ggml-metal"
+)
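Together, ggml.go and ggml_darwin_arm64.go replace the per-platform newBackend() shims deleted below: backend selection now happens at build time, through Go's file-suffix build constraints and side-effect imports, rather than through a runtime constructor. A minimal consumer sketch (hypothetical program; the import path is the real one introduced by this change):

    package main

    // Minimal consumer sketch: blank-importing the ggml src package
    // compiles and links the CPU backend on every platform; on
    // darwin/arm64 the ggml_darwin_arm64.go file above additionally pulls
    // in the Metal and BLAS backends.
    import (
    	_ "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
    )

    func main() {
    	// Nothing to call: the import exists purely for its link-time side
    	// effect of compiling the vendored C sources into this binary.
    }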
diff --git a/ml/backend/ggml/ggml_darwin_amd64.go b/ml/backend/ggml/ggml_darwin_amd64.go
deleted file mode 100644
index 084e11abe..000000000
--- a/ml/backend/ggml/ggml_darwin_amd64.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package ggml
-
-// #include "ggml-backend.h"
-import "C"
-
-func newBackend() *C.struct_ggml_backend {
-	return newCPUBackend()
-}
diff --git a/ml/backend/ggml/ggml_darwin_arm64.go b/ml/backend/ggml/ggml_darwin_arm64.go
deleted file mode 100644
index 117736069..000000000
--- a/ml/backend/ggml/ggml_darwin_arm64.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package ggml
-
-// #include "ggml-metal.h"
-import "C"
-
-func newBackend() *C.struct_ggml_backend {
-	return C.ggml_backend_metal_init()
-}
diff --git a/ml/backend/ggml/ggml_linux.go b/ml/backend/ggml/ggml_linux.go
deleted file mode 100644
index 084e11abe..000000000
--- a/ml/backend/ggml/ggml_linux.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package ggml
-
-// #include "ggml-backend.h"
-import "C"
-
-func newBackend() *C.struct_ggml_backend {
-	return newCPUBackend()
-}
diff --git a/ml/backend/ggml/ggml_windows.go b/ml/backend/ggml/ggml_windows.go
deleted file mode 100644
index 084e11abe..000000000
--- a/ml/backend/ggml/ggml_windows.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package ggml
-
-// #include "ggml-backend.h"
-import "C"
-
-func newBackend() *C.struct_ggml_backend {
-	return newCPUBackend()
-}
diff --git a/model/cmd/main.go b/model/cmd/main.go
index 53d8fab6e..8083956a7 100644
--- a/model/cmd/main.go
+++ b/model/cmd/main.go
@@ -9,6 +9,7 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"github.com/ollama/ollama/cache"
 	"github.com/ollama/ollama/ml"
@@ -33,7 +34,17 @@ func temp() error {
 
 	flag.Parse()
 
-	if len(flag.Args()) != 1 {
+	var prompt string
+	if n := len(flag.Args()); n == 1 {
+		bts, err := io.ReadAll(os.Stdin)
+		if err != nil {
+			return err
+		}
+
+		prompt = string(bts)
+	} else if n > 1 {
+		prompt = strings.Join(flag.Args()[1:], " ")
+	} else {
+		return fmt.Errorf("usage: %s path/to/file