From 054e31146ef7ee33a82ac3c870b71f93791c4cbe Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 9 Dec 2024 15:38:39 -0800 Subject: [PATCH] build: recursive make ggml-cuda --- Makefile2 | 112 ++++++++++++++++++++++ llama/README.md | 3 +- make/common-defs.make | 1 - make/gpu.make | 61 ++---------- ml/backend/ggml/ggml/ggml-cpu/cpu.go | 3 +- ml/backend/ggml/ggml/ggml-cuda/.gitignore | 1 + ml/backend/ggml/ggml/ggml-cuda/Makefile | 64 +++++++++++++ ml/backend/ggml/ggml/ggml-cuda/cuda.go | 4 + ml/backend/ggml/ggml/ggml.go | 3 +- ml/backend/ggml/ggml/ggml_cuda.go | 1 + scripts/build_darwin.sh | 4 +- 11 files changed, 195 insertions(+), 62 deletions(-) create mode 100644 Makefile2 create mode 100644 ml/backend/ggml/ggml/ggml-cuda/.gitignore create mode 100644 ml/backend/ggml/ggml/ggml-cuda/Makefile diff --git a/Makefile2 b/Makefile2 new file mode 100644 index 000000000..ce87e912e --- /dev/null +++ b/Makefile2 @@ -0,0 +1,112 @@ +export GOOS?=$(shell go env GOOS) +export GOARCH?=$(shell go env GOARCH) + +build: llama/build/$(GOOS)-$(GOARCH) + +export GOFLAGS=-trimpath + +llama/build/%/runners/metal: GOFLAGS+=-tags=metal +llama/build/%/runners/cpu_avx: GOFLAGS+=-tags=avx +llama/build/%/runners/cpu_avx2: GOFLAGS+=-tags=avx2 +llama/build/%/runners/cuda_v11: GOFLAGS+=-tags=cuda,cuda_v11 +llama/build/%/runners/cuda_v12: GOFLAGS+=-tags=cuda,cuda_v12 +llama/build/%/runners/rocm: GOFLAGS+=-tags=cuda,rocm + +.PHONY: llama/build/darwin-amd64 llama/build/darwin-arm64 +llama/build/darwin-amd64: llama/build/darwin-amd64/runners/cpu_avx +llama/build/darwin-arm64: llama/build/darwin-arm64/runners/metal + +.PHONY: llama/build/linux-amd64 llama/build/linux-arm64 +llama/build/linux-amd64: llama/build/linux-amd64/runners/cpu_avx +llama/build/linux-amd64: llama/build/linux-amd64/runners/cpu_avx2 +llama/build/linux-arm64: llama/build/linux-arm64/runners/cpu_avx +llama/build/linux-arm64: llama/build/linux-arm64/runners/cpu_avx2 + +.PHONY: llama/build/windows-amd64 linux/build/windows-arm64 +llama/build/windows-amd64: llama/build/windows-amd64/runners/cpu_avx +llama/build/windows-amd64: llama/build/windows-amd64/runners/cpu_avx2 +llama/build/windows-arm64: llama/build/windows-arm64/runners/cpu_avx +llama/build/windows-arm64: llama/build/windows-arm64/runners/cpu_avx2 + +.PHONY: cuda_v11 cuda_v12 +cuda_v11 cuda_v12 rocm: + $(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $@ + +ifeq ($(GOOS),linux) +NVCC11=$(shell command -v /usr/local/cuda-11/bin/nvcc) +NVCC12=$(shell command -v /usr/local/cuda-12/bin/nvcc) +HIPCC=$(shell command -v hipcc) +else ifeq ($(GOOS),windows) +NVCC11=$(shell ls "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.?\\bin\\nvcc.exe") +NVCC12=$(shell ls "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.?\\bin\\nvcc.exe") +HIPCC=$(shell command -v hipcc) +endif + +ifneq ($(NVCC11),) +ifeq ($(OLLAMA_SKIP_GENERATE_CUDA_11),) +CUDA_V11_TARGETS= \ + llama/build/linux-amd64/runners/cuda_v11 \ + llama/build/linux-arm64/runners/cuda_v11 \ + llama/build/windows-amd64/runners/cuda_v11 \ + llama/build/windows-arm64/runners/cuda_v11 \ + +$(CUDA_V11_TARGETS): cuda_v11 +cuda_v11: export NVCC=$(NVCC11) + +llama/build/linux-amd64: llama/build/linux-amd64/runners/cuda_v11 +llama/build/linux-arm64: llama/build/linux-arm64/runners/cuda_v11 +llama/build/windows-amd64: llama/build/windows-amd64/runners/cuda_v11 +llama/build/windows-arm64: llama/build/windows-arm64/runners/cuda_v11 +endif +endif + +ifneq ($(NVCC12),) +ifeq ($(OLLAMA_SKIP_GENERATE_CUDA_12),) +CUDA_V12_TARGETS= \ + llama/build/linux-amd64/runners/cuda_v12 \ + llama/build/linux-arm64/runners/cuda_v12 \ + llama/build/windows-amd64/runners/cuda_v12 \ + llama/build/windows-arm64/runners/cuda_v12 \ + +$(CUDA_V12_TARGETS): cuda_v12 +cuda_v12: export NVCC=$(NVCC12) + +llama/build/linux-amd64: llama/build/linux-amd64/runners/cuda_v12 +llama/build/linux-arm64: llama/build/linux-arm64/runners/cuda_v12 +llama/build/windows-amd64: llama/build/windows-amd64/runners/cuda_v12 +llama/build/windows-arm64: llama/build/windows-arm64/runners/cuda_v12 +endif +endif + +ifneq ($(HIPCC),) +ifeq ($(OLLAMA_SKIP_GENERATE_ROCM),) +ROCM_TARGETS= \ + llama/build/linux-amd64/runners/rocm \ + llama/build/linux-arm64/runners/rocm \ + llama/build/windows-amd64/runners/rocm \ + llama/build/windows-arm64/runners/rocm \ + +$(ROCM_TARGETS): rocm +rocm: export NVCC=$(HIPCC) + +llama/build/linux-amd64: llama/build/linux-amd64/runners/rocm +llama/build/linux-arm64: llama/build/linux-arm64/runners/rocm +llama/build/windows-amd64: llama/build/windows-amd64/runners/rocm +llama/build/windows-arm64: llama/build/windows-arm64/runners/rocm +endif +endif + +export CGO_ENABLED=1 +export CGO_CPPFLAGS_ALLOW=-mfma|-mf16c + +llama/build/%: cmd/runner always + mkdir -p $@; go build -o $@ ./$< + +.PHONY: always +always: + +clean: + $(RM) -r llama/build + +realclean: clean + $(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $< diff --git a/llama/README.md b/llama/README.md index 3b6b20672..a1fef48c2 100644 --- a/llama/README.md +++ b/llama/README.md @@ -37,8 +37,7 @@ go build -tags avx . ```shell # go doesn't recognize `-mfma` as a valid compiler flag # see https://github.com/golang/go/issues/17895 -go env -w "CGO_CFLAGS_ALLOW=-mfma|-mf16c" -go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c" +go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c" go build -tags=avx,avx2 . ``` diff --git a/make/common-defs.make b/make/common-defs.make index 310fc37f3..853c526a3 100644 --- a/make/common-defs.make +++ b/make/common-defs.make @@ -94,4 +94,3 @@ CPPFLAGS += \ -I../ml/backend/ggml/ggml/include \ -I../ml/backend/ggml/ggml/ggml-cpu \ -I../ml/backend/ggml/ggml/ggml-cpu/amx \ ->>>>>>> 22320f09 (preserve vendor directory structure):llama/make/common-defs.make diff --git a/make/gpu.make b/make/gpu.make index d08e794bc..262a07291 100644 --- a/make/gpu.make +++ b/make/gpu.make @@ -11,74 +11,31 @@ GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS # today, cuda is bundled, but rocm is split out. Should split them each out by runner DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR) - -GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) - -GPU_RUNNER_SRCS := \ - $(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \ - $(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \ - llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-aarch64.c llama/ggml-threading.cpp -GPU_RUNNER_HDRS := \ - $(wildcard llama/ggml-cuda/*.cuh) - - -# Conditional flags and components to speed up developer builds -ifneq ($(OLLAMA_FAST_BUILD),) - GPU_COMPILER_CUFLAGS += \ - -DGGML_DISABLE_FLASH_ATTN -else - GPU_RUNNER_SRCS += \ - $(wildcard llama/ggml-cuda/fattn*.cu) \ - $(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \ - $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \ - $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \ - $(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu) -endif - -GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) -GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) -GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT))) - DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT))) BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT))) - -$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) +$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) dist: $(DIST_RUNNERS) # Build targets -$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu - @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $< -$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c - @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $< -$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp - @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $< $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/" -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) - @-mkdir -p $(dir $@) +$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) + @-mkdir -p $(@D) + $(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $(GPU_RUNNER_NAME) CXX=$(GPU_COMPILER) GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) - @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@ # Distribution targets $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% - @-mkdir -p $(dir $@) + @-mkdir -p $(@D) $(CP) $< $@ $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS) -$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) - @-mkdir -p $(dir $@) - $(CP) $< $@ $(GPU_DIST_LIB_DEPS): - @-mkdir -p $(dir $@) - $(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@) + @-mkdir -p $(@D) + $(CP) $(GPU_LIB_DIR)/$(@F) $(@D) -clean: - rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS) +clean: + $(RM) $(BUILD_RUNNERS) $(DIST_RUNNERS) .PHONY: clean $(GPU_RUNNER_NAME) diff --git a/ml/backend/ggml/ggml/ggml-cpu/cpu.go b/ml/backend/ggml/ggml/ggml-cpu/cpu.go index 3358c856f..a8e12a28d 100644 --- a/ml/backend/ggml/ggml/ggml-cpu/cpu.go +++ b/ml/backend/ggml/ggml/ggml-cpu/cpu.go @@ -4,8 +4,7 @@ package cpu // #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../include -I${SRCDIR}/amx // #cgo CPPFLAGS: -D_GNU_SOURCE // #cgo amd64,avx CPPFLAGS: -mavx -// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -// #cgo amd64,f16c CPPFLAGS: -mf16c +// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -mf16c // #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA import "C" import ( diff --git a/ml/backend/ggml/ggml/ggml-cuda/.gitignore b/ml/backend/ggml/ggml/ggml-cuda/.gitignore new file mode 100644 index 000000000..5761abcfd --- /dev/null +++ b/ml/backend/ggml/ggml/ggml-cuda/.gitignore @@ -0,0 +1 @@ +*.o diff --git a/ml/backend/ggml/ggml/ggml-cuda/Makefile b/ml/backend/ggml/ggml/ggml-cuda/Makefile new file mode 100644 index 000000000..170eaf7d2 --- /dev/null +++ b/ml/backend/ggml/ggml/ggml-cuda/Makefile @@ -0,0 +1,64 @@ +NVCC?=nvcc + +NVCC_PREPEND_FLAGS= \ + -t 2 \ + -split-compile 0 \ + -std=c++17 \ + -I.. \ + -I../include \ + $(foreach ARCH,$(subst ;, ,$(CUDA_ARCHS)),--generate-code=arch=compute_$(ARCH),code=sm_$(ARCH)) \ + +NVCC_APPEND_FLAGS= \ + # -DGGML_CUDA_USE_GRAPHS=1 \ + +ALL_SOURCES=$(wildcard *.cu) +FATTN_SOURCES=$(wildcard fattn*.cu) + +SOURCES= \ + $(filter-out $(FATTN_SOURCES),$(ALL_SOURCES)) \ + $(wildcard template-instances/mmq*.cu) \ + +ifneq ($(OLLAMA_FAST_BUILD),) +NVCC_APPEND_FLAGS+=-DGGML_DISABLE_FLASH_ATTN +else +SOURCES+= \ + $(FATTN_SOURCES) \ + $(wildcard template-instances/fattn-wmma*.cu) \ + $(wildcard template-instances/fattn-vec*q4_0-q4_0.cu) \ + $(wildcard template-instances/fattn-vec*q8_0-q8_0.cu) \ + $(wildcard template-instances/fattn-vec*f16-f16.cu) +endif + +all: cuda_v11 cuda_v12 + +cuda_v11: CUDA_ARCHS?=50;52;53;60;61;62;70;72;75;80;86 +cuda_v11: OBJECTS=$(patsubst %.cu,%.v11.o,$(SOURCES)) + +cuda_v12: CUDA_ARCHS?=60;61;62;70;72;75;80;86;87;89;90;90a +cuda_v12: OBJECTS=$(patsubst %.cu,%.v12.o,$(SOURCES)) + +rocm: CPPFLAGS+=-DGGML_USE_HIP +rocm: OBJECTS=$(patsubst %.cu,%.rocm.o,$(SOURCES)) + +ifeq ($(OS),Windows_NT) +cuda_v11: libggml_cuda_v11.dll +cuda_v12: libggml_cuda_v12.dll +rocm: libggml_rocm.dll +else +cuda_v11: libggml_cuda_v11.a +cuda_v12: libggml_cuda_v12.a +rocm: libggml_rocm.a +endif + +clean: + $(RM) *.a *.o template-instances/*.o + +%.v11.o %.v12.o %.rocm.o: %.cu + $(NVCC) $(NVCC_PREPEND_FLAGS) -c $< -o $@ $(NVCC_APPEND_FLAGS) + +.SECONDEXPANSION: +%.a: $$(OBJECTS) + $(AR) rcs $@ $^ + +%.dll: $$(OBJECTS) + $(NVCC) -shared -o $@ $^ diff --git a/ml/backend/ggml/ggml/ggml-cuda/cuda.go b/ml/backend/ggml/ggml/ggml-cuda/cuda.go index f1fead2bf..529e0e388 100644 --- a/ml/backend/ggml/ggml/ggml-cuda/cuda.go +++ b/ml/backend/ggml/ggml/ggml-cuda/cuda.go @@ -1,3 +1,7 @@ package cuda +// #cgo cuda_v11 LDFLAGS: -L. -lggml_cuda_v11 +// #cgo cuda_v12 LDFLAGS: -L. -lggml_cuda_v12 +// #cgo cuda_v11 cuda_v12 LDFLAGS: -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublasLt +// #cgo rocm LDFLAGS: -L. -lggml_rocm -L/opt/rocm/lib -lhipblas -lamdhip64 -lrocblas import "C" diff --git a/ml/backend/ggml/ggml/ggml.go b/ml/backend/ggml/ggml/ggml.go index efbc645dd..a3c1ed1fc 100644 --- a/ml/backend/ggml/ggml/ggml.go +++ b/ml/backend/ggml/ggml/ggml.go @@ -5,8 +5,7 @@ package ggml // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU // #cgo darwin LDFLAGS: -framework Foundation // #cgo amd64,avx CPPFLAGS: -mavx -// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -// #cgo amd64,f16c CPPFLAGS: -mf16c +// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -mf16c // #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA import "C" import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cpu" diff --git a/ml/backend/ggml/ggml/ggml_cuda.go b/ml/backend/ggml/ggml/ggml_cuda.go index 009d5bbcd..09f685f84 100644 --- a/ml/backend/ggml/ggml/ggml_cuda.go +++ b/ml/backend/ggml/ggml/ggml_cuda.go @@ -3,5 +3,6 @@ package ggml // #cgo CPPFLAGS: -DGGML_USE_CUDA +// #cgo rocm CPPFLAGS: -DGGML_USE_HIP import "C" import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cuda" diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh index 595c3ce71..c2151841d 100755 --- a/scripts/build_darwin.sh +++ b/scripts/build_darwin.sh @@ -10,9 +10,7 @@ mkdir -p dist # If installed to an alternate location use the following to enable # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk # export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer -export CGO_CFLAGS=-mmacosx-version-min=11.3 -export CGO_CXXFLAGS=-mmacosx-version-min=11.3 -export CGO_LDFLAGS=-mmacosx-version-min=11.3 +export CGO_CPPFLAGS=-mmacosx-version-min=11.3 rm -rf llama/build dist/darwin-* echo "Building darwin arm64"