From bd933c24bc2220917584e2df3ecabdc3663fdf20 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 3 Mar 2024 00:51:07 -0800 Subject: [PATCH] testing new cmake script --- llm/dyn_ext_server.c | 145 ---------- llm/dyn_ext_server.h | 74 ----- llm/ext_server/CMakeLists.txt | 25 -- llm/ext_server/README.md | 18 -- llm/ext_server/ext_server.cpp | 381 ------------------------- llm/ext_server/ext_server.h | 95 ------ llm/generate/gen_common.sh | 125 -------- llm/generate/gen_darwin.sh | 77 ----- llm/generate/gen_linux.sh | 196 ------------- llm/generate/gen_windows.ps1 | 209 -------------- llm/generate/generate_darwin.go | 3 - llm/generate/generate_linux.go | 3 - llm/generate/generate_windows.go | 3 - llm/llama.cpp | 1 - llm/llm.go | 10 + llm/llm_darwin_amd64.go | 14 + llm/llm_darwin_arm64.go | 8 + llm/patches/02-cudaleaks.diff | 114 -------- llm/payload_darwin_amd64.go | 8 - llm/payload_darwin_arm64.go | 8 - llm/payload_linux.go | 8 - llm/payload_test.go | 58 ---- llm/payload_windows.go | 8 - llm/server/.gitignore | 1 + llm/server/CMakeLists.txt | 93 ++++++ llm/{ => server}/patches/01-cache.diff | 0 llm/utils.go | 15 - 27 files changed, 126 insertions(+), 1574 deletions(-) delete mode 100644 llm/dyn_ext_server.c delete mode 100644 llm/dyn_ext_server.h delete mode 100644 llm/ext_server/CMakeLists.txt delete mode 100644 llm/ext_server/README.md delete mode 100644 llm/ext_server/ext_server.cpp delete mode 100644 llm/ext_server/ext_server.h delete mode 100644 llm/generate/gen_common.sh delete mode 100755 llm/generate/gen_darwin.sh delete mode 100755 llm/generate/gen_linux.sh delete mode 100644 llm/generate/gen_windows.ps1 delete mode 100644 llm/generate/generate_darwin.go delete mode 100644 llm/generate/generate_linux.go delete mode 100644 llm/generate/generate_windows.go delete mode 160000 llm/llama.cpp create mode 100644 llm/llm_darwin_amd64.go create mode 100644 llm/llm_darwin_arm64.go delete mode 100644 llm/patches/02-cudaleaks.diff delete mode 100644 llm/payload_darwin_amd64.go delete mode 100644 llm/payload_darwin_arm64.go delete mode 100644 llm/payload_linux.go delete mode 100644 llm/payload_test.go delete mode 100644 llm/payload_windows.go create mode 100644 llm/server/.gitignore create mode 100644 llm/server/CMakeLists.txt rename llm/{ => server}/patches/01-cache.diff (100%) delete mode 100644 llm/utils.go diff --git a/llm/dyn_ext_server.c b/llm/dyn_ext_server.c deleted file mode 100644 index 47dc4e99b..000000000 --- a/llm/dyn_ext_server.c +++ /dev/null @@ -1,145 +0,0 @@ -#include "dyn_ext_server.h" - -#include -#include - -#ifdef __linux__ -#include -#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) -#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) -#define LOAD_ERR() strdup(dlerror()) -#define UNLOAD_LIBRARY(handle) dlclose(handle) -#elif _WIN32 -#include -#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib) -#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym) -#define UNLOAD_LIBRARY(handle) FreeLibrary(handle) -inline char *LOAD_ERR() { - LPSTR messageBuffer = NULL; - size_t size = FormatMessageA( - FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - (LPSTR)&messageBuffer, 0, NULL); - char *resp = strdup(messageBuffer); - LocalFree(messageBuffer); - return resp; -} -#else -#include -#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) -#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) -#define LOAD_ERR() strdup(dlerror()) -#define UNLOAD_LIBRARY(handle) 
dlclose(handle) -#endif - -void dyn_init(const char *libPath, struct dynamic_llama_server *s, - ext_server_resp_t *err) { - int i = 0; - struct lookup { - char *s; - void **p; - } l[] = { - {"llama_server_init", (void *)&s->llama_server_init}, - {"llama_server_start", (void *)&s->llama_server_start}, - {"llama_server_stop", (void *)&s->llama_server_stop}, - {"llama_server_completion", (void *)&s->llama_server_completion}, - {"llama_server_completion_next_result", - (void *)&s->llama_server_completion_next_result}, - {"llama_server_completion_cancel", - (void *)&s->llama_server_completion_cancel}, - {"llama_server_release_task_result", - (void *)&s->llama_server_release_task_result}, - {"llama_server_tokenize", (void *)&s->llama_server_tokenize}, - {"llama_server_detokenize", (void *)&s->llama_server_detokenize}, - {"llama_server_embedding", (void *)&s->llama_server_embedding}, - {"llama_server_release_json_resp", - (void *)&s->llama_server_release_json_resp}, - {"", NULL}, - }; - - printf("loading library %s\n", libPath); - s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW); - if (!s->handle) { - err->id = -1; - char *msg = LOAD_ERR(); - snprintf(err->msg, err->msg_len, - "Unable to load dynamic server library: %s", msg); - free(msg); - return; - } - - for (i = 0; l[i].p != NULL; i++) { - *l[i].p = LOAD_SYMBOL(s->handle, l[i].s); - if (!l[i].p) { - UNLOAD_LIBRARY(s->handle); - err->id = -1; - char *msg = LOAD_ERR(); - snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s", - l[i].s, msg); - free(msg); - return; - } - } -} - -inline void dyn_llama_server_init(struct dynamic_llama_server s, - ext_server_params_t *sparams, - ext_server_resp_t *err) { - s.llama_server_init(sparams, err); -} - -inline void dyn_llama_server_start(struct dynamic_llama_server s) { - s.llama_server_start(); -} - -inline void dyn_llama_server_stop(struct dynamic_llama_server s) { - s.llama_server_stop(); -} - -inline void dyn_llama_server_completion(struct dynamic_llama_server s, - const char *json_req, - ext_server_resp_t *resp) { - s.llama_server_completion(json_req, resp); -} - -inline void dyn_llama_server_completion_next_result( - struct dynamic_llama_server s, const int task_id, - ext_server_task_result_t *result) { - s.llama_server_completion_next_result(task_id, result); -} - -inline void dyn_llama_server_completion_cancel( - struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) { - s.llama_server_completion_cancel(task_id, err); -} -inline void dyn_llama_server_release_task_result( - struct dynamic_llama_server s, ext_server_task_result_t *result) { - s.llama_server_release_task_result(result); -} - -inline void dyn_llama_server_tokenize(struct dynamic_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { - s.llama_server_tokenize(json_req, json_resp, err); -} - -inline void dyn_llama_server_detokenize(struct dynamic_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { - s.llama_server_detokenize(json_req, json_resp, err); -} - -inline void dyn_llama_server_embedding(struct dynamic_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err) { - s.llama_server_embedding(json_req, json_resp, err); -} - -inline void dyn_llama_server_release_json_resp( - struct dynamic_llama_server s, char **json_resp) { - s.llama_server_release_json_resp(json_resp); -} diff --git a/llm/dyn_ext_server.h b/llm/dyn_ext_server.h deleted file mode 100644 index cddf4a1f0..000000000 --- 
a/llm/dyn_ext_server.h +++ /dev/null @@ -1,74 +0,0 @@ -#include - -#include "ext_server.h" - -#ifdef __cplusplus -extern "C" { -#endif -struct dynamic_llama_server { - void *handle; - void (*llama_server_init)(ext_server_params_t *sparams, - ext_server_resp_t *err); - void (*llama_server_start)(); - void (*llama_server_stop)(); - void (*llama_server_completion)(const char *json_req, - ext_server_resp_t *resp); - void (*llama_server_completion_next_result)(const int task_id, - ext_server_task_result_t *result); - void (*llama_server_completion_cancel)(const int task_id, - ext_server_resp_t *err); - void (*llama_server_release_task_result)(ext_server_task_result_t *result); - void (*llama_server_tokenize)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_detokenize)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_embedding)(const char *json_req, char **json_resp, - ext_server_resp_t *err); - void (*llama_server_release_json_resp)(char **json_resp); -}; - -void dyn_init(const char *libPath, struct dynamic_llama_server *s, - ext_server_resp_t *err); - -// No good way to call C function pointers from Go so inline the indirection -void dyn_llama_server_init(struct dynamic_llama_server s, - ext_server_params_t *sparams, - ext_server_resp_t *err); - -void dyn_llama_server_start(struct dynamic_llama_server s); - -void dyn_llama_server_stop(struct dynamic_llama_server s); - -void dyn_llama_server_completion(struct dynamic_llama_server s, - const char *json_req, - ext_server_resp_t *resp); - -void dyn_llama_server_completion_next_result( - struct dynamic_llama_server s, const int task_id, - ext_server_task_result_t *result); - -void dyn_llama_server_completion_cancel(struct dynamic_llama_server s, - const int task_id, - ext_server_resp_t *err); - -void dyn_llama_server_release_task_result( - struct dynamic_llama_server s, ext_server_task_result_t *result); - -void dyn_llama_server_tokenize(struct dynamic_llama_server s, - const char *json_req, char **json_resp, - ext_server_resp_t *err); - -void dyn_llama_server_detokenize(struct dynamic_llama_server s, - const char *json_req, - char **json_resp, - ext_server_resp_t *err); - -void dyn_llama_server_embedding(struct dynamic_llama_server s, - const char *json_req, char **json_resp, - ext_server_resp_t *err); -void dyn_llama_server_release_json_resp(struct dynamic_llama_server s, - char **json_resp); - -#ifdef __cplusplus -} -#endif \ No newline at end of file diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt deleted file mode 100644 index dd1831fcf..000000000 --- a/llm/ext_server/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -# Ollama specific CMakefile to include in llama.cpp/examples/server - -set(TARGET ext_server) -option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -if (WIN32) - add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp) -else() - add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp) -endif() -target_include_directories(${TARGET} PRIVATE ../../common) -target_include_directories(${TARGET} PRIVATE ../..) -target_include_directories(${TARGET} PRIVATE ../../..) 
-target_compile_features(${TARGET} PRIVATE cxx_std_11) -target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1) -target_link_libraries(${TARGET} PRIVATE ggml llava common ) -set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) -target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$) -install(TARGETS ext_server LIBRARY) - -if (CUDAToolkit_FOUND) - target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) - if (WIN32) - target_link_libraries(${TARGET} PRIVATE nvml) - endif() -endif() \ No newline at end of file diff --git a/llm/ext_server/README.md b/llm/ext_server/README.md deleted file mode 100644 index bfb0d4a6a..000000000 --- a/llm/ext_server/README.md +++ /dev/null @@ -1,18 +0,0 @@ -# Extern C Server - -This directory contains a thin facade we layer on top of the Llama.cpp server to -expose `extern C` interfaces to access the functionality through direct API -calls in-process. The llama.cpp code uses compile time macros to configure GPU -type along with other settings. During the `go generate ./...` execution, the -build will generate one or more copies of the llama.cpp `extern C` server based -on what GPU libraries are detected to support multiple GPU types as well as CPU -only support. The Ollama go build then embeds these different servers to support -different GPUs and settings at runtime. - -If you are making changes to the code in this directory, make sure to disable -caching during your go build to ensure you pick up your changes. A typical -iteration cycle from the top of the source tree looks like: - -``` -go generate ./... && go build -a . -``` \ No newline at end of file diff --git a/llm/ext_server/ext_server.cpp b/llm/ext_server/ext_server.cpp deleted file mode 100644 index 679029d98..000000000 --- a/llm/ext_server/ext_server.cpp +++ /dev/null @@ -1,381 +0,0 @@ -#include "ext_server.h" -#include - -// Necessary evil since the server types are not defined in a header -#include "server.cpp" - -// Low level API access to verify GPU access -#if defined(GGML_USE_CUBLAS) -#if defined(GGML_USE_HIPBLAS) -#include -#include -#include -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ -#define cudaGetDevice hipGetDevice -#define cudaError_t hipError_t -#define cudaSuccess hipSuccess -#define cudaGetErrorString hipGetErrorString -#else -#include -#include -#include -#endif // defined(GGML_USE_HIPBLAS) -#endif // GGML_USE_CUBLAS - -// Expose the llama server as a callable extern "C" API -llama_server_context *llama = NULL; -std::thread ext_server_thread; -bool shutting_down = false; -std::atomic_int recv_counter; - -// RAII wrapper for tracking in-flight recv calls -class atomicRecv { - public: - atomicRecv(std::atomic &atomic) : atomic(atomic) { - ++this->atomic; - } - ~atomicRecv() { - --this->atomic; - } - private: - std::atomic &atomic; -}; - -void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) { - recv_counter = 0; - assert(err != NULL && sparams != NULL); - log_set_target(stderr); - if (!sparams->verbose_logging) { - server_verbose = true; - log_disable(); - } - - LOG_TEE("system info: %s\n", llama_print_system_info()); - err->id = 0; - err->msg[0] = '\0'; - try { - llama = new llama_server_context; - gpt_params params; - params.n_ctx = sparams->n_ctx; - params.n_batch = sparams->n_batch; - if (sparams->n_threads > 0) { - params.n_threads = sparams->n_threads; - } - params.n_parallel = sparams->n_parallel; - 
params.rope_freq_base = sparams->rope_freq_base; - params.rope_freq_scale = sparams->rope_freq_scale; - - if (sparams->memory_f16) { - params.cache_type_k = "f16"; - params.cache_type_v = "f16"; - } else { - params.cache_type_k = "f32"; - params.cache_type_v = "f32"; - } - - params.n_gpu_layers = sparams->n_gpu_layers; - params.main_gpu = sparams->main_gpu; - params.use_mlock = sparams->use_mlock; - params.use_mmap = sparams->use_mmap; - params.numa = (ggml_numa_strategy)sparams->numa; - params.embedding = sparams->embedding; - if (sparams->model != NULL) { - params.model = sparams->model; - } - - if (sparams->lora_adapters != NULL) { - for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; - la = la->next) { - params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale)); - } - - params.use_mmap = false; - } - - if (sparams->mmproj != NULL) { - params.mmproj = std::string(sparams->mmproj); - } - -#if defined(GGML_USE_CUBLAS) - // Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible - LOG_TEE("Performing pre-initialization of GPU\n"); - int id; - cudaError_t cudaErr = cudaGetDevice(&id); - if (cudaErr != cudaSuccess) { - err->id = -1; - snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr)); - return; - } -#endif - - llama_backend_init(); - llama_numa_init(params.numa); - - // load the model - if (!llama->load_model(params)) { - // TODO - consider modifying the logging logic or patching load_model so - // we can capture more detailed error messages and pass them back to the - // caller for better UX - err->id = -1; - snprintf(err->msg, err->msg_len, "error loading model %s", - params.model.c_str()); - return; - } - - llama->initialize(); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) { - err->id = -1; - snprintf(err->msg, err->msg_len, - "Unknown exception initializing llama server"); - } -} - -void llama_server_start() { - assert(llama != NULL); - // TODO mutex to protect thread creation - ext_server_thread = std::thread([&]() { - try { - LOG_TEE("llama server main loop starting\n"); - ggml_time_init(); - llama->queue_tasks.on_new_task(std::bind( - &llama_server_context::process_single_task, llama, std::placeholders::_1)); - llama->queue_tasks.on_finish_multitask(std::bind( - &llama_server_context::on_finish_multitask, llama, std::placeholders::_1)); - llama->queue_tasks.on_run_slots(std::bind( - &llama_server_context::update_slots, llama)); - llama->queue_results.on_multitask_update(std::bind( - &llama_server_queue::update_multitask, - &llama->queue_tasks, - std::placeholders::_1, - std::placeholders::_2, - std::placeholders::_3 - )); - llama->queue_tasks.start_loop(); - } catch (std::exception &e) { - LOG_TEE("caught exception in llama server main loop: %s\n", e.what()); - } catch (...) { - LOG_TEE("caught unknown exception in llama server main loop\n"); - } - LOG_TEE("\nllama server shutting down\n"); - llama_backend_free(); - }); -} - -void llama_server_stop() { - assert(llama != NULL); - // Shutdown any in-flight requests and block incoming requests. 
- LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n"); - shutting_down = true; - - while (recv_counter.load() > 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - } - - // This may take a while for any pending tasks to drain - // TODO - consider a timeout to cancel tasks if it's taking too long - llama->queue_tasks.terminate(); - ext_server_thread.join(); - delete llama; - llama = NULL; - LOG_TEE("llama server shutdown complete\n"); - shutting_down = false; -} - -void llama_server_completion(const char *json_req, ext_server_resp_t *resp) { - assert(llama != NULL && json_req != NULL && resp != NULL); - resp->id = -1; - resp->msg[0] = '\0'; - try { - if (shutting_down) { - throw std::runtime_error("server shutting down"); - } - json data = json::parse(json_req); - resp->id = llama->queue_tasks.get_new_id(); - llama->queue_results.add_waiting_task_id(resp->id); - llama->request_completion(resp->id, data, false, false, -1); - } catch (std::exception &e) { - snprintf(resp->msg, resp->msg_len, "exception %s", e.what()); - } catch (...) { - snprintf(resp->msg, resp->msg_len, "Unknown exception during completion"); - } -} - -void llama_server_completion_next_result(const int task_id, - ext_server_task_result_t *resp) { - assert(llama != NULL && resp != NULL); - resp->id = -1; - resp->stop = false; - resp->error = false; - resp->json_resp = NULL; - std::string result_json; - try { - atomicRecv ar(recv_counter); - task_result result = llama->queue_results.recv(task_id); - result_json = - result.result_json.dump(-1, ' ', false, json::error_handler_t::replace); - resp->id = result.id; - resp->stop = result.stop; - resp->error = result.error; - if (result.error) { - LOG_TEE("next result cancel on error\n"); - llama->request_cancel(task_id); - LOG_TEE("next result removing waiting tak ID: %d\n", task_id); - llama->queue_results.remove_waiting_task_id(task_id); - } else if (result.stop) { - LOG_TEE("next result cancel on stop\n"); - llama->request_cancel(task_id); - LOG_TEE("next result removing waiting task ID: %d\n", task_id); - llama->queue_results.remove_waiting_task_id(task_id); - } else if (shutting_down) { - LOG_TEE("aborting completion due to shutdown %d\n", task_id); - llama->request_cancel(task_id); - llama->queue_results.remove_waiting_task_id(task_id); - resp->stop = true; - } - } catch (std::exception &e) { - resp->error = true; - resp->id = -1; - result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}"; - LOG_TEE("llama server completion exception %s\n", e.what()); - } catch (...) { - resp->error = true; - resp->id = -1; - result_json = "{\"error\":\"Unknown exception during completion\"}"; - LOG_TEE("llama server completion unknown exception\n"); - } - const std::string::size_type size = result_json.size() + 1; - resp->json_resp = new char[size]; - snprintf(resp->json_resp, size, "%s", result_json.c_str()); -} - -void llama_server_release_task_result(ext_server_task_result_t *result) { - if (result == NULL || result->json_resp == NULL) { - return; - } - delete[] result->json_resp; -} - -void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) { - assert(llama != NULL && err != NULL); - err->id = 0; - err->msg[0] = '\0'; - try { - llama->request_cancel(task_id); - llama->queue_results.remove_waiting_task_id(task_id); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) 
{ - err->id = -1; - snprintf(err->msg, err->msg_len, - "Unknown exception completion cancel in llama server"); - } -} - -void llama_server_tokenize(const char *json_req, char **json_resp, - ext_server_resp_t *err) { - assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); - *json_resp = NULL; - err->id = 0; - err->msg[0] = '\0'; - try { - if (shutting_down) { - throw std::runtime_error("server shutting down"); - } - const json body = json::parse(json_req); - std::vector tokens; - if (body.count("content") != 0) { - tokens = llama->tokenize(body["content"], false); - } - const json data = format_tokenizer_response(tokens); - std::string result_json = data.dump(); - const std::string::size_type size = result_json.size() + 1; - *json_resp = new char[size]; - snprintf(*json_resp, size, "%s", result_json.c_str()); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) { - err->id = -1; - snprintf(err->msg, err->msg_len, "Unknown exception during tokenize"); - } -} - -void llama_server_release_json_resp(char **json_resp) { - if (json_resp == NULL || *json_resp == NULL) { - return; - } - delete[] *json_resp; -} - -void llama_server_detokenize(const char *json_req, char **json_resp, - ext_server_resp_t *err) { - assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); - *json_resp = NULL; - err->id = 0; - err->msg[0] = '\0'; - try { - if (shutting_down) { - throw std::runtime_error("server shutting down"); - } - const json body = json::parse(json_req); - std::string content; - if (body.count("tokens") != 0) { - const std::vector tokens = body["tokens"]; - content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend()); - } - const json data = format_detokenized_response(content); - std::string result_json = data.dump(); - const std::string::size_type size = result_json.size() + 1; - *json_resp = new char[size]; - snprintf(*json_resp, size, "%s", result_json.c_str()); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) { - err->id = -1; - snprintf(err->msg, err->msg_len, "Unknown exception during detokenize"); - } -} - -void llama_server_embedding(const char *json_req, char **json_resp, - ext_server_resp_t *err) { - assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); - *json_resp = NULL; - err->id = 0; - err->msg[0] = '\0'; - try { - if (shutting_down) { - throw std::runtime_error("server shutting down"); - } - const json body = json::parse(json_req); - json prompt; - if (body.count("content") != 0) { - prompt = body["content"]; - } else { - prompt = ""; - } - const int task_id = llama->queue_tasks.get_new_id(); - llama->queue_results.add_waiting_task_id(task_id); - llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1); - atomicRecv ar(recv_counter); - task_result result = llama->queue_results.recv(task_id); - std::string result_json = result.result_json.dump(); - const std::string::size_type size = result_json.size() + 1; - *json_resp = new char[size]; - snprintf(*json_resp, size, "%s", result_json.c_str()); - llama->queue_results.remove_waiting_task_id(task_id); - } catch (std::exception &e) { - err->id = -1; - snprintf(err->msg, err->msg_len, "exception %s", e.what()); - } catch (...) 
{ - err->id = -1; - snprintf(err->msg, err->msg_len, "Unknown exception during embedding"); - } -} \ No newline at end of file diff --git a/llm/ext_server/ext_server.h b/llm/ext_server/ext_server.h deleted file mode 100644 index 9b9ce2ecd..000000000 --- a/llm/ext_server/ext_server.h +++ /dev/null @@ -1,95 +0,0 @@ -#if defined(LLAMA_SERVER_LIBRARY) -#ifndef LLAMA_SERVER_H -#define LLAMA_SERVER_H -#include -#include -#include -#include - -int __main(int argc, char **argv); - -// This exposes extern C entrypoints into the llama_server -// To enable the server compile with LLAMA_SERVER_LIBRARY - -#ifdef __cplusplus -extern "C" { -#endif -typedef struct ext_server_resp { - int id; // < 0 on error - size_t msg_len; // caller must allocate msg and set msg_len - char *msg; -} ext_server_resp_t; - -// Allocated and freed by caller -typedef struct ext_server_lora_adapter { - char *adapter; - float scale; - struct ext_server_lora_adapter *next; -} ext_server_lora_adapter_t; - -// Allocated and freed by caller -typedef struct ext_server_params { - char *model; - uint32_t n_ctx; // token context window, 0 = from model - uint32_t n_batch; // prompt processing maximum batch size - uint32_t n_threads; // number of threads to use for generation - int32_t n_parallel; // number of parallel sequences to decodewra - float rope_freq_base; // RoPE base frequency, 0 = from model - float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model - bool memory_f16; // use f16 instead of f32 for memory kv - int32_t n_gpu_layers; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu; // the GPU that is used for scratch and small tensors - bool use_mlock; // force system to keep model in RAM - bool use_mmap; // use mmap if possible - int numa; // attempt optimizations that help on some NUMA systems - bool embedding; // get only sentence embedding - ext_server_lora_adapter_t *lora_adapters; - char *mmproj; - bool verbose_logging; // Enable verbose logging of the server -} ext_server_params_t; - -typedef struct ext_server_task_result { - int id; - bool stop; - bool error; - char *json_resp; // null terminated, memory managed by ext_server -} ext_server_task_result_t; - -// Initialize the server once per process -// err->id = 0 for success and err->msg[0] = NULL -// err->id != 0 for failure, and err->msg contains error message -void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err); - -// Run the main loop, called once per init -void llama_server_start(); -// Stop the main loop and free up resources allocated in init and start. 
Init -// must be called again to reuse -void llama_server_stop(); - -// json_req null terminated string, memory managed by caller -// resp->id >= 0 on success (task ID) -// resp->id < 0 on error, and resp->msg contains error message -void llama_server_completion(const char *json_req, ext_server_resp_t *resp); - -// Caller must call llama_server_release_task_result to free resp->json_resp -void llama_server_completion_next_result(const int task_id, - ext_server_task_result_t *result); -void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err); -void llama_server_release_task_result(ext_server_task_result_t *result); - -// Caller must call llama_server_releaes_json_resp to free json_resp if err.id < -// 0 -void llama_server_tokenize(const char *json_req, char **json_resp, - ext_server_resp_t *err); -void llama_server_detokenize(const char *json_req, char **json_resp, - ext_server_resp_t *err); -void llama_server_embedding(const char *json_req, char **json_resp, - ext_server_resp_t *err); -void llama_server_release_json_resp(char **json_resp); - -#ifdef __cplusplus -} -#endif - -#endif -#endif // LLAMA_SERVER_LIBRARY \ No newline at end of file diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh deleted file mode 100644 index c6209f458..000000000 --- a/llm/generate/gen_common.sh +++ /dev/null @@ -1,125 +0,0 @@ -# common logic accross linux and darwin - -init_vars() { - case "${GOARCH}" in - "amd64") - ARCH="x86_64" - ;; - "arm64") - ARCH="arm64" - ;; - *) - ARCH=$(uname -m | sed -e "s/aarch64/arm64/g") - esac - - LLAMACPP_DIR=../llama.cpp - CMAKE_DEFS="" - CMAKE_TARGETS="--target ext_server" - if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then - CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" - else - # TODO - add additional optimization flags... - CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}" - fi - case $(uname -s) in - "Darwin") - LIB_EXT="dylib" - WHOLE_ARCHIVE="-Wl,-force_load" - NO_WHOLE_ARCHIVE="" - GCC_ARCH="-arch ${ARCH}" - ;; - "Linux") - LIB_EXT="so" - WHOLE_ARCHIVE="-Wl,--whole-archive" - NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive" - - # Cross compiling not supported on linux - Use docker - GCC_ARCH="" - ;; - *) - ;; - esac - if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then - CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" - fi -} - -git_module_setup() { - if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then - echo "Skipping submodule initialization" - return - fi - # Make sure the tree is clean after the directory moves - if [ -d "${LLAMACPP_DIR}/gguf" ]; then - echo "Cleaning up old submodule" - rm -rf ${LLAMACPP_DIR} - fi - git submodule init - git submodule update --force ${LLAMACPP_DIR} - -} - -apply_patches() { - # Wire up our CMakefile - if ! 
grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then - echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt - fi - - if [ -n "$(ls -A ../patches/*.diff)" ]; then - # apply temporary patches until fix is upstream - for patch in ../patches/*.diff; do - for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do - (cd ${LLAMACPP_DIR}; git checkout ${file}) - done - done - for patch in ../patches/*.diff; do - (cd ${LLAMACPP_DIR} && git apply ${patch}) - done - fi - - # Avoid duplicate main symbols when we link into the cgo binary - sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp && - mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp -} - -build() { - cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} - cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 - mkdir -p ${BUILD_DIR}/lib/ - g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \ - ${GCC_ARCH} \ - ${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \ - ${BUILD_DIR}/common/libcommon.a \ - ${BUILD_DIR}/libllama.a \ - -Wl,-rpath,\$ORIGIN \ - -lpthread -ldl -lm \ - ${EXTRA_LIBS} -} - -compress_libs() { - echo "Compressing payloads to reduce overall binary size..." - pids="" - rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz - for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do - gzip -n --best -f ${lib} & - pids+=" $!" - done - echo - for pid in ${pids}; do - wait $pid - done - echo "Finished compression" -} - -# Keep the local tree clean after we're done with the build -cleanup() { - (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp) - - if [ -n "$(ls -A ../patches/*.diff)" ]; then - for patch in ../patches/*.diff; do - for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do - (cd ${LLAMACPP_DIR}; git checkout ${file}) - done - done - fi -} diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh deleted file mode 100755 index 4b806b02a..000000000 --- a/llm/generate/gen_darwin.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash -# This script is intended to run inside the go generate -# working directory must be ./llm/generate/ - -# TODO - add hardening to detect missing tools (cmake, etc.) 
- -set -ex -set -o pipefail -echo "Starting darwin generate script" -source $(dirname $0)/gen_common.sh -init_vars -git_module_setup -apply_patches - -sign() { - if [ -n "$APPLE_IDENTITY" ]; then - codesign -f --timestamp --deep --options=runtime --sign "$APPLE_IDENTITY" --identifier ai.ollama.ollama $1 - fi -} - -COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin" - -case "${GOARCH}" in -"amd64") - COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off" - - # - # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) - # - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu" - echo "Building LCD CPU" - build - sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib - compress_libs - - # - # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance - # Approximately 400% faster than LCD on same CPU - # - init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx" - echo "Building AVX CPU" - build - sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib - compress_libs - - # - # ~2013 CPU Dynamic library - # Approximately 10% faster than AVX on same CPU - # - init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2" - echo "Building AVX2 CPU" - EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" - build - sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib - compress_libs - ;; -"arm64") - CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal" - EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" - build - sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib - compress_libs - ;; -*) - echo "GOARCH must be set" - echo "this script is meant to be run from within go generate" - exit 1 - ;; -esac - -cleanup diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh deleted file mode 100755 index e6a7d077b..000000000 --- a/llm/generate/gen_linux.sh +++ /dev/null @@ -1,196 +0,0 @@ -#!/bin/bash -# This script is intended to run inside the go generate -# working directory must be llm/generate/ - -# First we build one or more CPU based LLM libraries -# -# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required -# library dependencies -# -# Then if we detect ROCm, we build a dynamically loaded ROCm lib. 
The ROCM -# libraries are quite large, and also dynamically load data files at runtime -# which in turn are large, so we don't attempt to cary them as payload - -set -ex -set -o pipefail - -# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference -amdGPUs() { - if [ -n "${AMDGPU_TARGETS}" ]; then - echo "${AMDGPU_TARGETS}" - return - fi - GPU_LIST=( - "gfx900" - "gfx906:xnack-" - "gfx908:xnack-" - "gfx90a:xnack+" - "gfx90a:xnack-" - "gfx1010" - "gfx1012" - "gfx1030" - "gfx1100" - "gfx1101" - "gfx1102" - ) - ( - IFS=$';' - echo "'${GPU_LIST[*]}'" - ) -} - -echo "Starting linux generate script" -if [ -z "${CUDACXX}" ]; then - if [ -x /usr/local/cuda/bin/nvcc ]; then - export CUDACXX=/usr/local/cuda/bin/nvcc - else - # Try the default location in case it exists - export CUDACXX=$(command -v nvcc) - fi -fi -COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" -source $(dirname $0)/gen_common.sh -init_vars -git_module_setup -apply_patches - -if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then - # Users building from source can tune the exact flags we pass to cmake for configuring - # llama.cpp, and we'll build only 1 CPU variant in that case as the default. - if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then - echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" - CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" - echo "Building custom CPU" - build - compress_libs - else - # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 - # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer - # -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) - # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen - # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver - # Note: the following seem to yield slower results than AVX2 - ymmv - # -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT) - # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake - # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake - - COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off" - if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then - # - # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) - # - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" - echo "Building LCD CPU" - build - compress_libs - fi - - if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then - # - # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance - # Approximately 400% faster than LCD on same CPU - # - init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx" - echo "Building AVX CPU" - build - compress_libs - fi - - if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then - # - # ~2013 CPU Dynamic library - # Approximately 10% faster than AVX on same CPU - # - init_vars - CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" - 
BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2" - echo "Building AVX2 CPU" - build - compress_libs - fi - fi -else - echo "Skipping CPU generation step as requested" -fi - -# If needed, look for the default CUDA toolkit location -if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then - CUDA_LIB_DIR=/usr/local/cuda/lib64 -fi - -# If needed, look for CUDA on Arch Linux -if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then - CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib -fi - -# Allow override in case libcudart is in the wrong place -if [ -z "${CUDART_LIB_DIR}" ]; then - CUDART_LIB_DIR="${CUDA_LIB_DIR}" -fi - -if [ -d "${CUDA_LIB_DIR}" ]; then - echo "CUDA libraries detected - building dynamic CUDA library" - init_vars - CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true) - if [ -n "${CUDA_MAJOR}" ]; then - CUDA_VARIANT=_v${CUDA_MAJOR} - fi - CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}" - EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - build - - # Cary the CUDA libs as payloads to help reduce dependency burden on users - # - # TODO - in the future we may shift to packaging these separately and conditionally - # downloading them in the install script. - DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )" - for lib in libcudart.so libcublas.so libcublasLt.so ; do - DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) - if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then - cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/" - elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then - cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/" - elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then - cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/" - else - cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/" - fi - done - compress_libs - -fi - -if [ -z "${ROCM_PATH}" ]; then - # Try the default location in case it exists - ROCM_PATH=/opt/rocm -fi - -if [ -z "${CLBlast_DIR}" ]; then - # Try the default location in case it exists - if [ -d /usr/lib/cmake/CLBlast ]; then - export CLBlast_DIR=/usr/lib/cmake/CLBlast - fi -fi - -if [ -d "${ROCM_PATH}" ]; then - echo "ROCm libraries detected - building dynamic ROCm library" - if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then - ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. 
|| true) - fi - init_vars - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}" - EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" - build - - # Note: the ROCM libs and runtime library files are too large to embed, so we depend on - # them being present at runtime on the host - compress_libs -fi - -cleanup diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 deleted file mode 100644 index e03134209..000000000 --- a/llm/generate/gen_windows.ps1 +++ /dev/null @@ -1,209 +0,0 @@ -#!powershell - -$ErrorActionPreference = "Stop" - -function init_vars { - $script:SRC_DIR = $(resolve-path "..\..\") - $script:llamacppDir = "../llama.cpp" - $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A", "x64") - $script:cmakeTargets = @("ext_server") - $script:ARCH = "amd64" # arm not yet supported. - if ($env:CGO_CFLAGS -contains "-g") { - $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on") - $script:config = "RelWithDebInfo" - } else { - $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off") - $script:config = "Release" - } - # Try to find the CUDA dir - if ($env:CUDA_LIB_DIR -eq $null) { - $d=(get-command -ea 'silentlycontinue' nvcc).path - if ($d -ne $null) { - $script:CUDA_LIB_DIR=($d| split-path -parent) - $script:CUDA_INCLUDE_DIR=($script:CUDA_LIB_DIR|split-path -parent)+"\include" - } - } else { - $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR - } - $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path - $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path - if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) { - $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" - } else { - $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES - } - # Note: 10 Windows Kit signtool crashes with GCP's plugin - ${script:SignTool}="C:\Program Files (x86)\Windows Kits\8.1\bin\x64\signtool.exe" - if ("${env:KEY_CONTAINER}") { - ${script:OLLAMA_CERT}=$(resolve-path "${script:SRC_DIR}\ollama_inc.crt") - } -} - -function git_module_setup { - # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo - & git submodule init - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & git submodule update --force "${script:llamacppDir}" - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} -} - -function apply_patches { - # Wire up our CMakefile - if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) { - Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama' - } - - # Apply temporary patches until fix is upstream - $patches = Get-ChildItem "../patches/*.diff" - foreach ($patch in $patches) { - # Extract file paths from the patch file - $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object { - $parts = $_ -split ' ' - ($parts[1] -split '/', 2)[1] - } - - # Checkout each file - Set-Location -Path ${script:llamacppDir} - foreach ($file in $filePaths) { - git checkout $file - } - } - - # Apply each patch - foreach ($patch in $patches) { - 
Set-Location -Path ${script:llamacppDir} - git apply $patch.FullName - } - - # Avoid duplicate main symbols when we link into the cgo binary - $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp" - $content = $content -replace 'int main\(', 'int __main(' - Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content -} - -function build { - write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs" - & cmake --version - & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })" - & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} -} - -function install { - rm -ea 0 -recurse -force -path "${script:buildDir}/lib" - md "${script:buildDir}/lib" -ea 0 > $null - cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib" - cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib" - # Display the dll dependencies in the build log - if ($script:DUMPBIN -ne $null) { - & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll" - } -} - -function sign { - if ("${env:KEY_CONTAINER}") { - write-host "Signing ${script:buildDir}/lib/*.dll" - foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){ - & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` - /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file - if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - } - } -} - -function compress_libs { - if ($script:GZIP -eq $null) { - write-host "gzip not installed, not compressing files" - return - } - write-host "Compressing dlls..." 
- $libs = dir "${script:buildDir}/lib/*.dll" - foreach ($file in $libs) { - & "$script:GZIP" --best -f $file - } -} - -function cleanup { - $patches = Get-ChildItem "../patches/*.diff" - foreach ($patch in $patches) { - # Extract file paths from the patch file - $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object { - $parts = $_ -split ' ' - ($parts[1] -split '/', 2)[1] - } - - # Checkout each file - Set-Location -Path ${script:llamacppDir} - foreach ($file in $filePaths) { - git checkout $file - } - } - Set-Location "${script:llamacppDir}/examples/server" - git checkout CMakeLists.txt server.cpp - -} - -init_vars -git_module_setup -apply_patches - -# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer -# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX) -# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen -# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver - -$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") - -init_vars -$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs -$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu" -write-host "Building LCD CPU" -build -install -sign -compress_libs - -init_vars -$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs -$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx" -write-host "Building AVX CPU" -build -install -sign -compress_libs - -init_vars -$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs -$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2" -write-host "Building AVX2 CPU" -build -install -sign -compress_libs - -if ($null -ne $script:CUDA_LIB_DIR) { - # Then build cuda as a dynamically loaded library - $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe" - $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename - if ($null -ne $script:CUDA_VERSION) { - $script:CUDA_VARIANT="_"+$script:CUDA_VERSION - } - init_vars - $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" - $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") - build - install - sign - compress_libs -} -# TODO - actually implement ROCm support on windows -$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm" - -rm -ea 0 -recurse -force -path "${script:buildDir}/lib" -md "${script:buildDir}/lib" -ea 0 > $null -echo $null >> "${script:buildDir}/lib/.generated" - -cleanup -write-host "`ngo generate completed" diff --git a/llm/generate/generate_darwin.go b/llm/generate/generate_darwin.go deleted file mode 100644 index 322879e91..000000000 --- a/llm/generate/generate_darwin.go +++ /dev/null @@ -1,3 +0,0 @@ -package generate - -//go:generate sh ./gen_darwin.sh diff --git a/llm/generate/generate_linux.go b/llm/generate/generate_linux.go deleted file mode 100644 index 2b7e116db..000000000 --- a/llm/generate/generate_linux.go +++ /dev/null @@ -1,3 +0,0 @@ -package generate - -//go:generate bash ./gen_linux.sh diff --git 
a/llm/generate/generate_windows.go b/llm/generate/generate_windows.go deleted file mode 100644 index d2ee5428a..000000000 --- a/llm/generate/generate_windows.go +++ /dev/null @@ -1,3 +0,0 @@ -package generate - -//go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1 diff --git a/llm/llama.cpp b/llm/llama.cpp deleted file mode 160000 index c29af7e22..000000000 --- a/llm/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c29af7e2252d288f2ea58a7d437c1cb7c0abf160 diff --git a/llm/llm.go b/llm/llm.go index 81bab1225..6805ea531 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -6,6 +6,7 @@ import ( "log/slog" "os" "runtime" + "time" "github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/gpu" @@ -165,3 +166,12 @@ func newLlmServer(gpuInfo gpu.GpuInfo, workDir, model string, adapters, projecto return nil, err2 } + +func parseDurationMs(ms float64) time.Duration { + dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) + if err != nil { + panic(err) + } + + return dur +} diff --git a/llm/llm_darwin_amd64.go b/llm/llm_darwin_amd64.go new file mode 100644 index 000000000..c8db2ab77 --- /dev/null +++ b/llm/llm_darwin_amd64.go @@ -0,0 +1,14 @@ +//go:generate cmake -S server -B server/build/cpu -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off +//go:generate cmake -S server -B server/build/cpu_avx -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on +//go:generate cmake -S server -B server/build/cpu_avx2 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=on +//go:generate cmake --build server/build/cpu --target server -- -j4 +//go:generate cmake --build server/build/cpu_avx --target server -- -j4 +//go:generate cmake --build server/build/cpu_avx2 --target server -- -j4 +package llm + +import "embed" + +//go:embed server/build/cpu/server +//go:embed server/build/cpu_avx/server +//go:embed server/build/cpu_avx2/server +var libEmbed embed.FS diff --git a/llm/llm_darwin_arm64.go b/llm/llm_darwin_arm64.go new file mode 100644 index 000000000..1ab9b0e9f --- /dev/null +++ b/llm/llm_darwin_arm64.go @@ -0,0 +1,8 @@ +//go:generate cmake -S server -B server/build/metal -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 +//go:generate cmake --build server/build/metal --target server -- -j4 +package llm + +import "embed" + +//go:embed server/build/metal/ggml-metal.metal server/build/metal/server +var libEmbed embed.FS diff --git a/llm/patches/02-cudaleaks.diff b/llm/patches/02-cudaleaks.diff deleted file mode 100644 index 206bb2705..000000000 --- a/llm/patches/02-cudaleaks.diff +++ /dev/null @@ -1,114 +0,0 @@ -diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index 2b2f4a0f..25857bdd 100644 ---- a/examples/server/server.cpp -+++ b/examples/server/server.cpp -@@ -31,6 +31,10 @@ - #include - #include - -+#ifdef GGML_USE_CUBLAS -+extern "C" GGML_CALL void ggml_free_cublas(void); -+#endif -+ - using json = nlohmann::json; - - struct server_params { -@@ -363,6 +367,9 @@ struct llama_server_context - llama_free_model(model); - model = nullptr; - } -+#ifdef GGML_USE_CUBLAS -+ ggml_free_cublas(); -+#endif - } - - bool 
load_model(const gpt_params ¶ms_) -@@ -3494,6 +3501,7 @@ int main(int argc, char **argv) - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); -+ sigaction(SIGUSR1, &sigint_action, NULL); - #elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; -diff --git a/ggml-cuda.cu b/ggml-cuda.cu -index 0c6501e9..75c12723 100644 ---- a/ggml-cuda.cu -+++ b/ggml-cuda.cu -@@ -43,6 +43,7 @@ - #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) - #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 - #define cublasCreate hipblasCreate -+#define cublasDestroy hipblasDestroy - #define cublasGemmEx hipblasGemmEx - #define cublasGemmBatchedEx hipblasGemmBatchedEx - #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) { - return g_cublas_loaded; - } - --GGML_CALL void ggml_init_cublas() { -- static bool initialized = false; -+static bool g_cublas_initialized = false; - -- if (!initialized) { -+GGML_CALL void ggml_init_cublas() { -+ if (!g_cublas_initialized) { - - #ifdef __HIP_PLATFORM_AMD__ - // Workaround for a rocBLAS bug when using multiple graphics cards: -@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() { - #endif - - if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) { -- initialized = true; -+ g_cublas_initialized = true; - g_cublas_loaded = false; - fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__); - return; -@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() { - // configure logging to stdout - // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); - -- initialized = true; -+ g_cublas_initialized = true; - g_cublas_loaded = true; - } - } -@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() { - } - return device_count; - } -+ -+extern "C" GGML_CALL void ggml_free_cublas(void); -+GGML_CALL void ggml_free_cublas(void) { -+ for (int id = 0; id < g_device_count; ++id) { -+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -+ if (g_device_caps[id].vmm) { -+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id])); -+ g_cuda_pool_size[id] = 0; -+ g_cuda_pool_addr[id] = 0; -+ } -+#endif -+ // TODO: free legacy non-vmm memory -+ // destroy cublas handle -+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id])); -+ g_cublas_handles[id] = nullptr; -+ } -+ -+ g_cublas_initialized = false; -+} -diff --git a/ggml-cuda.h b/ggml-cuda.h -index b1ebd61d..6dd58ddf 100644 ---- a/ggml-cuda.h -+++ b/ggml-cuda.h -@@ -23,6 +23,9 @@ GGML_API GGML_CALL void ggml_init_cublas(void); - // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. 
- GGML_API GGML_CALL bool ggml_cublas_loaded(void); - -+// Release CUDA resources -+GGML_API GGML_CALL void ggml_free_cublas(void); -+ - GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size); - GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr); - diff --git a/llm/payload_darwin_amd64.go b/llm/payload_darwin_amd64.go deleted file mode 100644 index a1c70ba95..000000000 --- a/llm/payload_darwin_amd64.go +++ /dev/null @@ -1,8 +0,0 @@ -package llm - -import ( - "embed" -) - -//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib* -var libEmbed embed.FS diff --git a/llm/payload_darwin_arm64.go b/llm/payload_darwin_arm64.go deleted file mode 100644 index aa70c931d..000000000 --- a/llm/payload_darwin_arm64.go +++ /dev/null @@ -1,8 +0,0 @@ -package llm - -import ( - "embed" -) - -//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib* -var libEmbed embed.FS diff --git a/llm/payload_linux.go b/llm/payload_linux.go deleted file mode 100644 index fc366209c..000000000 --- a/llm/payload_linux.go +++ /dev/null @@ -1,8 +0,0 @@ -package llm - -import ( - "embed" -) - -//go:embed llama.cpp/build/linux/*/*/lib/*.so* -var libEmbed embed.FS diff --git a/llm/payload_test.go b/llm/payload_test.go deleted file mode 100644 index 44537b0a3..000000000 --- a/llm/payload_test.go +++ /dev/null @@ -1,58 +0,0 @@ -package llm - -import ( - "testing" - - "github.com/jmorganca/ollama/gpu" - "github.com/stretchr/testify/assert" -) - -func TestGetDynLibs(t *testing.T) { - availableDynLibs = map[string]string{ - "cpu": "X_cpu", - } - assert.Equal(t, false, rocmDynLibPresent()) - res := getDynLibs(gpu.GpuInfo{Library: "cpu"}) - assert.Len(t, res, 1) - assert.Equal(t, availableDynLibs["cpu"], res[0]) - - variant := gpu.GetCPUVariant() - if variant != "" { - variant = "_" + variant - } - availableDynLibs = map[string]string{ - "rocm_v5": "X_rocm_v5", - "rocm_v6": "X_rocm_v6", - "cpu" + variant: "X_cpu", - } - assert.Equal(t, true, rocmDynLibPresent()) - res = getDynLibs(gpu.GpuInfo{Library: "rocm"}) - assert.Len(t, res, 3) - assert.Equal(t, availableDynLibs["rocm_v5"], res[0]) - assert.Equal(t, availableDynLibs["rocm_v6"], res[1]) - assert.Equal(t, availableDynLibs["cpu"+variant], res[2]) - - res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) - assert.Len(t, res, 3) - assert.Equal(t, availableDynLibs["rocm_v6"], res[0]) - assert.Equal(t, availableDynLibs["rocm_v5"], res[1]) - assert.Equal(t, availableDynLibs["cpu"+variant], res[2]) - - res = getDynLibs(gpu.GpuInfo{Library: "cuda"}) - assert.Len(t, res, 1) - assert.Equal(t, availableDynLibs["cpu"+variant], res[0]) - - res = getDynLibs(gpu.GpuInfo{Library: "default"}) - assert.Len(t, res, 1) - assert.Equal(t, "default", res[0]) - - availableDynLibs = map[string]string{ - "rocm": "X_rocm_v5", - "cpu" + variant: "X_cpu", - } - assert.Equal(t, true, rocmDynLibPresent()) - res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) - assert.Len(t, res, 2) - assert.Equal(t, availableDynLibs["rocm"], res[0]) - assert.Equal(t, availableDynLibs["cpu"+variant], res[1]) -} diff --git a/llm/payload_windows.go b/llm/payload_windows.go deleted file mode 100644 index d195745a9..000000000 --- a/llm/payload_windows.go +++ /dev/null @@ -1,8 +0,0 @@ -package llm - -import ( - "embed" -) - -//go:embed llama.cpp/build/windows/*/*/lib/*.dll* -var libEmbed embed.FS diff --git a/llm/server/.gitignore b/llm/server/.gitignore new file mode 100644 index 000000000..c795b054e --- /dev/null +++ b/llm/server/.gitignore @@ -0,0 +1 @@ 
+build \ No newline at end of file diff --git a/llm/server/CMakeLists.txt b/llm/server/CMakeLists.txt new file mode 100644 index 000000000..923cc6c1d --- /dev/null +++ b/llm/server/CMakeLists.txt @@ -0,0 +1,93 @@ +cmake_minimum_required(VERSION 3.14) + +project(llm) + +include(FetchContent) + +set(add_token_patch + git apply ${CMAKE_CURRENT_SOURCE_DIR}/patches/add_token.patch +) + +set(FETCHCONTENT_BASE_DIR "${CMAKE_SOURCE_DIR}/build/llama.cpp") + +FetchContent_Declare( + llama_cpp + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git + GIT_TAG c29af7e2252d288f2ea58a7d437c1cb7c0abf160 + + # this could be risky if the patch doesn't apply + PATCH_COMMAND ${add_token_patch} || true +) + +FetchContent_MakeAvailable(llama_cpp) +add_subdirectory(${llama_cpp_SOURCE_DIR}/examples/llava) + +# code signing +function(sign target) + if(APPLE) + if(DEFINED ENV{APPLE_IDENTITY}) + add_custom_command(TARGET ${target} POST_BUILD + COMMAND codesign + -f + --timestamp + --deep + --options=runtime + --sign "$ENV{APPLE_IDENTITY}" + --identifier ai.ollama.ollama + $ + COMMENT "Signing macOS binary: ${target}" + ) + endif() + elseif(WIN32) + find_program(SIGNTOOL_EXE NAMES signtool PATHS "C:\\Program Files (x86)\\Windows Kits\\8.1\\bin\\x64" NO_DEFAULT_PATH) + set(KEY_CONTAINER "$ENV{KEY_CONTAINER}") + set(OLLAMA_CERT "$ENV{OLLAMA_CERT}") + + if(SIGNTOOL_EXE AND KEY_CONTAINER AND OLLAMA_CERT) + add_custom_command(TARGET ${target} POST_BUILD + COMMAND "${SIGNTOOL_EXE}" + "sign" + "/v" + "/fd" "sha256" + "/t" "http://timestamp.digicert.com" + "/f" "${OLLAMA_CERT}" + "/csp" "Google Cloud KMS Provider" + "/kc" "${KEY_CONTAINER}" + "$" + COMMENT "Signing Windows binary: ${target}" + ) + endif() + endif() +endfunction() + +set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75;80") + +function(gzip target) + set(gzip_target "gzip_${target}") + add_custom_target(${gzip_target} ALL + COMMAND gzip -k -f ${target} + COMMENT "Gzipping ${target}" + VERBATIM + ) + add_dependencies(${gzip_target} ${target}) +endfunction() + +function(link_windows_libraries target) + if (WIN32) + target_link_libraries(${target} PRIVATE ws2_32) + endif() +endfunction() + +add_executable(server ${llama_cpp_SOURCE_DIR}/examples/server/server.cpp ${llama_cpp_SOURCE_DIR}) +target_compile_definitions(server PRIVATE) +target_link_libraries(server PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(server PRIVATE cxx_std_17) +link_windows_libraries(server) +sign(server) +gzip(server) + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + configure_file(${llama_cpp_SOURCE_DIR}/ggml-metal.metal ${CMAKE_BINARY_DIR}/ggml-metal.metal COPYONLY) +endif() + +# TODO: ROCm diff --git a/llm/patches/01-cache.diff b/llm/server/patches/01-cache.diff similarity index 100% rename from llm/patches/01-cache.diff rename to llm/server/patches/01-cache.diff diff --git a/llm/utils.go b/llm/utils.go deleted file mode 100644 index 4dc03c806..000000000 --- a/llm/utils.go +++ /dev/null @@ -1,15 +0,0 @@ -package llm - -import ( - "fmt" - "time" -) - -func parseDurationMs(ms float64) time.Duration { - dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) - if err != nil { - panic(err) - } - - return dur -}
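
The patch drops the dynamically loaded `ext_server` libraries in favor of standalone `server` binaries built by the new `llm/server/CMakeLists.txt` and embedded through `embed.FS` (see the new `llm/llm_darwin_amd64.go` and `llm/llm_darwin_arm64.go`). The runtime side that consumes those embeds is not part of this diff; what follows is only a rough sketch under the assumption that each embedded binary is written to a temporary directory and launched as a subprocess. The helper names (`extractServer`, `startServer`) and the exact CLI flags are hypothetical, not APIs introduced by this patch.

```go
package llm

import (
	"embed"
	"fmt"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"strconv"
)

// extractServer writes one embedded server variant (e.g. "cpu_avx2" on
// darwin/amd64 or "metal" on darwin/arm64) from fs into dir and returns the
// path to the resulting executable. Hypothetical helper for illustration.
func extractServer(fs embed.FS, dir, variant string) (string, error) {
	data, err := fs.ReadFile(path.Join("server/build", variant, "server"))
	if err != nil {
		return "", fmt.Errorf("read embedded server %q: %w", variant, err)
	}
	bin := filepath.Join(dir, "server-"+variant)
	if err := os.WriteFile(bin, data, 0o755); err != nil {
		return "", fmt.Errorf("write server binary: %w", err)
	}
	return bin, nil
}

// startServer launches the extracted binary against a model file and returns
// the running command; the flags are assumed to mirror the llama.cpp server CLI.
func startServer(bin, model string, port int) (*exec.Cmd, error) {
	cmd := exec.Command(bin, "--model", model, "--port", strconv.Itoa(port))
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("start %s: %w", bin, err)
	}
	return cmd, nil
}
```

Running the server as a separate process isolates the GPU-specific native code from the Go process and removes the dlopen/function-pointer shim that `dyn_ext_server.c` previously provided.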