Compare commits
8 Commits
mattw/quan...mattw/allm
Commits:

- a314b6c2a9
- cd8fad3398
- 9983fa5f4e
- dfda91c2ee
- fac9060da5
- a554616f8e
- 77d96da94b
- 0d6e3565ae
@@ -2,7 +2,7 @@
 ollama
 app
 dist
-llm/llama.cpp/gguf
+llm/llama.cpp
 .env
 .cache
 test_data
.gitmodules (vendored): 9 changed lines
@@ -1,5 +1,4 @@
-[submodule "llm/llama.cpp/gguf"]
-    path = llm/llama.cpp/gguf
-    url = https://github.com/ggerganov/llama.cpp.git
-    ignore = dirty
-    shallow = true
+[submodule "llama.cpp"]
+    path = llm/llama.cpp
+    url = https://github.com/ggerganov/llama.cpp.git
+    shallow = true
@@ -309,6 +309,13 @@ func (c *Client) Heartbeat(ctx context.Context) error {
 	}
 	return nil
 }
 
+func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
+	var resp EmbeddingResponse
+	if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
+		return nil, err
+	}
+	return &resp, nil
+}
+
 func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
 	if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {
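A minimal usage sketch for the new `Embeddings` helper. The client constructor and the `Model`, `Prompt`, and `Embedding` field names are assumptions based on the api package and are not part of this diff:

```go
// Hypothetical usage of the new Client.Embeddings method. The client
// constructor and request/response field names are assumptions, not
// something this change introduces.
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmorganca/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment() // honors OLLAMA_HOST if set
	if err != nil {
		log.Fatal(err)
	}

	resp, err := client.Embeddings(context.Background(), &api.EmbeddingRequest{
		Model:  "llama2",
		Prompt: "Why is the sky blue?",
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("got an embedding with %d dimensions\n", len(resp.Embedding))
}
```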
docs/faq.md: 10 changed lines
@@ -66,6 +66,16 @@ Refer to the section above for how to use environment variables on your platform
 
 If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory. Refer to the section above for how to use environment variables on your platform.
 
+## Can I use models I downloaded from Hugging Face in Ollama?
+
+There are a lot of models available on Hugging Face. Many of them will work with Ollama, but not all of them yet. Look for models that use the **PyTorch** library, then check the `config.json` file in the repo; it lists the model's architecture. For now, we support models that use the following architectures: Llama, Mistral, Falcon, RW, and BigCode.
+
+## Can I use models I downloaded in Ollama in other applications?
+
+Yes, as long as those applications work with GGUF models. You can find the models in the directories listed above. Under `models` there is a `manifests` directory; follow that path down to the model and tag you intend to use. In that file you will see a layer called `application/vnd.ollama.image.model`.
+
+The next line shows a sha256 hash, which is also the filename of the model weights file in `.ollama/models/blobs`. You can use that file in any application that supports GGUF, but it is important not to move it from this location, otherwise Ollama won't be able to use it.
+
 ## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
 
 No, Ollama runs entirely locally, and conversation data will never leave your machine.
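The manifest walk described in the new FAQ entry can be scripted. Here is a rough sketch; the manifest JSON field names (`layers`, `mediaType`, `digest`) and the blob filename convention are assumptions to verify against your own `~/.ollama/models` tree:

```go
// Hypothetical sketch: given a manifest file, print the blob path of the
// application/vnd.ollama.image.model layer. Field names and the blob
// filename convention are assumptions; check your local files.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"path/filepath"
)

type manifest struct {
	Layers []struct {
		MediaType string `json:"mediaType"`
		Digest    string `json:"digest"`
	} `json:"layers"`
}

func main() {
	// e.g. ~/.ollama/models/manifests/registry.ollama.ai/library/llama2/latest
	data, err := os.ReadFile(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	var m manifest
	if err := json.Unmarshal(data, &m); err != nil {
		log.Fatal(err)
	}
	home, _ := os.UserHomeDir()
	for _, layer := range m.Layers {
		if layer.MediaType == "application/vnd.ollama.image.model" {
			// The blob file is named after the sha256 digest; the exact
			// separator (':' vs '-') can vary between Ollama versions.
			fmt.Println(filepath.Join(home, ".ollama", "models", "blobs", layer.Digest))
		}
	}
}
```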
@@ -2,7 +2,7 @@
 
 set(TARGET ext_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-add_library(${TARGET} STATIC ../../../ext_server.cpp)
+add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp)
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_include_directories(${TARGET} PRIVATE ../..)
 target_include_directories(${TARGET} PRIVATE ../../..)
llm/ext_server/README.md (new file): 4 lines
@@ -0,0 +1,4 @@
+# Extern C Server
+
+This directory contains a thin facade we layer on top of the Llama.cpp server to
+expose `extern "C"` interfaces, so its functionality can be reached through direct, in-process API calls.
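To illustrate what "direct, in-process API calls" means on the Go side, here is a self-contained, purely illustrative cgo sketch. The `demo_*` names are invented for the example and are not the real ext_server API; the actual facade types and entry points live in ext_server and the shim files changed below:

```go
// Illustrative only: the shape of calling an extern "C" facade from Go
// through cgo, without an HTTP hop. The struct and function here are toys
// defined in the preamble so the example compiles on its own.
package main

/*
#include <stdio.h>

typedef struct {
    int  id;
    char msg[128];
} demo_resp_t;

// In the real facade this would be declared in a header and implemented by
// the llama.cpp server library; here it is defined inline for the demo.
static void demo_server_init(demo_resp_t *resp) {
    resp->id = 0;
    snprintf(resp->msg, sizeof(resp->msg), "ok");
}
*/
import "C"

import "fmt"

func main() {
	var resp C.demo_resp_t
	C.demo_server_init(&resp) // direct in-process call into the C library
	fmt.Println("init result:", C.GoString(&resp.msg[0]), int(resp.id))
}
```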
@@ -1,7 +1,7 @@
 package llm
 
 /*
-#cgo CFLAGS: -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
+#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
 #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable

@@ -10,17 +10,17 @@ package llm
 #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 #cgo darwin LDFLAGS: -lc++ -framework Accelerate
 #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libcommon.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libext_server.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libllama.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libggml_static.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
 #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libext_server.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libcommon.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libllama.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libggml_static.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
 #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
 #cgo linux windows LDFLAGS: -lpthread
 
@@ -1,7 +1,7 @@
 # common logic accross linux and darwin
 
 init_vars() {
-    LLAMACPP_DIR=gguf
+    LLAMACPP_DIR=../llama.cpp
     PATCHES="0001-Expose-callable-API-for-server.patch"
     CMAKE_DEFS=""
     CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"

@@ -18,19 +18,24 @@ git_module_setup() {
         echo "Skipping submodule initialization"
         return
     fi
+    # Make sure the tree is clean after the directory moves
+    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
+        echo "Cleaning up old submodule"
+        rm -rf ${LLAMACPP_DIR}
+    fi
     git submodule init
-    git submodule update --force gguf
+    git submodule update --force ${LLAMACPP_DIR}
 
 }
 
 apply_patches() {
     # Wire up our CMakefile
-    if ! grep ollama gguf/examples/server/CMakeLists.txt; then
-        echo 'include (../../../CMakeLists.txt) # ollama' >>gguf/examples/server/CMakeLists.txt
+    if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
+        echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
     fi
     # Avoid duplicate main symbols when we link into the cgo binary
-    sed -e 's/int main(/int __main(/g' <./gguf/examples/server/server.cpp >./gguf/examples/server/server.cpp.tmp &&
-        mv ./gguf/examples/server/server.cpp.tmp ./gguf/examples/server/server.cpp
+    sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
+        mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
 }
 
 build() {

@@ -49,5 +54,5 @@ install() {
 
 # Keep the local tree clean after we're done with the build
 cleanup() {
-    (cd gguf/examples/server/ && git checkout CMakeLists.txt server.cpp)
+    (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
 }
@@ -1,6 +1,6 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
-# working directory must be ../llm/llama.cpp
+# working directory must be ./llm/generate/
 
 # TODO - add hardening to detect missing tools (cmake, etc.)
 

@@ -10,7 +10,7 @@ echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
 CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
-BUILD_DIR="gguf/build/darwin/metal"
+BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
 case "${GOARCH}" in
 "amd64")
     CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
@@ -1,6 +1,6 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
-# working directory must be llm/llama.cpp
+# working directory must be llm/generate/
 
 # First we build our default built-in library which will be linked into the CGO
 # binary as a normal dependency. This default build is CPU based.

@@ -52,7 +52,7 @@ apply_patches
 # CPU first for the default library
 #
 CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
-BUILD_DIR="gguf/build/linux/cpu"
+BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
 
 build
 install

@@ -64,7 +64,7 @@ if [ -d /usr/local/cuda/lib64/ ]; then
     echo "CUDA libraries detected - building dynamic CUDA library"
     init_vars
     CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
-    BUILD_DIR="gguf/build/linux/cuda"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda"
     CUDA_LIB_DIR=/usr/local/cuda/lib64
     build
     install

@@ -98,7 +98,7 @@ if [ -d "${ROCM_PATH}" ]; then
     echo "ROCm libraries detected - building dynamic ROCm library"
     init_vars
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    BUILD_DIR="gguf/build/linux/rocm"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm"
     build
     install
     gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
@@ -3,6 +3,7 @@
 $ErrorActionPreference = "Stop"
 
 function init_vars {
+    $script:llamacppDir = "../llama.cpp"
     $script:patches = @("0001-Expose-callable-API-for-server.patch")
     $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
     $script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")

@@ -19,25 +20,25 @@ function git_module_setup {
     # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
     & git submodule init
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & git submodule update --force gguf
+    & git submodule update --force "${script:llamacppDir}"
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
 }
 
 function apply_patches {
     # Wire up our CMakefile
-    if (!(Select-String -Path "gguf/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
-        Add-Content -Path "gguf/examples/server/CMakeLists.txt" -Value 'include (../../../CMakeLists.txt) # ollama'
+    if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
+        Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
     }
     # Avoid duplicate main symbols when we link into the cgo binary
-    $content = Get-Content -Path "./gguf/examples/server/server.cpp"
+    $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
     $content = $content -replace 'int main\(', 'int __main('
-    Set-Content -Path "./gguf/examples/server/server.cpp" -Value $content
+    Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
 }
 
 function build {
-    write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs"
+    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
     & cmake --version
-    & cmake -S gguf -B $script:buildDir $script:cmakeDefs
+    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
     & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })

@@ -55,7 +56,7 @@ function install {
 }
 
 function cleanup {
-    Set-Location "gguf/examples/server"
+    Set-Location "${script:llamacppDir}/examples/server"
     git checkout CMakeLists.txt server.cpp
 }
 

@@ -64,20 +65,20 @@ git_module_setup
 apply_patches
 
 # first build CPU based
-$script:buildDir="gguf/build/windows/cpu"
+$script:buildDir="${script:llamacppDir}/build/windows/cpu"
 
 build
 install
 
 # Then build cuda as a dynamically loaded library
 init_vars
-$script:buildDir="gguf/build/windows/cuda"
+$script:buildDir="${script:llamacppDir}/build/windows/cuda"
 $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
 build
 install
 
 # TODO - actually implement ROCm support on windows
-$script:buildDir="gguf/build/windows/rocm"
+$script:buildDir="${script:llamacppDir}/build/windows/rocm"
 
 rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
 md "${script:buildDir}/lib" -ea 0 > $null
@@ -1,3 +1,3 @@
-package llm
+package generate
 
 //go:generate sh ./gen_darwin.sh

@@ -1,3 +1,3 @@
-package llm
+package generate
 
 //go:generate bash ./gen_linux.sh

@@ -1,3 +1,3 @@
-package llm
+package generate
 
 //go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
@@ -13,7 +13,7 @@ import (
 	"github.com/jmorganca/ollama/api"
 )
 
-//go:embed llama.cpp/gguf/ggml-metal.metal
+//go:embed llama.cpp/ggml-metal.metal
 var libEmbed embed.FS
 
 func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {

@@ -22,7 +22,7 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin
 }
 
 func nativeInit(workdir string) error {
-	err := extractPayloadFiles(workdir, "llama.cpp/gguf/ggml-metal.metal")
+	err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
 	if err != nil {
 		if err == payloadMissing {
 			// TODO perhaps consider this a hard failure on arm macs?
@@ -34,6 +34,8 @@ type shimExtServer struct {
 var shimMutex sync.Mutex
 var llm *shimExtServer
 
+const pathComponentCount = 6
+
 func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
 	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
 }

@@ -112,7 +114,7 @@ func (llm *shimExtServer) Close() {
 }
 
 func nativeInit(workdir string) error {
-	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/*/*/lib/*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
 	if err != nil {
 		if err == payloadMissing {
 			log.Printf("%s", payloadMissing)

@@ -151,13 +153,13 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
 
 	for _, file := range files {
 		pathComps := strings.Split(file, "/")
-		if len(pathComps) != 7 {
+		if len(pathComps) != pathComponentCount {
 			log.Printf("unexpected payload components: %v", pathComps)
 			continue
 		}
-		// llama.cpp/gguf/build/$OS/$VARIANT/lib/$LIBRARY
+		// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
 		// Include the variant in the path to avoid conflicts between multiple server libs
-		targetDir := filepath.Join(workDir, pathComps[4])
+		targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
 		srcFile, err := libEmbed.Open(file)
 		if err != nil {
 			return nil, fmt.Errorf("read payload %s: %v", file, err)
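The new `pathComponentCount` constant encodes the depth of the embedded payload paths noted in the comment above; a quick standalone sketch of the arithmetic, using an example path of that shape:

```go
// Check of the path-component arithmetic: paths like
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY split into 6 components,
// and the variant sits at index pathComponentCount-3 (i.e. 3).
package main

import (
	"fmt"
	"strings"
)

const pathComponentCount = 6

func main() {
	file := "llama.cpp/build/linux/cpu/lib/libext_server.so" // example payload path
	pathComps := strings.Split(file, "/")
	fmt.Println(len(pathComps) == pathComponentCount) // true
	fmt.Println(pathComps[pathComponentCount-3])      // "cpu", the $VARIANT component
}
```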
@@ -10,7 +10,7 @@ import (
 	"strings"
 )
 
-//go:embed llama.cpp/gguf/build/*/*/lib/*.so
+//go:embed llama.cpp/build/*/*/lib/*.so
 var libEmbed embed.FS
 
 func updatePath(dir string) {
@@ -8,7 +8,7 @@ import (
 	"strings"
 )
 
-//go:embed llama.cpp/gguf/build/windows/*/lib/*.dll
+//go:embed llama.cpp/build/windows/*/lib/*.dll
 var libEmbed embed.FS
 
 func updatePath(dir string) {