Compare commits
8 Commits
mattw/quan...mattw/allm
Commits:

- a314b6c2a9
- cd8fad3398
- 9983fa5f4e
- dfda91c2ee
- fac9060da5
- a554616f8e
- 77d96da94b
- 0d6e3565ae
@@ -2,7 +2,7 @@
 ollama
 app
 dist
-llm/llama.cpp/gguf
+llm/llama.cpp
 .env
 .cache
 test_data
.gitmodules (vendored): 9 changed lines
@@ -1,5 +1,4 @@
-[submodule "llm/llama.cpp/gguf"]
-    path = llm/llama.cpp/gguf
-    url = https://github.com/ggerganov/llama.cpp.git
-    ignore = dirty
-    shallow = true
+[submodule "llama.cpp"]
+    path = llm/llama.cpp
+    url = https://github.com/ggerganov/llama.cpp.git
+    shallow = true
@@ -309,6 +309,13 @@ func (c *Client) Heartbeat(ctx context.Context) error {
 	}
 	return nil
 }
 
+func (c *Client) Embeddings(ctx context.Context, req *EmbeddingRequest) (*EmbeddingResponse, error) {
+	var resp EmbeddingResponse
+	if err := c.do(ctx, http.MethodPost, "/api/embeddings", req, &resp); err != nil {
+		return nil, err
+	}
+	return &resp, nil
+}
+
 func (c *Client) CreateBlob(ctx context.Context, digest string, r io.Reader) error {
 	if err := c.do(ctx, http.MethodHead, fmt.Sprintf("/api/blobs/%s", digest), nil, nil); err != nil {
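A minimal usage sketch for the new `Embeddings` helper. The client constructor and the `Model`, `Prompt`, and `Embedding` field names are assumptions based on the api package and are not part of this diff:

```go
// Hypothetical usage of the new Client.Embeddings method. The client
// constructor and request/response field names are assumptions, not
// something this change introduces.
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/jmorganca/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment() // honors OLLAMA_HOST if set
	if err != nil {
		log.Fatal(err)
	}

	resp, err := client.Embeddings(context.Background(), &api.EmbeddingRequest{
		Model:  "llama2",
		Prompt: "Why is the sky blue?",
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("got an embedding with %d dimensions\n", len(resp.Embedding))
}
```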
docs/faq.md: 10 changed lines
@@ -66,6 +66,16 @@ Refer to the section above for how to use environment variables on your platform
 
 If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory. Refer to the section above for how to use environment variables on your platform.
 
+## Can I use models I downloaded from Hugging Face in Ollama?
+
+There are a lot of models available on Hugging Face. Many of them will work with Ollama, but not all of them yet. Look for models that use the **PyTorch** library, then check the `config.json` file in the repo; it lists the model's architecture. For now, we support models that use the following architectures: Llama, Mistral, Falcon, RW, and BigCode.
+
+## Can I use models I downloaded in Ollama in other applications?
+
+Yes, as long as those applications work with GGUF models. You can find the models in the directories listed above. Under `models` there is a `manifests` directory; follow that path down to the model and tag you intend to use. In that file you will see a layer called `application/vnd.ollama.image.model`.
+
+The next line shows a sha256 hash, which is also the filename of the model weights file in `.ollama/models/blobs`. You can use that file in any application that supports GGUF, but it is important not to move it from this location, otherwise Ollama won't be able to use it.
+
 ## Does Ollama send my prompts and answers back to Ollama.ai to use in any way?
 
 No, Ollama runs entirely locally, and conversation data will never leave your machine.
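The manifest walk described in the new FAQ entry can be scripted. Here is a rough sketch; the manifest JSON field names (`layers`, `mediaType`, `digest`) and the blob filename convention are assumptions to verify against your own `~/.ollama/models` tree:

```go
// Hypothetical sketch: given a manifest file, print the blob path of the
// application/vnd.ollama.image.model layer. Field names and the blob
// filename convention are assumptions; check your local files.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"path/filepath"
)

type manifest struct {
	Layers []struct {
		MediaType string `json:"mediaType"`
		Digest    string `json:"digest"`
	} `json:"layers"`
}

func main() {
	// e.g. ~/.ollama/models/manifests/registry.ollama.ai/library/llama2/latest
	data, err := os.ReadFile(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	var m manifest
	if err := json.Unmarshal(data, &m); err != nil {
		log.Fatal(err)
	}
	home, _ := os.UserHomeDir()
	for _, layer := range m.Layers {
		if layer.MediaType == "application/vnd.ollama.image.model" {
			// The blob file is named after the sha256 digest; the exact
			// separator (':' vs '-') can vary between Ollama versions.
			fmt.Println(filepath.Join(home, ".ollama", "models", "blobs", layer.Digest))
		}
	}
}
```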
@@ -2,7 +2,7 @@
 
 set(TARGET ext_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-add_library(${TARGET} STATIC ../../../ext_server.cpp)
+add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp)
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_include_directories(${TARGET} PRIVATE ../..)
 target_include_directories(${TARGET} PRIVATE ../../..)
llm/ext_server/README.md (new file): 4 lines
@@ -0,0 +1,4 @@
+# Extern C Server
+
+This directory contains a thin facade we layer on top of the Llama.cpp server to
+expose `extern "C"` interfaces, so its functionality can be reached through direct, in-process API calls.
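To illustrate what "direct, in-process API calls" means on the Go side, here is a self-contained, purely illustrative cgo sketch. The `demo_*` names are invented for the example and are not the real ext_server API; the actual facade types and entry points live in ext_server and the shim files changed below:

```go
// Illustrative only: the shape of calling an extern "C" facade from Go
// through cgo, without an HTTP hop. The struct and function here are toys
// defined in the preamble so the example compiles on its own.
package main

/*
#include <stdio.h>

typedef struct {
    int  id;
    char msg[128];
} demo_resp_t;

// In the real facade this would be declared in a header and implemented by
// the llama.cpp server library; here it is defined inline for the demo.
static void demo_server_init(demo_resp_t *resp) {
    resp->id = 0;
    snprintf(resp->msg, sizeof(resp->msg), "ok");
}
*/
import "C"

import "fmt"

func main() {
	var resp C.demo_resp_t
	C.demo_server_init(&resp) // direct in-process call into the C library
	fmt.Println("init result:", C.GoString(&resp.msg[0]), int(resp.id))
}
```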
@@ -1,7 +1,7 @@
 package llm
 
 /*
-#cgo CFLAGS: -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server
+#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
 #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
 #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable

@@ -10,17 +10,17 @@ package llm
 #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 #cgo darwin LDFLAGS: -lc++ -framework Accelerate
 #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libcommon.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libext_server.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libllama.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libggml_static.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
+#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
 #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libext_server.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libcommon.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libllama.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libggml_static.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
 #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
 #cgo linux windows LDFLAGS: -lpthread
 
@@ -1,7 +1,7 @@
 # common logic accross linux and darwin
 
 init_vars() {
-    LLAMACPP_DIR=gguf
+    LLAMACPP_DIR=../llama.cpp
     PATCHES="0001-Expose-callable-API-for-server.patch"
     CMAKE_DEFS=""
     CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"

@@ -18,19 +18,24 @@ git_module_setup() {
         echo "Skipping submodule initialization"
         return
     fi
+    # Make sure the tree is clean after the directory moves
+    if [ -d "${LLAMACPP_DIR}/gguf" ]; then
+        echo "Cleaning up old submodule"
+        rm -rf ${LLAMACPP_DIR}
+    fi
     git submodule init
-    git submodule update --force gguf
+    git submodule update --force ${LLAMACPP_DIR}
 
 }
 
 apply_patches() {
     # Wire up our CMakefile
-    if ! grep ollama gguf/examples/server/CMakeLists.txt; then
-        echo 'include (../../../CMakeLists.txt) # ollama' >>gguf/examples/server/CMakeLists.txt
+    if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
+        echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
     fi
     # Avoid duplicate main symbols when we link into the cgo binary
-    sed -e 's/int main(/int __main(/g' <./gguf/examples/server/server.cpp >./gguf/examples/server/server.cpp.tmp &&
-        mv ./gguf/examples/server/server.cpp.tmp ./gguf/examples/server/server.cpp
+    sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
+        mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
 }
 
 build() {

@@ -49,5 +54,5 @@ install() {
 
 # Keep the local tree clean after we're done with the build
 cleanup() {
-    (cd gguf/examples/server/ && git checkout CMakeLists.txt server.cpp)
+    (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
 }
@@ -1,6 +1,6 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
-# working directory must be ../llm/llama.cpp
+# working directory must be ./llm/generate/
 
 # TODO - add hardening to detect missing tools (cmake, etc.)
 

@@ -10,7 +10,7 @@ echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
 CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}"
-BUILD_DIR="gguf/build/darwin/metal"
+BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal"
 case "${GOARCH}" in
 "amd64")
     CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
@@ -1,6 +1,6 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
-# working directory must be llm/llama.cpp
+# working directory must be llm/generate/
 
 # First we build our default built-in library which will be linked into the CGO
 # binary as a normal dependency. This default build is CPU based.

@@ -52,7 +52,7 @@ apply_patches
 # CPU first for the default library
 #
 CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
-BUILD_DIR="gguf/build/linux/cpu"
+BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
 
 build
 install

@@ -64,7 +64,7 @@ if [ -d /usr/local/cuda/lib64/ ]; then
     echo "CUDA libraries detected - building dynamic CUDA library"
     init_vars
     CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
-    BUILD_DIR="gguf/build/linux/cuda"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda"
     CUDA_LIB_DIR=/usr/local/cuda/lib64
     build
     install

@@ -98,7 +98,7 @@ if [ -d "${ROCM_PATH}" ]; then
     echo "ROCm libraries detected - building dynamic ROCm library"
     init_vars
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    BUILD_DIR="gguf/build/linux/rocm"
+    BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm"
     build
     install
     gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
@@ -3,6 +3,7 @@
 $ErrorActionPreference = "Stop"
 
 function init_vars {
+    $script:llamacppDir = "../llama.cpp"
     $script:patches = @("0001-Expose-callable-API-for-server.patch")
     $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
     $script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")

@@ -19,25 +20,25 @@ function git_module_setup {
     # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo
     & git submodule init
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    & git submodule update --force gguf
+    & git submodule update --force "${script:llamacppDir}"
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
 }
 
 function apply_patches {
     # Wire up our CMakefile
-    if (!(Select-String -Path "gguf/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
-        Add-Content -Path "gguf/examples/server/CMakeLists.txt" -Value 'include (../../../CMakeLists.txt) # ollama'
+    if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
+        Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
     }
     # Avoid duplicate main symbols when we link into the cgo binary
-    $content = Get-Content -Path "./gguf/examples/server/server.cpp"
+    $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
     $content = $content -replace 'int main\(', 'int __main('
-    Set-Content -Path "./gguf/examples/server/server.cpp" -Value $content
+    Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content
 }
 
 function build {
-    write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs"
+    write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs"
     & cmake --version
-    & cmake -S gguf -B $script:buildDir $script:cmakeDefs
+    & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
     & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })

@@ -55,7 +56,7 @@ function install {
 }
 
 function cleanup {
-    Set-Location "gguf/examples/server"
+    Set-Location "${script:llamacppDir}/examples/server"
     git checkout CMakeLists.txt server.cpp
 }
 

@@ -64,20 +65,20 @@ git_module_setup
 apply_patches
 
 # first build CPU based
-$script:buildDir="gguf/build/windows/cpu"
+$script:buildDir="${script:llamacppDir}/build/windows/cpu"
 
 build
 install
 
 # Then build cuda as a dynamically loaded library
 init_vars
-$script:buildDir="gguf/build/windows/cuda"
+$script:buildDir="${script:llamacppDir}/build/windows/cuda"
 $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
 build
 install
 
 # TODO - actually implement ROCm support on windows
-$script:buildDir="gguf/build/windows/rocm"
+$script:buildDir="${script:llamacppDir}/build/windows/rocm"
 
 rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
 md "${script:buildDir}/lib" -ea 0 > $null
@@ -1,3 +1,3 @@
-package llm
+package generate
 
 //go:generate sh ./gen_darwin.sh

@@ -1,3 +1,3 @@
-package llm
+package generate
 
 //go:generate bash ./gen_linux.sh

@@ -1,3 +1,3 @@
-package llm
+package generate
 
 //go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1
@@ -13,7 +13,7 @@ import (
 	"github.com/jmorganca/ollama/api"
 )
 
-//go:embed llama.cpp/gguf/ggml-metal.metal
+//go:embed llama.cpp/ggml-metal.metal
 var libEmbed embed.FS
 
 func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {

@@ -22,7 +22,7 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin
 }
 
 func nativeInit(workdir string) error {
-	err := extractPayloadFiles(workdir, "llama.cpp/gguf/ggml-metal.metal")
+	err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
 	if err != nil {
 		if err == payloadMissing {
 			// TODO perhaps consider this a hard failure on arm macs?
@@ -34,6 +34,8 @@ type shimExtServer struct {
 var shimMutex sync.Mutex
 var llm *shimExtServer
 
+const pathComponentCount = 6
+
 func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
 	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
 }

@@ -112,7 +114,7 @@ func (llm *shimExtServer) Close() {
 }
 
 func nativeInit(workdir string) error {
-	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/*/*/lib/*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
 	if err != nil {
 		if err == payloadMissing {
 			log.Printf("%s", payloadMissing)

@@ -151,13 +153,13 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) {
 
 	for _, file := range files {
 		pathComps := strings.Split(file, "/")
-		if len(pathComps) != 7 {
+		if len(pathComps) != pathComponentCount {
 			log.Printf("unexpected payload components: %v", pathComps)
 			continue
 		}
-		// llama.cpp/gguf/build/$OS/$VARIANT/lib/$LIBRARY
+		// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
 		// Include the variant in the path to avoid conflicts between multiple server libs
-		targetDir := filepath.Join(workDir, pathComps[4])
+		targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
 		srcFile, err := libEmbed.Open(file)
 		if err != nil {
 			return nil, fmt.Errorf("read payload %s: %v", file, err)
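The new `pathComponentCount` constant encodes the depth of the embedded payload paths noted in the comment above; a quick standalone sketch of the arithmetic, using an example path of that shape:

```go
// Check of the path-component arithmetic: paths like
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY split into 6 components,
// and the variant sits at index pathComponentCount-3 (i.e. 3).
package main

import (
	"fmt"
	"strings"
)

const pathComponentCount = 6

func main() {
	file := "llama.cpp/build/linux/cpu/lib/libext_server.so" // example payload path
	pathComps := strings.Split(file, "/")
	fmt.Println(len(pathComps) == pathComponentCount) // true
	fmt.Println(pathComps[pathComponentCount-3])      // "cpu", the $VARIANT component
}
```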
@@ -10,7 +10,7 @@ import (
 	"strings"
 )
 
-//go:embed llama.cpp/gguf/build/*/*/lib/*.so
+//go:embed llama.cpp/build/*/*/lib/*.so
 var libEmbed embed.FS
 
 func updatePath(dir string) {
@@ -8,7 +8,7 @@ import (
 	"strings"
 )
 
-//go:embed llama.cpp/gguf/build/windows/*/lib/*.dll
+//go:embed llama.cpp/build/windows/*/lib/*.dll
 var libEmbed embed.FS
 
 func updatePath(dir string) {